diff --git a/ci/pipeline/determine_tests_to_run.py b/ci/pipeline/determine_tests_to_run.py index f84dbf0994f42..9349722471455 100644 --- a/ci/pipeline/determine_tests_to_run.py +++ b/ci/pipeline/determine_tests_to_run.py @@ -68,8 +68,6 @@ def get_commit_range(): # Whether all RLlib tests should be run. # Set to 1 only when a source file in `ray/rllib` has been changed. RAY_CI_RLLIB_DIRECTLY_AFFECTED = 0 - # Whether to run all RLlib contrib tests - RAY_CI_RLLIB_CONTRIB_AFFECTED = 0 RAY_CI_SERVE_AFFECTED = 0 RAY_CI_CORE_CPP_AFFECTED = 0 RAY_CI_CPP_AFFECTED = 0 @@ -167,13 +165,6 @@ def get_commit_range(): RAY_CI_RLLIB_DIRECTLY_AFFECTED = 1 RAY_CI_LINUX_WHEELS_AFFECTED = 1 RAY_CI_MACOS_WHEELS_AFFECTED = 1 - elif ( - re.match("rllib_contrib/", changed_file) - or changed_file == ".buildkite/rllib_contrib.rayci.yml" - or changed_file == ".buildkite/pipeline.ml.yml" - ): - if not changed_file.endswith(".md"): - RAY_CI_RLLIB_CONTRIB_AFFECTED = 1 elif ( changed_file.startswith("python/ray/serve") or changed_file == ".buildkite/serve.rayci.yml" @@ -380,8 +371,6 @@ def get_commit_range(): RAY_CI_TRAIN_AFFECTED = 1 RAY_CI_RLLIB_AFFECTED = 1 RAY_CI_RLLIB_DIRECTLY_AFFECTED = 1 - # the rllib contrib ci should only be run on pull requests - RAY_CI_RLLIB_CONTRIB_AFFECTED = 0 RAY_CI_SERVE_AFFECTED = 1 RAY_CI_CPP_AFFECTED = 1 RAY_CI_CORE_CPP_AFFECTED = 1 @@ -408,7 +397,6 @@ def get_commit_range(): "RAY_CI_RLLIB_AFFECTED={}".format(RAY_CI_RLLIB_AFFECTED), "RAY_CI_RLLIB_GPU_AFFECTED={}".format(RAY_CI_RLLIB_GPU_AFFECTED), "RAY_CI_RLLIB_DIRECTLY_AFFECTED={}".format(RAY_CI_RLLIB_DIRECTLY_AFFECTED), - "RAY_CI_RLLIB_CONTRIB_AFFECTED={}".format(RAY_CI_RLLIB_CONTRIB_AFFECTED), "RAY_CI_SERVE_AFFECTED={}".format(RAY_CI_SERVE_AFFECTED), "RAY_CI_DASHBOARD_AFFECTED={}".format(RAY_CI_DASHBOARD_AFFECTED), "RAY_CI_DOC_AFFECTED={}".format(RAY_CI_DOC_AFFECTED), diff --git a/rllib/utils/deprecation.py b/rllib/utils/deprecation.py index ae5fb8de14920..354a412b262ee 100644 --- a/rllib/utils/deprecation.py +++ b/rllib/utils/deprecation.py @@ -132,11 +132,3 @@ def _ctor(*args, **kwargs): # Return the prepared decorator. return _inner - - -ALGO_DEPRECATION_WARNING = ( - "This algorithm will be removed by ray 2.9 It is being " - "moved to the ray/rllib_contrib dir. See " - "https://github.com/ray-project/enhancements/blob/main/reps/2023-04-28-remove-algorithms-from-rllib.md " # noqa: E501 - "for more details. Any associated components (e.g. policies) will also be moved." -) diff --git a/rllib_contrib/README.md b/rllib_contrib/README.md deleted file mode 100644 index 0397e45021eab..0000000000000 --- a/rllib_contrib/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# RLlib-Contrib - -RLlib-Contrib is a directory for more experimental community contributions to RLlib including contributed algorithms. **This directory has a more relaxed bar for contributions than Ray or RLlib.** If you are interested in contributing to RLlib-Contrib, please see the [contributing guide](contributing.md). - -## Getting Started and Installation -Navigate to the algorithm sub-directory you are interested in and see the README.md for installation instructions and example scripts to help you get started! - -## List of Algorithms and Examples -Go to [List of examples and algorithms](TOC.md) to checkout the examples that our open source contributors have created with RLlib. 
- - -## Maintenance - -**Any issues that are filed in `rllib_contrib` will be solved best-effort by the community and there is no expectation of maintenance by the RLlib team.** - -**The API surface between algorithms in `rllib_contrib` and current versions of Ray / RLlib is not guaranteed. This means that any APIs that are used in rllib_contrib could potentially become modified/removed in newer version of Ray/RLlib.** - -We will generally accept contributions to this directory that meet any of the following criteria: - -1. Updating dependencies. -2. Submitting community contributed algorithms that have been tested and are ready for use. -3. Enabling algorithms to be run in different environments (ex. adding support for a new type of gymnasium environment). -4. Updating algorithms for use with the newer RLlib APIs. -5. General bug fixes. - -We will not accept contributions that generally add a significant maintenance burden. In this case users should instead make their own repo with their contribution, using the same guidelines as this directory, and the RLlib team can help to market/promote it in the Ray docs. - -## Getting Involved - -| Platform | Purpose | Support Level | -| --- | --- | --- | -| [Discuss Forum](https://discuss.ray.io) | For discussions about development and questions about usage. | Community | -| [GitHub Issues](https://github.com/ray-project/ray/issues) | For reporting bugs and filing feature requests. | Community | -| [Slack](https://forms.gle/9TSdDYUgxYs8SA9e8) | For collaborating with other Ray users. | Community | diff --git a/rllib_contrib/TOC.md b/rllib_contrib/TOC.md deleted file mode 100644 index ca5a44d905ed1..0000000000000 --- a/rllib_contrib/TOC.md +++ /dev/null @@ -1,30 +0,0 @@ -# Algorithms - - -* [A2C](./a2c) -* [A3C](./a3c) -* [AlphaStar](./alpha_star) -* [AlphaZero](./alpha_zero) -* [APEX-DDPG][./apex_ddpg] -* [APEX DQN](./apex_dqn/) -* [ARS](./ars) -* [Bandit](./bandit) -* [CRR](./crr) -* [Decision Transformer](./dt) -* [DDPG](./ddpg) -* [ES](./es) -* [LeelaChessZero](./leela_chess_zero) -* [MAML](./maml) -* [MBMPO](./mbmpo) -* [PG](./pg) -* [QMIX](./qmix) -* [R2D2](./r2d2) -* [SimpleQ](./simple_q) -* [SlateQ](./slate_q) -* [TD3](./td3) - - - -# Example Use-cases - -* [Using TF-GNN for encoding graph spaces in RLlib using Tensorflow](https://github.com/kk-55/tf-gnn-example-for-rllib) \ No newline at end of file diff --git a/rllib_contrib/a2c/BUILD b/rllib_contrib/a2c/BUILD deleted file mode 100644 index 0fd28136e2fb9..0000000000000 --- a/rllib_contrib/a2c/BUILD +++ /dev/null @@ -1,51 +0,0 @@ -# Examples - -py_test( - name = "example_a2c_cartpole_v1", - main = "a2c_cartpole_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/a2c_cartpole_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_cartpole_a2c", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib"], - size = "large", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/cartpole-a2c.yaml"], - args = ["--dir=a2c/tuned_examples/"] -) - -py_test( - name = "learning_tests_cartpole_a2c_microbatch", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib"], - size = "large", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/cartpole-a2c-microbatch.yaml"], - args = ["--dir=a2c/tuned_examples/"] -) - -py_test( - name = "learning_tests_cartpole_a2c_fake_gpus", - main = "run_regression_tests.py", - tags = ["team:rllib", 
"learning_tests", "rllib_contrib", "no_tf_eager_tracing"], - size = "medium", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/cartpole-a2c-fake-gpus.yaml"], - args = ["--dir=a2c/tuned_examples/"] -) - -# Compilation Tests - -py_test( - name = "test_a2c", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_a2c.py"] -) diff --git a/rllib_contrib/a2c/README.md b/rllib_contrib/a2c/README.md deleted file mode 100644 index e18340a4c9154..0000000000000 --- a/rllib_contrib/a2c/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# A2C (Advantage Actor-Critic) - -[A2C](https://arxiv.org/abs/1602.01783) is a synchronous, deterministic version of A3C; that’s why it is named as “A2C” with the first “A” (“asynchronous”) removed. In A3C each agent talks to the global parameters independently, so it is possible sometimes the thread-specific agents would be playing with policies of different versions and therefore the aggregated update would not be optimal. To resolve the inconsistency, a coordinator in A2C waits for all the parallel actors to finish their work before updating the global parameters and then in the next iteration parallel actors starts from the same policy. The synchronized gradient update keeps the training more cohesive and potentially to make convergence faster. - - -## Installation - -``` -conda create -n rllib-a2c python=3.10 -conda activate rllib-a2c -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[A3C Example](examples/a2c_cartpole_v1.py) \ No newline at end of file diff --git a/rllib_contrib/a2c/examples/a2c_cartpole_v1.py b/rllib_contrib/a2c/examples/a2c_cartpole_v1.py deleted file mode 100644 index 7aec7c4ac6c46..0000000000000 --- a/rllib_contrib/a2c/examples/a2c_cartpole_v1.py +++ /dev/null @@ -1,48 +0,0 @@ -import argparse - -from rllib_a2c.a2c import A2C, A2CConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - A2CConfig() - .rollouts(num_rollout_workers=0) - .framework("torch") - .environment("CartPole-v1") - .training(lr=0.001, train_batch_size=40) - ) - - stop_reward = 150 - - tuner = tune.Tuner( - A2C, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 500000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/a2c/pyproject.toml b/rllib_contrib/a2c/pyproject.toml deleted file mode 100644 index 91944981a2f26..0000000000000 --- a/rllib_contrib/a2c/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-a2c" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gym[accept-rom-license]", "gymnasium[accept-rom-license, atari]==0.26.3", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] - 
development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/a2c/requirements.txt b/rllib_contrib/a2c/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/a2c/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/a2c/src/rllib_a2c/a2c/__init__.py b/rllib_contrib/a2c/src/rllib_a2c/a2c/__init__.py deleted file mode 100644 index c2fb0796d286e..0000000000000 --- a/rllib_contrib/a2c/src/rllib_a2c/a2c/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from rllib_a2c.a2c.a2c import A2C, A2CConfig - -from ray.tune.registry import register_trainable - -__all__ = ["A2CConfig", "A2C"] - -register_trainable("rllib-contrib-a2c", A2C) diff --git a/rllib_contrib/a2c/src/rllib_a2c/a2c/a2c.py b/rllib_contrib/a2c/src/rllib_a2c/a2c/a2c.py deleted file mode 100644 index cd2f314ae14c9..0000000000000 --- a/rllib_contrib/a2c/src/rllib_a2c/a2c/a2c.py +++ /dev/null @@ -1,240 +0,0 @@ -import logging -import math -from typing import Optional - -from ray.rllib.algorithms.a3c.a3c import A3C, A3CConfig -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.annotations import override -from ray.rllib.utils.metrics import ( - APPLY_GRADS_TIMER, - COMPUTE_GRADS_TIMER, - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, - SAMPLE_TIMER, - SYNCH_WORKER_WEIGHTS_TIMER, -) -from ray.rllib.utils.typing import ResultDict - -logger = logging.getLogger(__name__) - - -class A2CConfig(A3CConfig): - """Defines a configuration class from which a new A2C can be built. - - Example: - >>> from ray import tune - >>> from ray.rllib.algorithms.a2c import A2CConfig - >>> config = A2CConfig() - >>> config = config.training(lr=0.01, grad_clip=30.0) # doctest: +SKIP - >>> config = config.resources(num_gpus=0) # doctest: +SKIP - >>> config = config.rollouts(num_rollout_workers=2) # doctest: +SKIP - >>> config = config.environment("CartPole-v1") # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build() # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> import ray.air as air - >>> from ray import tune - >>> from ray.rllib.algorithms.a2c import A2CConfig - >>> config = A2CConfig() - >>> # Print out some default values. - >>> print(config.sample_async) # doctest: +SKIP - >>> # Update the config object. - >>> config = config.training(lr=tune.grid_search( # doctest: +SKIP - ... [0.001, 0.0001]), use_critic=False) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "A2C", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... 
).fit() - """ - - def __init__(self): - """Initializes a A2CConfig instance.""" - super().__init__(algo_class=A2C) - - # fmt: off - # __sphinx_doc_begin__ - - # A2C specific settings: - self.microbatch_size = None - - # Override some of A3CConfig's default values with A2C-specific values. - self.num_rollout_workers = 2 - self.rollout_fragment_length = "auto" - self.sample_async = False - self.min_time_s_per_iteration = 10 - # __sphinx_doc_end__ - # fmt: on - - @override(A3CConfig) - def training( - self, - *, - microbatch_size: Optional[int] = NotProvided, - **kwargs, - ) -> "A2CConfig": - """Sets the training related configuration. - - Args: - microbatch_size: A2C supports microbatching, in which we accumulate - gradients over batch of this size until the train batch size is reached. - This allows training with batch sizes much larger than can fit in GPU - memory. To enable, set this to a value less than the train batch size. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if microbatch_size is not NotProvided: - self.microbatch_size = microbatch_size - - return self - - @override(A3CConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - # Synchronous sampling, on-policy PG algo -> Check mismatches between - # `rollout_fragment_length` and `train_batch_size` to avoid user confusion. - self.validate_train_batch_size_vs_rollout_fragment_length() - - if self.microbatch_size: - if self.num_gpus > 1: - raise AttributeError( - "A2C does not support multiple GPUs when micro-batching is set." - ) - - # Train batch size needs to be significantly larger than microbatch - # size. - if self.train_batch_size / self.microbatch_size < 3: - logger.warning( - "`train_batch_size` should be considerably larger (at least 3x)" - " than `microbatch_size` for a microbatching setup to make " - "sense!" - ) - # Rollout fragment length needs to be less than microbatch_size. - if ( - self.rollout_fragment_length != "auto" - and self.rollout_fragment_length > self.microbatch_size - ): - logger.warning( - "`rollout_fragment_length` should not be larger than " - "`microbatch_size` (try setting them to the same value)! " - "Otherwise, microbatches of desired size won't be achievable." - ) - - def get_rollout_fragment_length(self, worker_index: int = 0) -> int: - if self.rollout_fragment_length == "auto": - if self.microbatch_size: - return self.microbatch_size - return super().get_rollout_fragment_length(worker_index) - - -class A2C(A3C): - @classmethod - @override(A3C) - def get_default_config(cls) -> AlgorithmConfig: - return A2CConfig() - - @override(Algorithm) - def setup(self, config: AlgorithmConfig): - super().setup(config) - - # Create a microbatch variable for collecting gradients on microbatches'. - # These gradients will be accumulated on-the-fly and applied at once (once train - # batch size has been collected) to the model. - if self.config.microbatch_size: - self._microbatches_grads = None - self._microbatches_counts = self._num_microbatches = 0 - - @override(A3C) - def training_step(self) -> ResultDict: - # Fallback to Algorithm.training_step() and A3C policies (loss_fn etc). - # W/o microbatching: Identical to Algorithm's default implementation. - # Only difference to a default Algorithm being the value function loss term - # and its value computations alongside each action. 
- if self.config.microbatch_size is None: - return Algorithm.training_step(self) - - # In microbatch mode, we want to compute gradients on experience - # microbatches, average a number of these microbatches, and then - # apply the averaged gradient in one SGD step. This conserves GPU - # memory, allowing for extremely large experience batches to be - # used. - with self._timers[SAMPLE_TIMER]: - if self.config.count_steps_by == "agent_steps": - train_batch = synchronous_parallel_sample( - worker_set=self.workers, max_agent_steps=self.config.microbatch_size - ) - else: - train_batch = synchronous_parallel_sample( - worker_set=self.workers, max_env_steps=self.config.microbatch_size - ) - - self._counters[NUM_ENV_STEPS_SAMPLED] += train_batch.env_steps() - self._counters[NUM_AGENT_STEPS_SAMPLED] += train_batch.agent_steps() - - with self._timers[COMPUTE_GRADS_TIMER]: - grad, info = self.workers.local_worker().compute_gradients( - train_batch, single_agent=True - ) - # New microbatch accumulation phase. - if self._microbatches_grads is None: - self._microbatches_grads = grad - # Existing gradients: Accumulate new gradients on top of existing ones. - else: - for i, g in enumerate(grad): - self._microbatches_grads[i] += g - self._microbatches_counts += train_batch.count - self._num_microbatches += 1 - - # If `train_batch_size` reached: Accumulate gradients and apply. - num_microbatches = math.ceil( - self.config.train_batch_size / self.config.microbatch_size - ) - if self._num_microbatches >= num_microbatches: - # Update counters. - self._counters[NUM_ENV_STEPS_TRAINED] += self._microbatches_counts - self._counters[NUM_AGENT_STEPS_TRAINED] += self._microbatches_counts - - # Apply gradients. - apply_timer = self._timers[APPLY_GRADS_TIMER] - with apply_timer: - self.workers.local_worker().apply_gradients(self._microbatches_grads) - apply_timer.push_units_processed(self._microbatches_counts) - - # Reset microbatch information. - self._microbatches_grads = None - self._microbatches_counts = self._num_microbatches = 0 - - # Also update global vars of the local worker. - # Create current global vars. - global_vars = { - "timestep": self._counters[NUM_AGENT_STEPS_SAMPLED], - } - # Synch updated weights back to the workers - # (only those policies that are trainable). - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - self.workers.sync_weights( - policies=self.workers.local_worker().get_policies_to_train(), - global_vars=global_vars, - ) - - train_results = {DEFAULT_POLICY_ID: info} - - return train_results diff --git a/rllib_contrib/a2c/tests/test_a2c.py b/rllib_contrib/a2c/tests/test_a2c.py deleted file mode 100644 index af30b15dc808e..0000000000000 --- a/rllib_contrib/a2c/tests/test_a2c.py +++ /dev/null @@ -1,77 +0,0 @@ -import unittest - -from rllib_a2c.a2c import A2CConfig - -import ray -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_train_results, - framework_iterator, -) - - -class TestA2C(unittest.TestCase): - """Sanity tests for A2C exec impl.""" - - def setUp(self): - ray.init(num_cpus=4) - - def tearDown(self): - ray.shutdown() - - def test_a2c_compilation(self): - """Test whether an A2C can be built with both frameworks.""" - config = A2CConfig().rollouts(num_rollout_workers=2, num_envs_per_worker=2) - - num_iterations = 1 - - # Test against all frameworks. 
- for _ in framework_iterator(config, with_eager_tracing=True): - for env in ["ALE/Pong-v5", "CartPole-v1", "Pendulum-v1"]: - config.environment(env) - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - algo.stop() - - def test_a2c_exec_impl(self): - config = ( - A2CConfig() - .environment(env="CartPole-v1") - .reporting(min_time_s_per_iteration=0) - ) - - for _ in framework_iterator(config): - algo = config.build() - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - algo.stop() - - def test_a2c_exec_impl_microbatch(self): - config = ( - A2CConfig() - .environment(env="CartPole-v1") - .reporting(min_time_s_per_iteration=0) - .training(microbatch_size=10) - ) - - for _ in framework_iterator(config): - algo = config.build() - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/a2c/tuned_examples/__init__.py b/rllib_contrib/a2c/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/a2c/tuned_examples/atari-a2c.yaml b/rllib_contrib/a2c/tuned_examples/atari-a2c.yaml deleted file mode 100644 index 4c3f521e18b2f..0000000000000 --- a/rllib_contrib/a2c/tuned_examples/atari-a2c.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# Runs on a single g3.16xl node -# See https://github.com/ray-project/rl-experiments for results -atari-a2c: - env: - grid_search: - - ALE/Breakout-v5 - - ALE/BeamRider-v5 - - ALE/Qbert-v5 - - ALE/SpaceInvaders-v5 - run: A2C - config: - # Works for both torch and tf. - framework: torch - # Make analogous to old v4 + NoFrameskip. - env_config: - frameskip: 1 - full_action_space: false - repeat_action_probability: 0.0 - train_batch_size: 500 - rollout_fragment_length: auto - clip_rewards: True - num_workers: 5 - num_envs_per_worker: 5 - num_gpus: 1 - lr_schedule: [ - [0, 0.0007], - [20000000, 0.000000000001], - ] diff --git a/rllib_contrib/a2c/tuned_examples/cartpole-a2c-fake-gpus.yaml b/rllib_contrib/a2c/tuned_examples/cartpole-a2c-fake-gpus.yaml deleted file mode 100644 index 9e79cb489c144..0000000000000 --- a/rllib_contrib/a2c/tuned_examples/cartpole-a2c-fake-gpus.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-a2c-fake-gpus: - env: CartPole-v1 - run: A2C - stop: - sampler_results/episode_reward_mean: 150 - training_iteration: 200 - config: - # Works for both torch and tf. - framework: torch - train_batch_size: 20 - rollout_fragment_length: auto - num_workers: 0 - lr: 0.001 - # Fake 2 GPUs. - num_gpus: 2 - _fake_gpus: true diff --git a/rllib_contrib/a2c/tuned_examples/cartpole-a2c-microbatch.yaml b/rllib_contrib/a2c/tuned_examples/cartpole-a2c-microbatch.yaml deleted file mode 100644 index f6486bd9f3bcd..0000000000000 --- a/rllib_contrib/a2c/tuned_examples/cartpole-a2c-microbatch.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-a2c-microbatch: - env: CartPole-v1 - run: A2C - stop: - sampler_results/episode_reward_mean: 150 - timesteps_total: 1000000 - config: - # Works for both torch and tf. 
- framework: torch - num_workers: 2 - gamma: 0.95 - rollout_fragment_length: 20 - microbatch_size: 40 - train_batch_size: 120 - # When using tf>=2.8, eager tracing can not be used - eager_tracing: False diff --git a/rllib_contrib/a2c/tuned_examples/cartpole-a2c.yaml b/rllib_contrib/a2c/tuned_examples/cartpole-a2c.yaml deleted file mode 100644 index 8ede986f80b40..0000000000000 --- a/rllib_contrib/a2c/tuned_examples/cartpole-a2c.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-a2c: - env: CartPole-v1 - run: A2C - stop: - sampler_results/episode_reward_mean: 150 - timesteps_total: 500000 - config: - # Works for both torch and tf. - framework: torch - train_batch_size: 40 - rollout_fragment_length: auto - num_workers: 0 - lr: 0.001 diff --git a/rllib_contrib/a2c/tuned_examples/cartpole_a2c.py b/rllib_contrib/a2c/tuned_examples/cartpole_a2c.py deleted file mode 100644 index 216364e8de5e5..0000000000000 --- a/rllib_contrib/a2c/tuned_examples/cartpole_a2c.py +++ /dev/null @@ -1,12 +0,0 @@ -# Run with: -# rllib train file cartpole_a2c.py \ -# --stop={'timesteps_total': 50000, 'episode_reward_mean': 200}" -from ray.rllib.algorithms.a2c import A2CConfig - -config = ( - A2CConfig() - .environment("CartPole-v1") - .training(lr=0.001, train_batch_size=20) - .framework("tf") - .rollouts(num_rollout_workers=0) -) diff --git a/rllib_contrib/a3c/BUILD b/rllib_contrib/a3c/BUILD deleted file mode 100644 index 83a5aa5c339e2..0000000000000 --- a/rllib_contrib/a3c/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -# Examples - -py_test( - name = "example_a3c_cartpole_v1", - main = "a3c_cartpole_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/a3c_cartpole_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_cartpole_a3c", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib"], - size = "large", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/cartpole-a3c.yaml"], - args = ["--dir=a3c/tuned_examples/"] -) - -# Compilation Tests - -py_test( - name = "test_a3c", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_a3c.py"] -) diff --git a/rllib_contrib/a3c/README.md b/rllib_contrib/a3c/README.md deleted file mode 100644 index 897ebfea96b39..0000000000000 --- a/rllib_contrib/a3c/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# A3C (Asynchronous Advantage Actor-Critic) - -[A3C](https://arxiv.org/abs/1602.01783) is the asynchronous version of A2C, where gradients are computed on the workers directly after trajectory rollouts, and only then shipped to a central learner to accumulate these gradients on the central model. After the central model update, parameters are broadcast back to all workers. Similar to A2C, A3C scales to 16-32+ worker processes depending on the environment. 
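As a quick orientation, a minimal usage sketch that mirrors `examples/a3c_cartpole_v1.py` further down; it assumes the `rllib_a3c` package has been installed as described under Installation:

```
from rllib_a3c.a3c import A3C, A3CConfig

import ray
from ray import air, tune

ray.init()

# Rollout workers compute gradients on their own trajectory rollouts and ship
# them asynchronously to the central learner, which broadcasts updated weights.
config = (
    A3CConfig()
    .rollouts(num_rollout_workers=1)
    .framework("torch")
    .environment("CartPole-v1")
    .training(gamma=0.95)
)

tune.Tuner(
    A3C,
    param_space=config.to_dict(),
    run_config=air.RunConfig(
        stop={"sampler_results/episode_reward_mean": 150, "timesteps_total": 200000},
    ),
).fit()
```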
- - -## Installation - -``` -conda create -n rllib-a3c python=3.10 -conda activate rllib-a3c -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[A3C Example](examples/a3c_cartpole_v1.py) \ No newline at end of file diff --git a/rllib_contrib/a3c/examples/a3c_cartpole_v1.py b/rllib_contrib/a3c/examples/a3c_cartpole_v1.py deleted file mode 100644 index 8e01281e5584c..0000000000000 --- a/rllib_contrib/a3c/examples/a3c_cartpole_v1.py +++ /dev/null @@ -1,50 +0,0 @@ -import argparse - -from rllib_a3c.a3c import A3C, A3CConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - A3CConfig() - .rollouts(num_rollout_workers=1) - .framework("torch") - .environment("CartPole-v1") - .training( - gamma=0.95, - ) - ) - - stop_reward = 150 - - tuner = tune.Tuner( - A3C, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 200000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/a3c/pyproject.toml b/rllib_contrib/a3c/pyproject.toml deleted file mode 100644 index 60aefa393869c..0000000000000 --- a/rllib_contrib/a3c/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-a3c" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gym[accept-rom-license]", "gymnasium[mujoco]==0.26.3", "ray[rllib]==2.3.1"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/a3c/requirements.txt b/rllib_contrib/a3c/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/a3c/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/a3c/src/rllib_a3c/a3c/__init__.py b/rllib_contrib/a3c/src/rllib_a3c/a3c/__init__.py deleted file mode 100644 index 3b050de0dca52..0000000000000 --- a/rllib_contrib/a3c/src/rllib_a3c/a3c/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from rllib_a3c.a3c.a3c import A3C, A3CConfig - -from ray.tune.registry import register_trainable - -__all__ = ["A3CConfig", "A3C"] - -register_trainable("rllib-contrib-a3c", A3C) diff --git a/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c.py b/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c.py deleted file mode 100644 index 7f5a661cb94d2..0000000000000 --- a/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c.py +++ /dev/null @@ -1,261 +0,0 @@ -import logging -from typing import Any, Dict, List, Optional, Type, Union - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.evaluation.rollout_worker 
import RolloutWorker -from ray.rllib.policy.policy import Policy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.metrics import ( - APPLY_GRADS_TIMER, - GRAD_WAIT_TIMER, - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, - SYNCH_WORKER_WEIGHTS_TIMER, -) -from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder -from ray.rllib.utils.typing import ResultDict - -logger = logging.getLogger(__name__) - - -class A3CConfig(AlgorithmConfig): - """Defines a configuration class from which a A3C Algorithm can be built. - - Example: - >>> from ray import tune - >>> from ray.rllib.algorithms.a3c import A3CConfig - >>> config = A3CConfig() # doctest: +SKIP - >>> config = config.training(lr=0.01, grad_clip=30.0) # doctest: +SKIP - >>> config = config.resources(num_gpus=0) # doctest: +SKIP - >>> config = config.rollouts(num_rollout_workers=4) # doctest: +SKIP - >>> config = config.environment("CartPole-v1") # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build() # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.a3c import A3CConfig - >>> config = A3CConfig() - >>> # Print out some default values. - >>> print(config.sample_async) # doctest: +SKIP - >>> # Update the config object. - >>> config = config.training( # doctest: +SKIP - ... lr=tune.grid_search([0.001, 0.0001]), use_critic=False) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "A3C", - ... stop={"episode_reward_mean": 200}, - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a A3CConfig instance.""" - super().__init__(algo_class=algo_class or A3C) - - # fmt: off - # __sphinx_doc_begin__ - # - # A3C specific settings. - self.use_critic = True - self.use_gae = True - self.lambda_ = 1.0 - self.grad_clip = 40.0 - self.lr_schedule = None - self.vf_loss_coeff = 0.5 - self.entropy_coeff = 0.01 - self.entropy_coeff_schedule = None - self.sample_async = True - - # Override some of AlgorithmConfig's default values with PPO-specific values. - self.num_rollout_workers = 2 - self.rollout_fragment_length = 10 - self.lr = 0.0001 - # Min time (in seconds) per reporting. - # This causes not every call to `training_iteration` to be reported, - # but to wait until n seconds have passed and then to summarize the - # thus far collected results. - self.min_time_s_per_iteration = 5 - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). 
- } - # __sphinx_doc_end__ - # fmt: on - - @override(AlgorithmConfig) - def training( - self, - *, - lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, - use_critic: Optional[bool] = NotProvided, - use_gae: Optional[bool] = NotProvided, - lambda_: Optional[float] = NotProvided, - grad_clip: Optional[float] = NotProvided, - vf_loss_coeff: Optional[float] = NotProvided, - entropy_coeff: Optional[float] = NotProvided, - entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, - sample_async: Optional[bool] = NotProvided, - **kwargs, - ) -> "A3CConfig": - """Sets the training related configuration. - - Args: - lr_schedule: Learning rate schedule. In the format of - [[timestep, lr-value], [timestep, lr-value], ...] - Intermediary timesteps will be assigned to interpolated learning rate - values. A schedule should normally start from timestep 0. - use_critic: Should use a critic as a baseline (otherwise don't use value - baseline; required for using GAE). - use_gae: If true, use the Generalized Advantage Estimator (GAE) - with a value function, see https://arxiv.org/pdf/1506.02438.pdf. - lambda_: GAE(gamma) parameter. - grad_clip: Max global norm for each gradient calculated by worker. - vf_loss_coeff: Value Function Loss coefficient. - entropy_coeff: Coefficient of the entropy regularizer. - entropy_coeff_schedule: Decay schedule for the entropy regularizer. - sample_async: Whether workers should sample async. Note that this - increases the effective rollout_fragment_length by up to 5x due - to async buffering of batches. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if lr_schedule is not NotProvided: - self.lr_schedule = lr_schedule - if use_critic is not NotProvided: - self.lr_schedule = use_critic - if use_gae is not NotProvided: - self.use_gae = use_gae - if lambda_ is not NotProvided: - self.lambda_ = lambda_ - if grad_clip is not NotProvided: - self.grad_clip = grad_clip - if vf_loss_coeff is not NotProvided: - self.vf_loss_coeff = vf_loss_coeff - if entropy_coeff is not NotProvided: - self.entropy_coeff = entropy_coeff - if entropy_coeff_schedule is not NotProvided: - self.entropy_coeff_schedule = entropy_coeff_schedule - if sample_async is not NotProvided: - self.sample_async = sample_async - - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - if self.entropy_coeff < 0: - raise ValueError("`entropy_coeff` must be >= 0.0!") - if self.num_rollout_workers <= 0 and self.sample_async: - raise ValueError("`num_workers` for A3C must be >= 1!") - - -class A3C(Algorithm): - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return A3CConfig() - - @classmethod - @override(Algorithm) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - from ray.rllib.algorithms.a3c.a3c_torch_policy import A3CTorchPolicy - - return A3CTorchPolicy - elif config["framework"] == "tf": - from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTF1Policy - - return A3CTF1Policy - else: - from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTF2Policy - - return A3CTF2Policy - - def training_step(self) -> ResultDict: - # Shortcut. 
- local_worker = self.workers.local_worker() - - # Define the function executed in parallel by all RolloutWorkers to collect - # samples + compute and return gradients (and other information). - - def sample_and_compute_grads(worker: RolloutWorker) -> Dict[str, Any]: - """Call sample() and compute_gradients() remotely on workers.""" - samples = worker.sample() - grads, infos = worker.compute_gradients(samples) - return { - "grads": grads, - "infos": infos, - "agent_steps": samples.agent_steps(), - "env_steps": samples.env_steps(), - } - - # Perform rollouts and gradient calculations asynchronously. - with self._timers[GRAD_WAIT_TIMER]: - # Results are a mapping from ActorHandle (RolloutWorker) to their - # returned gradient calculation results. - self.workers.foreach_worker_async( - func=sample_and_compute_grads, - healthy_only=True, - ) - async_results = self.workers.fetch_ready_async_reqs() - - # Loop through all fetched worker-computed gradients (if any) - # and apply them - one by one - to the local worker's model. - # After each apply step (one step per worker that returned some gradients), - # update that particular worker's weights. - global_vars = None - learner_info_builder = LearnerInfoBuilder(num_devices=1) - to_sync_workers = set() - for worker_id, result in async_results: - # Apply gradients to local worker. - with self._timers[APPLY_GRADS_TIMER]: - local_worker.apply_gradients(result["grads"]) - self._timers[APPLY_GRADS_TIMER].push_units_processed(result["agent_steps"]) - - # Update all step counters. - self._counters[NUM_AGENT_STEPS_SAMPLED] += result["agent_steps"] - self._counters[NUM_ENV_STEPS_SAMPLED] += result["env_steps"] - self._counters[NUM_AGENT_STEPS_TRAINED] += result["agent_steps"] - self._counters[NUM_ENV_STEPS_TRAINED] += result["env_steps"] - - learner_info_builder.add_learn_on_batch_results_multi_agent(result["infos"]) - - # Create current global vars. - global_vars = { - "timestep": self._counters[NUM_AGENT_STEPS_SAMPLED], - } - - # Add this worker to be synced. - to_sync_workers.add(worker_id) - - # Synch updated weights back to the particular worker - # (only those policies that are trainable). 
- with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - self.workers.sync_weights( - policies=local_worker.get_policies_to_train(), - to_worker_indices=list(to_sync_workers), - global_vars=global_vars, - ) - - return learner_info_builder.finalize() diff --git a/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_tf_policy.py b/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_tf_policy.py deleted file mode 100644 index bdc77f5790aeb..0000000000000 --- a/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_tf_policy.py +++ /dev/null @@ -1,183 +0,0 @@ -"""Note: Keep in sync with changes to VTraceTFPolicy.""" -from typing import Dict, List, Optional, Type, Union - -from ray.rllib.evaluation.episode import Episode -from ray.rllib.evaluation.postprocessing import ( - Postprocessing, - compute_gae_for_sample_batch, -) -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_action_dist import TFActionDistribution -from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import ( - EntropyCoeffSchedule, - LearningRateSchedule, - ValueNetworkMixin, - compute_gradients, -) -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import explained_variance -from ray.rllib.utils.typing import ( - AgentID, - LocalOptimizer, - ModelGradients, - TensorType, - TFPolicyV2Type, -) - -tf1, tf, tfv = try_import_tf() - - -# We need this builder function because we want to share the same -# custom logics between TF1 dynamic and TF2 eager policies. -def get_a3c_tf_policy(name: str, base: TFPolicyV2Type) -> TFPolicyV2Type: - """Construct a A3CTFPolicy inheriting either dynamic or eager base policies. - - Args: - base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. - - Returns: - A TF Policy to be used with MAML. - """ - - class A3CTFPolicy( - ValueNetworkMixin, LearningRateSchedule, EntropyCoeffSchedule, base - ): - def __init__( - self, - observation_space, - action_space, - config, - existing_model=None, - existing_inputs=None, - ): - # First thing first, enable eager execution if necessary. - base.enable_eager_execution_if_necessary() - - # Initialize base class. - base.__init__( - self, - observation_space, - action_space, - config, - existing_inputs=existing_inputs, - existing_model=existing_model, - ) - - ValueNetworkMixin.__init__(self, self.config) - LearningRateSchedule.__init__( - self, self.config["lr"], self.config["lr_schedule"] - ) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. 
- self.maybe_initialize_optimizer_and_loss() - - @override(base) - def loss( - self, - model: Union[ModelV2, "tf.keras.Model"], - dist_class: Type[TFActionDistribution], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - model_out, _ = model(train_batch) - action_dist = dist_class(model_out, model) - if self.is_recurrent(): - max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) - valid_mask = tf.sequence_mask( - train_batch[SampleBatch.SEQ_LENS], max_seq_len - ) - valid_mask = tf.reshape(valid_mask, [-1]) - else: - valid_mask = tf.ones_like(train_batch[SampleBatch.REWARDS]) - - log_prob = action_dist.logp(train_batch[SampleBatch.ACTIONS]) - vf = model.value_function() - - # The "policy gradients" loss - self.pi_loss = -tf.reduce_sum( - tf.boolean_mask( - log_prob * train_batch[Postprocessing.ADVANTAGES], valid_mask - ) - ) - - delta = tf.boolean_mask( - vf - train_batch[Postprocessing.VALUE_TARGETS], valid_mask - ) - - # Compute a value function loss. - if self.config.get("use_critic", True): - self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta)) - # Ignore the value function. - else: - self.vf_loss = tf.constant(0.0) - - self.entropy_loss = tf.reduce_sum( - tf.boolean_mask(action_dist.entropy(), valid_mask) - ) - - self.total_loss = ( - self.pi_loss - + self.vf_loss * self.config["vf_loss_coeff"] - - self.entropy_loss * self.entropy_coeff - ) - - return self.total_loss - - @override(base) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return { - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), - "policy_loss": self.pi_loss, - "policy_entropy": self.entropy_loss, - "var_gnorm": tf.linalg.global_norm( - list(self.model.trainable_variables()) - ), - "vf_loss": self.vf_loss, - } - - @override(base) - def grad_stats_fn( - self, train_batch: SampleBatch, grads: ModelGradients - ) -> Dict[str, TensorType]: - return { - "grad_gnorm": tf.linalg.global_norm(grads), - "vf_explained_var": explained_variance( - train_batch[Postprocessing.VALUE_TARGETS], - self.model.value_function(), - ), - } - - @override(base) - def postprocess_trajectory( - self, - sample_batch: SampleBatch, - other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None, - episode: Optional[Episode] = None, - ): - sample_batch = super().postprocess_trajectory(sample_batch) - return compute_gae_for_sample_batch( - self, sample_batch, other_agent_batches, episode - ) - - @override(base) - def compute_gradients_fn( - self, optimizer: LocalOptimizer, loss: TensorType - ) -> ModelGradients: - return compute_gradients(self, optimizer, loss) - - A3CTFPolicy.__name__ = name - A3CTFPolicy.__qualname__ = name - - return A3CTFPolicy - - -A3CTF1Policy = get_a3c_tf_policy("A3CTF1Policy", DynamicTFPolicyV2) -A3CTF2Policy = get_a3c_tf_policy("A3CTF2Policy", EagerTFPolicyV2) diff --git a/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_torch_policy.py b/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_torch_policy.py deleted file mode 100644 index e702254cd16c8..0000000000000 --- a/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_torch_policy.py +++ /dev/null @@ -1,152 +0,0 @@ -from typing import Dict, List, Optional, Type, Union - -from ray.rllib.evaluation.episode import Episode -from ray.rllib.evaluation.postprocessing import ( - Postprocessing, - compute_gae_for_sample_batch, -) -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper -from ray.rllib.policy.sample_batch import 
SampleBatch -from ray.rllib.policy.torch_mixins import ( - EntropyCoeffSchedule, - LearningRateSchedule, - ValueNetworkMixin, -) -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.torch_utils import apply_grad_clipping, sequence_mask -from ray.rllib.utils.typing import AgentID, TensorType - -torch, nn = try_import_torch() - - -class A3CTorchPolicy( - ValueNetworkMixin, LearningRateSchedule, EntropyCoeffSchedule, TorchPolicyV2 -): - """PyTorch Policy class used with A3C.""" - - def __init__(self, observation_space, action_space, config): - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config["model"]["max_seq_len"], - ) - ValueNetworkMixin.__init__(self, config) - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() - - @override(TorchPolicyV2) - def loss( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - """Constructs the loss function. - - Args: - model: The Model to calculate the loss for. - dist_class: The action distr. class. - train_batch: The training data. - - Returns: - The A3C loss tensor given the input batch. - """ - logits, _ = model(train_batch) - values = model.value_function() - - if self.is_recurrent(): - B = len(train_batch[SampleBatch.SEQ_LENS]) - max_seq_len = logits.shape[0] // B - mask_orig = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) - valid_mask = torch.reshape(mask_orig, [-1]) - else: - valid_mask = torch.ones_like(values, dtype=torch.bool) - - dist = dist_class(logits, model) - log_probs = dist.logp(train_batch[SampleBatch.ACTIONS]).reshape(-1) - pi_err = -torch.sum( - torch.masked_select( - log_probs * train_batch[Postprocessing.ADVANTAGES], valid_mask - ) - ) - - # Compute a value function loss. - if self.config["use_critic"]: - value_err = 0.5 * torch.sum( - torch.pow( - torch.masked_select( - values.reshape(-1) - train_batch[Postprocessing.VALUE_TARGETS], - valid_mask, - ), - 2.0, - ) - ) - # Ignore the value function. - else: - value_err = 0.0 - - entropy = torch.sum(torch.masked_select(dist.entropy(), valid_mask)) - - total_loss = ( - pi_err - + value_err * self.config["vf_loss_coeff"] - - entropy * self.entropy_coeff - ) - - # Store values for stats function in model (tower), such that for - # multi-GPU, we do not override them during the parallel loss phase. 
- model.tower_stats["entropy"] = entropy - model.tower_stats["pi_err"] = pi_err - model.tower_stats["value_err"] = value_err - - return total_loss - - @override(TorchPolicyV2) - def optimizer( - self, - ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]: - """Returns a torch optimizer (Adam) for A3C.""" - return torch.optim.Adam(self.model.parameters(), lr=self.config["lr"]) - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return convert_to_numpy( - { - "cur_lr": self.cur_lr, - "entropy_coeff": self.entropy_coeff, - "policy_entropy": torch.mean( - torch.stack(self.get_tower_stats("entropy")) - ), - "policy_loss": torch.mean(torch.stack(self.get_tower_stats("pi_err"))), - "vf_loss": torch.mean(torch.stack(self.get_tower_stats("value_err"))), - } - ) - - @override(TorchPolicyV2) - def postprocess_trajectory( - self, - sample_batch: SampleBatch, - other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None, - episode: Optional[Episode] = None, - ): - sample_batch = super().postprocess_trajectory(sample_batch) - return compute_gae_for_sample_batch( - self, sample_batch, other_agent_batches, episode - ) - - @override(TorchPolicyV2) - def extra_grad_process( - self, optimizer: "torch.optim.Optimizer", loss: TensorType - ) -> Dict[str, TensorType]: - return apply_grad_clipping(self, optimizer, loss) diff --git a/rllib_contrib/a3c/tests/test_a3c.py b/rllib_contrib/a3c/tests/test_a3c.py deleted file mode 100644 index 9420291c74f1d..0000000000000 --- a/rllib_contrib/a3c/tests/test_a3c.py +++ /dev/null @@ -1,102 +0,0 @@ -import unittest - -from rllib_a3c.a3c import A3CConfig - -import ray -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_train_results, - framework_iterator, -) - - -class TestA3C(unittest.TestCase): - """Sanity tests for A2C exec impl.""" - - def setUp(self): - ray.init(num_cpus=4) - - def tearDown(self): - ray.shutdown() - - def test_a3c_compilation(self): - """Test whether an A3C can be built with both frameworks.""" - config = A3CConfig().rollouts(num_rollout_workers=2, num_envs_per_worker=2) - - num_iterations = 2 - - # Test against all frameworks. - for _ in framework_iterator(config): - config.eager_tracing = False - for env in ["CartPole-v1", "Pendulum-v1"]: - print("env={}".format(env)) - config.model["use_lstm"] = env == "CartPole-v1" - algo = config.build(env=env) - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action( - algo, include_state=config.model["use_lstm"] - ) - algo.stop() - - def test_a3c_entropy_coeff_schedule(self): - """Test A3C entropy coeff schedule support.""" - config = A3CConfig().rollouts( - num_rollout_workers=1, - num_envs_per_worker=1, - batch_mode="truncate_episodes", - rollout_fragment_length=10, - ) - # Initial entropy coeff, doesn't really matter because of the schedule below. - config.training( - train_batch_size=20, - entropy_coeff=0.01, - entropy_coeff_schedule=[ - [0, 0.01], - [120, 0.0001], - ], - ) - # 0 metrics reporting delay, this makes sure timestep, - # which entropy coeff depends on, is updated after each worker rollout. - config.reporting( - min_time_s_per_iteration=0, min_sample_timesteps_per_iteration=20 - ) - - def _step_n_times(trainer, n: int): - """Step trainer n times. 
- - Returns: - learning rate at the end of the execution. - """ - for _ in range(n): - results = trainer.train() - return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][ - "entropy_coeff" - ] - - # Test against all frameworks. - for _ in framework_iterator(config, frameworks=("torch", "tf")): - config.eager_tracing = False - algo = config.build(env="CartPole-v1") - - coeff = _step_n_times(algo, 1) # 20 timesteps - # Should be close to the starting coeff of 0.01 - self.assertGreaterEqual(coeff, 0.005) - - coeff = _step_n_times(algo, 10) # 200 timesteps - # Should have annealed to the final coeff of 0.0001. - self.assertLessEqual(coeff, 0.00011) - - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/a3c/tuned_examples/__init__.py b/rllib_contrib/a3c/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/a3c/tuned_examples/cartpole-a3c.yaml b/rllib_contrib/a3c/tuned_examples/cartpole-a3c.yaml deleted file mode 100644 index 6c182bcc411f4..0000000000000 --- a/rllib_contrib/a3c/tuned_examples/cartpole-a3c.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-a3c: - env: CartPole-v1 - run: A3C - stop: - sampler_results/episode_reward_mean: 150 - timesteps_total: 200000 - config: - # Works for both torch and tf. - framework: torch - num_workers: 1 - gamma: 0.95 diff --git a/rllib_contrib/a3c/tuned_examples/cartpole_a3c.py b/rllib_contrib/a3c/tuned_examples/cartpole_a3c.py deleted file mode 100644 index 27fa9ee7c5367..0000000000000 --- a/rllib_contrib/a3c/tuned_examples/cartpole_a3c.py +++ /dev/null @@ -1,12 +0,0 @@ -# Run with: -# rllib train file cartpole_a3c.py \ -# --stop="{'timesteps_total': 20000, 'episode_reward_mean': 150}" -from ray.rllib.algorithms.a3c import A3CConfig - -config = ( - A3CConfig() - .training(gamma=0.95) - .environment("CartPole-v1") - .framework("tf") - .rollouts(num_rollout_workers=0) -) diff --git a/rllib_contrib/a3c/tuned_examples/memory-leak-test-a3c.yaml b/rllib_contrib/a3c/tuned_examples/memory-leak-test-a3c.yaml deleted file mode 100644 index 17dc128d7a545..0000000000000 --- a/rllib_contrib/a3c/tuned_examples/memory-leak-test-a3c.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -memory-leak-test-a3c: - stop: - timesteps_total: 150000 - env: - ray.rllib.examples.env.random_env.RandomLargeObsSpaceEnv - run: A3C - config: - # Works for both torch and tf. - framework: torch - # Switch off np.random, which is known to have memory leaks. - env_config: - config: - static_samples: true - num_workers: 4 - num_envs_per_worker: 5 diff --git a/rllib_contrib/a3c/tuned_examples/pong-a3c.yaml b/rllib_contrib/a3c/tuned_examples/pong-a3c.yaml deleted file mode 100644 index 99e70b1165308..0000000000000 --- a/rllib_contrib/a3c/tuned_examples/pong-a3c.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -pong-a3c: - env: ALE/Pong-v5 - run: A3C - config: - # Works for both torch and tf. - framework: torch - # Make analogous to old v4 + NoFrameskip. 
- env_config: - frameskip: 1 - full_action_space: false - repeat_action_probability: 0.0 - num_workers: 16 - rollout_fragment_length: 20 - vf_loss_coeff: 0.5 - entropy_coeff: 0.01 - gamma: 0.99 - grad_clip: 40.0 - lambda: 1.0 - lr: 0.0001 - observation_filter: NoFilter - preprocessor_pref: rllib - model: - use_lstm: true - conv_activation: elu - dim: 42 - grayscale: true - zero_mean: false - # Reduced channel depth and kernel size from default - conv_filters: [ - [32, [3, 3], 2], - [32, [3, 3], 2], - [32, [3, 3], 2], - [32, [3, 3], 2], - ] diff --git a/rllib_contrib/alpha_star/BUILD b/rllib_contrib/alpha_star/BUILD deleted file mode 100644 index 487cb47e7cc30..0000000000000 --- a/rllib_contrib/alpha_star/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -# Examples - -py_test( - name = "example_multi_agent_cartpole_alpha_star", - main = "multi-agent-cartpole-alpha-star.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/multi-agent-cartpole-alpha-star.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_cartpole_alpha_star", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib"], - size = "medium", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/multi-agent-cartpole-alpha-star.yaml"], - args = ["--dir=alpha_star/tuned_examples/", "--num-cpus=10"] -) - -# Compilation Tests - -py_test( - name = "test_alpha_star", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_alpha_star.py"] -) diff --git a/rllib_contrib/alpha_star/README.md b/rllib_contrib/alpha_star/README.md deleted file mode 100644 index f29be67ef3e6d..0000000000000 --- a/rllib_contrib/alpha_star/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# Alpha Star (Asynchronous Advantage Actor-Critic) - -[Alpha Star](https://www.deepmind.com/blog/alphastar-grandmaster-level-in-starcraft-ii-using-multi-agent-reinforcement-learning) Implementation of Deepmind's Alpha Star using RLlib. 
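As a quick orientation, a condensed sketch of the multi-agent setup from `examples/multi-agent-cartpole-alpha-star.py` further down; it assumes the `rllib_alpha_star` package has been installed as described under Installation:

```
from rllib_alpha_star.alpha_star import AlphaStar, AlphaStarConfig

import ray
from ray import air, tune

ray.init()

# Two policies ("p0", "p1") are trained on a multi-agent CartPole environment;
# league building is disabled via the NoLeagueBuilder, as in the example script.
config = (
    AlphaStarConfig()
    .rollouts(num_rollout_workers=2, num_envs_per_worker=5)
    .environment(
        "ray.rllib.examples.env.multi_agent.MultiAgentCartPole",
        env_config={"config": {"num_agents": 2}},
    )
    .training(
        gamma=0.95,
        vtrace=True,
        replay_buffer_replay_ratio=0.0,
        league_builder_config={
            "type": "rllib_alpha_star.alpha_star.league_builder.NoLeagueBuilder"
        },
    )
    .multi_agent(
        policies=["p0", "p1"],
        policy_mapping_fn={
            "type": "ray.rllib.examples.multi_agent_and_self_play.policy_mapping_fn.PolicyMappingFn"  # noqa
        },
    )
)

tune.Tuner(
    AlphaStar,
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop={"timesteps_total": 200000}),
).fit()
```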
- - -## Installation - -``` -conda create -n rllib-alpha-star python=3.10 -conda activate rllib-alpha-star -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[Alpha Star Example](examples/multi-agent-cartpole-alpha-star.py) \ No newline at end of file diff --git a/rllib_contrib/alpha_star/examples/multi-agent-cartpole-alpha-star.py b/rllib_contrib/alpha_star/examples/multi-agent-cartpole-alpha-star.py deleted file mode 100644 index d3d8604be23ea..0000000000000 --- a/rllib_contrib/alpha_star/examples/multi-agent-cartpole-alpha-star.py +++ /dev/null @@ -1,72 +0,0 @@ -import argparse - -from rllib_alpha_star.alpha_star import AlphaStar, AlphaStarConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - ray.init() - - config = ( - AlphaStarConfig() - .rollouts( - num_rollout_workers=2, - num_envs_per_worker=5, - observation_filter="MeanStdFilter", - ) - .resources(num_gpus=1, _fake_gpus=True) - .environment( - "ray.rllib.examples.env.multi_agent.MultiAgentCartPole", - env_config={"config": {"num_agents": 2}}, - ) - .training( - gamma=0.95, - num_sgd_iter=1, - vf_loss_coeff=0.005, - vtrace=True, - model={ - "fcnet_hiddens": [32], - "fcnet_activation": "linear", - "vf_share_layers": True, - }, - replay_buffer_replay_ratio=0.0, - league_builder_config={ - "type": "rllib_alpha_star.alpha_star.league_builder.NoLeagueBuilder" - }, - ) - .multi_agent( - policies=["p0", "p1"], - policy_mapping_fn={ - "type": "ray.rllib.examples.multi_agent_and_self_play.policy_mapping_fn.PolicyMappingFn" # noqa - }, - ) - .debugging(seed=0) - ) - - num_iterations = 100 - stop_reward = 300 - - tuner = tune.Tuner( - AlphaStar, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={"episode_reward_mean": stop_reward, "timesteps_total": 200000}, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/alpha_star/pyproject.toml b/rllib_contrib/alpha_star/pyproject.toml deleted file mode 100644 index c81781a9af0b6..0000000000000 --- a/rllib_contrib/alpha_star/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-alpha-star" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gym", "gymnasium==0.26.3", "ray[rllib]==2.5.0", "open-spiel==1.3"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/alpha_star/requirements.txt b/rllib_contrib/alpha_star/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/alpha_star/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/__init__.py 
b/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/__init__.py deleted file mode 100644 index 4a5679d83eb74..0000000000000 --- a/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from rllib_alpha_star.alpha_star.alpha_star import AlphaStar, AlphaStarConfig -from rllib_alpha_star.alpha_star.distributed_learners import DistributedLearners -from rllib_alpha_star.alpha_star.league_builder import ( - AlphaStarLeagueBuilder, - LeagueBuilder, - NoLeagueBuilder, -) - -from ray.tune.registry import register_trainable - -__all__ = [ - "AlphaStarConfig", - "AlphaStar", - "DistributedLearners", - "LeagueBuilder", - "AlphaStarLeagueBuilder", - "NoLeagueBuilder", -] - -register_trainable("rllib-contrib-alpha-star", AlphaStar) diff --git a/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/alpha_star.py b/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/alpha_star.py deleted file mode 100644 index 3850954e78e6d..0000000000000 --- a/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/alpha_star.py +++ /dev/null @@ -1,635 +0,0 @@ -""" -A multi-agent, distributed multi-GPU, league-capable asynch. PPO -================================================================ -""" -from typing import Any, Dict, Optional, Type, Union - -import gymnasium as gym -import tree -from rllib_alpha_star.alpha_star.distributed_learners import DistributedLearners -from rllib_alpha_star.alpha_star.league_builder import AlphaStarLeagueBuilder - -import ray -import ray.rllib.algorithms.appo.appo as appo -from ray.actor import ActorHandle -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.evaluation.rollout_worker import RolloutWorker -from ray.rllib.execution.buffers.mixin_replay_buffer import MixInMultiAgentReplayBuffer -from ray.rllib.execution.parallel_requests import AsyncRequestsManager -from ray.rllib.policy.policy import Policy, PolicySpec -from ray.rllib.policy.sample_batch import MultiAgentBatch -from ray.rllib.utils import deep_update -from ray.rllib.utils.annotations import override -from ray.rllib.utils.from_config import from_config -from ray.rllib.utils.metrics import ( - LAST_TARGET_UPDATE_TS, - LEARN_ON_BATCH_TIMER, - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_TARGET_UPDATES, - SAMPLE_TIMER, - SYNCH_WORKER_WEIGHTS_TIMER, - TARGET_NET_UPDATE_TIMER, -) -from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY -from ray.rllib.utils.typing import ( - PartialAlgorithmConfigDict, - PolicyID, - PolicyState, - ResultDict, -) -from ray.tune.execution.placement_groups import PlacementGroupFactory -from ray.util.timer import _Timer - - -class AlphaStarConfig(appo.APPOConfig): - """Defines a configuration class from which an AlphaStar Algorithm can be built. - - Example: - >>> from ray.rllib.algorithms.alpha_star import AlphaStarConfig - >>> config = AlphaStarConfig().training(lr=0.0003, train_batch_size=512)\ - ... .resources(num_gpus=4)\ - ... .rollouts(num_rollout_workers=64) - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.alpha_star import AlphaStarConfig - >>> from ray import air - >>> from ray import tune - >>> config = AlphaStarConfig() - >>> # Print out some default values. 
- >>> print(config.vtrace) # doctest: +SKIP - >>> # Update the config object. - >>> config.training(lr=tune.grid_search([0.0001, 0.0003]), grad_clip=20.0) - >>> # Set the config object's env. - >>> config.environment(env="CartPole-v1") - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "AlphaStar", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a AlphaStarConfig instance.""" - super().__init__(algo_class=algo_class or AlphaStar) - - # fmt: off - # __sphinx_doc_begin__ - - # AlphaStar specific settings: - self.replay_buffer_capacity = 20 - self.replay_buffer_replay_ratio = 0.5 - # Tuning max_requests_in_flight_per_sampler_worker and - # max_requests_in_flight_per_learner_worker is important so backpressure is - # created on the remote workers and the object store doesn't fill up - # unexpectedly. If the workers spend time idle, consider increasing these. - self.max_requests_in_flight_per_sampler_worker = 2 - self.max_requests_in_flight_per_learner_worker = 2 - - self.timeout_s_sampler_manager = 0.0 - self.timeout_s_learner_manager = 0.0 - - # League-building parameters. - # The LeagueBuilder class to be used for league building logic. - self.league_builder_config = { - # Specify the sub-class of the `LeagueBuilder` API to use. - "type": AlphaStarLeagueBuilder, - - # Any any number of constructor kwargs to pass to this class: - - # The number of random policies to add to the league. This must be an - # even number (including 0) as these will be evenly distributed - # amongst league- and main- exploiters. - "num_random_policies": 2, - # The number of initially learning league-exploiters to create. - "num_learning_league_exploiters": 4, - # The number of initially learning main-exploiters to create. - "num_learning_main_exploiters": 4, - # Minimum win-rate (between 0.0 = 0% and 1.0 = 100%) of any policy to - # be considered for snapshotting (cloning). The cloned copy may then - # be frozen (no further learning) or keep learning (independent of - # its ancestor policy). - # Set this to lower values to speed up league growth. - "win_rate_threshold_for_new_snapshot": 0.9, - # If we took a new snapshot of any given policy, what's the probability - # that this snapshot will continue to be trainable (rather than become - # frozen/non-trainable)? By default, only keep those policies trainable - # that have been trainable from the very beginning. - "keep_new_snapshot_training_prob": 0.0, - # Probabilities of different match-types: - # LE: Learning league_exploiter vs any. - # ME: Learning main exploiter vs any main. - # M: Main self-play (p=1.0 - LE - ME). - "prob_league_exploiter_match": 0.33, - "prob_main_exploiter_match": 0.33, - # Only for ME matches: Prob to play against learning - # main (vs a snapshot main). - "prob_main_exploiter_playing_against_learning_main": 0.5, - } - self.max_num_policies_to_train = None - - # Override some of APPOConfig's default values with AlphaStar-specific - # values. 
- self.min_time_s_per_iteration = 2 - self.policies = None - self.simple_optimizer = True - # __sphinx_doc_end__ - # fmt: on - - @override(appo.APPOConfig) - def training( - self, - *, - replay_buffer_capacity: Optional[int] = NotProvided, - replay_buffer_replay_ratio: Optional[float] = NotProvided, - max_requests_in_flight_per_sampler_worker: Optional[int] = NotProvided, - max_requests_in_flight_per_learner_worker: Optional[int] = NotProvided, - timeout_s_sampler_manager: Optional[float] = NotProvided, - timeout_s_learner_manager: Optional[float] = NotProvided, - league_builder_config: Optional[Dict[str, Any]] = NotProvided, - max_num_policies_to_train: Optional[int] = NotProvided, - **kwargs, - ) -> "AlphaStarConfig": - """Sets the training related configuration. - - Args: - replay_buffer_capacity: This is num batches held at any time for each - policy. - replay_buffer_replay_ratio: For example, ratio=0.2 -> 20% of samples in - each train batch are old (replayed) ones. - timeout_s_sampler_manager: Timeout to use for `ray.wait()` when waiting for - samplers to have placed new data into the buffers. If no samples are - ready within the timeout, the buffers used for mixin-sampling will - return only older samples. - timeout_s_learner_manager: Timeout to use for `ray.wait()` when waiting for - the policy learner actors to have performed an update and returned - learning stats. If no learner actors have produced any learning - results in the meantime, their learner-stats in the results will be - empty for that iteration. - max_requests_in_flight_per_sampler_worker: Maximum number of ray remote - calls that can be run in parallel for each sampler worker. This is - particularly important when dealing with many sampler workers or - sample batches that are large, and when could potentially fill up - the object store. - max_requests_in_flight_per_learner_worker: Maximum number of ray remote - calls that can be run in parallel for each learner worker. This is - important to tune when dealing with many learner workers so that the - object store doesn't fill up and so that learner actors don't become - backed up with too many requests that could become stale if not - attended to in a timely manner. - league_builder_config: League-building config dict. - The dict Must contain a `type` key indicating the LeagueBuilder class - to be used for league building logic. All other keys (that are not - `type`) will be used as constructor kwargs on the given class to - construct the LeagueBuilder instance. See the - `ray.rllib.algorithms.alpha_star.league_builder::AlphaStarLeagueBuilder` - (used by default by this algo) as an example. - max_num_policies_to_train: The maximum number of trainable policies for this - Algorithm. Each trainable policy will exist as a independent remote - actor, co-located with a replay buffer. This is besides its existence - inside the RolloutWorkers for training and evaluation. Set to None for - automatically inferring this value from the number of trainable - policies found in the `multiagent` config. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - # TODO: Unify the buffer API, then clean up our existing - # implementations of different buffers. 
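        # Illustrative only: the values below are hypothetical, but every keyword
        # is a parameter of this `training()` method, so a typical override could
        # look like:
        #
        #     config = AlphaStarConfig().training(
        #         replay_buffer_capacity=20,
        #         replay_buffer_replay_ratio=0.2,
        #         max_requests_in_flight_per_sampler_worker=2,
        #         league_builder_config={
        #             "type": AlphaStarLeagueBuilder,
        #             "win_rate_threshold_for_new_snapshot": 0.8,
        #         },
        #     )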
- if replay_buffer_capacity is not NotProvided: - self.replay_buffer_capacity = replay_buffer_capacity - if replay_buffer_replay_ratio is not NotProvided: - self.replay_buffer_replay_ratio = replay_buffer_replay_ratio - if timeout_s_sampler_manager is not NotProvided: - self.timeout_s_sampler_manager = timeout_s_sampler_manager - if timeout_s_learner_manager is not NotProvided: - self.timeout_s_learner_manager = timeout_s_learner_manager - if league_builder_config is not NotProvided: - # Override entire `league_builder_config` if `type` key changes. - # Update, if `type` key remains the same or is not specified. - new_league_builder_config = deep_update( - {"league_builder_config": self.league_builder_config}, - {"league_builder_config": league_builder_config}, - False, - ["league_builder_config"], - ["league_builder_config"], - ) - self.league_builder_config = new_league_builder_config[ - "league_builder_config" - ] - if max_num_policies_to_train is not NotProvided: - self.max_num_policies_to_train = max_num_policies_to_train - if max_requests_in_flight_per_sampler_worker is not NotProvided: - self.max_requests_in_flight_per_sampler_worker = ( - max_requests_in_flight_per_sampler_worker - ) - if max_requests_in_flight_per_learner_worker is not NotProvided: - self.max_requests_in_flight_per_learner_worker = ( - max_requests_in_flight_per_learner_worker - ) - - return self - - -class AlphaStar(appo.APPO): - _allow_unknown_subkeys = appo.APPO._allow_unknown_subkeys + [ - "league_builder_config", - ] - _override_all_subkeys_if_type_changes = ( - appo.APPO._override_all_subkeys_if_type_changes - + [ - "league_builder_config", - ] - ) - - @classmethod - @override(Algorithm) - def default_resource_request( - cls, - config: Union[AlgorithmConfig, PartialAlgorithmConfigDict], - ): - if isinstance(config, AlgorithmConfig): - cf: AlphaStarConfig = config - else: - cf: AlphaStarConfig = cls.get_default_config().update_from_dict(config) - # Construct a dummy LeagueBuilder, such that it gets the opportunity to - # adjust the multiagent config, according to its setup, and we can then - # properly infer the resources to allocate. - from_config(cf.league_builder_config, algo=None, algo_config=cf) - - max_num_policies_to_train = cf.max_num_policies_to_train or len( - cf.policies_to_train or cf.policies - ) - num_learner_shards = min( - cf.num_gpus or max_num_policies_to_train, max_num_policies_to_train - ) - num_gpus_per_shard = cf.num_gpus / num_learner_shards - num_policies_per_shard = max_num_policies_to_train / num_learner_shards - - fake_gpus = cf._fake_gpus - - eval_config = cf.get_evaluation_config_object() - - # Return PlacementGroupFactory containing all needed resources - # (already properly defined as device bundles). - return PlacementGroupFactory( - bundles=[ - { - # Driver (no GPUs). - "CPU": cf.num_cpus_for_local_worker, - } - ] - + [ - { - # RolloutWorkers (no GPUs). - "CPU": cf.num_cpus_per_worker, - } - for _ in range(cf.num_workers) - ] - + [ - { - # Policy learners (and Replay buffer shards). - # 1 CPU for the replay buffer. - # 1 CPU (or fractional GPU) for each learning policy. - "CPU": 1 + (num_policies_per_shard if fake_gpus else 0), - "GPU": 0 if fake_gpus else num_gpus_per_shard, - } - for _ in range(num_learner_shards) - ] - + ( - [ - { - # Evaluation (remote) workers. - # Note: The local eval worker is located on the driver - # CPU or not even created iff >0 eval workers. 
- "CPU": eval_config.get( - "num_cpus_per_worker", cf.num_cpus_per_worker - ), - } - for _ in range(cf.evaluation_num_workers) - ] - if cf.evaluation_interval - else [] - ), - strategy=cf.placement_strategy, - ) - - @classmethod - @override(appo.APPO) - def get_default_config(cls) -> AlphaStarConfig: - return AlphaStarConfig() - - @override(appo.APPO) - def setup(self, config: AlphaStarConfig): - # Create the LeagueBuilder object, allowing it to build the multiagent - # config as well. - self.league_builder = from_config( - self.config.league_builder_config, algo=self, algo_config=self.config - ) - - # Call super's setup to validate config, create RolloutWorkers - # (train and eval), etc.. - super().setup(config) - - local_worker = self.workers.local_worker() - - # - Create n policy learner actors (@ray.remote-converted Policies) on - # one or more GPU nodes. - # - On each such node, also locate one replay buffer shard. - - # Single CPU replay shard (co-located with GPUs so we can place the - # policies on the same machine(s)). - num_gpus = 0.01 if (self.config.num_gpus and not self.config._fake_gpus) else 0 - ReplayActor = ray.remote( - num_cpus=1, - num_gpus=num_gpus, - )(MixInMultiAgentReplayBuffer) - - # Setup remote replay buffer shards and policy learner actors - # (located on any GPU machine in the cluster): - replay_actor_args = [ - self.config["replay_buffer_capacity"], - self.config["replay_buffer_replay_ratio"], - ] - - # Create a DistributedLearners utility object and set it up with - # the initial first n learnable policies (found in the config). - distributed_learners = DistributedLearners( - config=self.config, - # By default, set max_num_policies_to_train to the number of policy IDs - # provided in the multiagent config. - max_num_policies_to_train=( - self.config.max_num_policies_to_train - or len(self.workers.local_worker().get_policies_to_train()) - ), - replay_actor_class=ReplayActor, - replay_actor_args=replay_actor_args, - ) - policies, _ = self.config.get_multi_agent_setup( - spaces=local_worker.spaces, - default_policy_class=local_worker.default_policy_class, - ) - for pid, policy_spec in policies.items(): - if ( - local_worker.is_policy_to_train is None - or local_worker.is_policy_to_train(pid) - ): - distributed_learners.add_policy(pid, policy_spec) - - # Store distributed_learners on all RolloutWorkers - # so they know, to which replay shard to send samples to. - - def _set_policy_learners(worker): - worker._distributed_learners = distributed_learners - - ray.get( - [ - w.apply.remote(_set_policy_learners) - for w in self.workers.remote_workers() - ] - ) - - self.distributed_learners = distributed_learners - self._sampling_actor_manager = AsyncRequestsManager( - self.workers.remote_workers(), - max_remote_requests_in_flight_per_worker=self.config[ - "max_requests_in_flight_per_sampler_worker" - ], - ray_wait_timeout_s=self.config.timeout_s_sampler_manager, - ) - policy_actors = [policy_actor for _, policy_actor, _ in distributed_learners] - self._learner_worker_manager = AsyncRequestsManager( - workers=policy_actors, - max_remote_requests_in_flight_per_worker=self.config[ - "max_requests_in_flight_per_learner_worker" - ], - ray_wait_timeout_s=self.config.timeout_s_learner_manager, - ) - - @override(Algorithm) - def step(self) -> ResultDict: - # Perform a full step (including evaluation). - result = super().step() - - # Based on the (train + evaluate) results, perform a step of - # league building. 
- self.league_builder.build_league(result=result) - - return result - - @override(Algorithm) - def training_step(self) -> ResultDict: - # Trigger asynchronous rollouts on all RolloutWorkers. - # - Rollout results are sent directly to correct replay buffer - # shards, instead of here (to the driver). - with self._timers[SAMPLE_TIMER]: - # if there are no remote workers (e.g. num_workers=0) - if not self.workers.remote_workers(): - worker = self.workers.local_worker() - statistics = worker.apply(self._sample_and_send_to_buffer) - sample_results = {worker: [statistics]} - else: - self._sampling_actor_manager.call_on_all_available( - self._sample_and_send_to_buffer - ) - sample_results = self._sampling_actor_manager.get_ready() - # Update sample counters. - for sample_result in sample_results.values(): - for (env_steps, agent_steps) in sample_result: - self._counters[NUM_ENV_STEPS_SAMPLED] += env_steps - self._counters[NUM_AGENT_STEPS_SAMPLED] += agent_steps - - # Trigger asynchronous training update requests on all learning - # policies. - with self._timers[LEARN_ON_BATCH_TIMER]: - for pid, pol_actor, repl_actor in self.distributed_learners: - if pol_actor not in self._learner_worker_manager.workers: - self._learner_worker_manager.add_workers(pol_actor) - self._learner_worker_manager.call( - self._update_policy, actor=pol_actor, fn_args=[repl_actor, pid] - ) - train_results = self._learner_worker_manager.get_ready() - - # Update sample counters. - for train_result in train_results.values(): - for result in train_result: - if NUM_AGENT_STEPS_TRAINED in result: - self._counters[NUM_AGENT_STEPS_TRAINED] += result[ - NUM_AGENT_STEPS_TRAINED - ] - - # For those policies that have been updated in this iteration - # (not all policies may have undergone an updated as we are - # requesting updates asynchronously): - # - Gather train infos. - # - Update weights to those remote rollout workers that contain - # the respective policy. - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - train_infos = {} - policy_weights = {} - for pol_actor, policy_results in train_results.items(): - results_have_same_structure = True - for result1, result2 in zip(policy_results, policy_results[1:]): - try: - tree.assert_same_structure(result1, result2) - except (ValueError, TypeError): - results_have_same_structure = False - break - if len(policy_results) > 1 and results_have_same_structure: - policy_result = tree.map_structure( - lambda *_args: sum(_args) / len(policy_results), *policy_results - ) - else: - policy_result = policy_results[-1] - if policy_result: - pid = self.distributed_learners.get_policy_id(pol_actor) - train_infos[pid] = policy_result - policy_weights[pid] = pol_actor.get_weights.remote() - - policy_weights_ref = ray.put(policy_weights) - - global_vars = { - "timestep": self._counters[NUM_ENV_STEPS_SAMPLED], - "league_builder": self.league_builder.__getstate__(), - } - - for worker in self.workers.remote_workers(): - worker.set_weights.remote(policy_weights_ref, global_vars) - - return train_infos - - @override(Algorithm) - def add_policy( - self, - policy_id: PolicyID, - policy_cls: Type[Policy], - *, - observation_space: Optional[gym.spaces.Space] = None, - action_space: Optional[gym.spaces.Space] = None, - config: Optional[PartialAlgorithmConfigDict] = None, - policy_state: Optional[PolicyState] = None, - **kwargs, - ) -> Policy: - # Add the new policy to all our train- and eval RolloutWorkers - # (including the local worker). 
- new_policy = super().add_policy( - policy_id, - policy_cls, - observation_space=observation_space, - action_space=action_space, - config=config, - policy_state=policy_state, - **kwargs, - ) - - # Do we have to create a policy-learner actor from it as well? - if policy_id in kwargs.get("policies_to_train", []): - new_policy_actor = self.distributed_learners.add_policy( - policy_id, - PolicySpec( - policy_cls, - new_policy.observation_space, - new_policy.action_space, - self.config, - ), - ) - # Set state of new policy actor, if provided. - if policy_state is not None: - ray.get(new_policy_actor.set_state.remote(policy_state)) - - return new_policy - - @override(Algorithm) - def cleanup(self) -> None: - super().cleanup() - # Stop all policy- and replay actors. - self.distributed_learners.stop() - - @staticmethod - def _sample_and_send_to_buffer(worker: RolloutWorker): - # Generate a sample. - sample = worker.sample() - # Send the per-agent SampleBatches to the correct buffer(s), - # depending on which policies participated in the episode. - assert isinstance(sample, MultiAgentBatch) - for pid, batch in sample.policy_batches.items(): - # Don't send data, if policy is not trainable. - replay_actor, _ = worker._distributed_learners.get_replay_and_policy_actors( - pid - ) - if replay_actor is not None: - ma_batch = MultiAgentBatch({pid: batch}, batch.count) - replay_actor.add.remote(ma_batch) - # Return counts (env-steps, agent-steps). - return sample.count, sample.agent_steps() - - @staticmethod - def _update_policy(policy: Policy, replay_actor: ActorHandle, pid: PolicyID): - if not hasattr(policy, "_target_and_kl_stats"): - policy._target_and_kl_stats = { - LAST_TARGET_UPDATE_TS: 0, - NUM_TARGET_UPDATES: 0, - NUM_AGENT_STEPS_TRAINED: 0, - TARGET_NET_UPDATE_TIMER: _Timer(), - } - - train_results = policy.learn_on_batch_from_replay_buffer( - replay_actor=replay_actor, policy_id=pid - ) - - if not train_results: - return train_results - - # Update target net and KL. - with policy._target_and_kl_stats[TARGET_NET_UPDATE_TIMER]: - policy._target_and_kl_stats[NUM_AGENT_STEPS_TRAINED] += train_results[ - NUM_AGENT_STEPS_TRAINED - ] - target_update_freq = ( - policy.config["num_sgd_iter"] - * policy.config["replay_buffer_capacity"] - * policy.config["train_batch_size"] - ) - cur_ts = policy._target_and_kl_stats[NUM_AGENT_STEPS_TRAINED] - last_update = policy._target_and_kl_stats[LAST_TARGET_UPDATE_TS] - - # Update target networks on all policy learners. - if cur_ts - last_update > target_update_freq: - policy._target_and_kl_stats[NUM_TARGET_UPDATES] += 1 - policy._target_and_kl_stats[LAST_TARGET_UPDATE_TS] = cur_ts - policy.update_target() - # Also update Policy's current KL coeff. - if policy.config["use_kl_loss"]: - kl = train_results[LEARNER_STATS_KEY].get("kl") - assert kl is not None, train_results - # Make the actual `Policy.update_kl()` call. 
- policy.update_kl(kl) - - return train_results - - @override(appo.APPO) - def __getstate__(self) -> dict: - state = super().__getstate__() - state.update( - { - "league_builder": self.league_builder.__getstate__(), - } - ) - return state - - @override(appo.APPO) - def __setstate__(self, state: dict) -> None: - state_copy = state.copy() - self.league_builder.__setstate__(state.pop("league_builder", {})) - super().__setstate__(state_copy) diff --git a/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/distributed_learners.py b/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/distributed_learners.py deleted file mode 100644 index 0aa49b74b7ab9..0000000000000 --- a/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/distributed_learners.py +++ /dev/null @@ -1,267 +0,0 @@ -import math -from typing import Any, Dict, List, Optional, Type - -import ray -from ray.actor import ActorHandle -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.actors import create_colocated_actors -from ray.rllib.utils.tf_utils import get_tf_eager_cls_if_necessary -from ray.rllib.utils.typing import AlgorithmConfigDict, PolicyID - - -class DistributedLearners: - """Container class for n learning @ray.remote-turned policies. - - The container contains n "learner shards", each one consisting of one - multi-agent replay buffer and m policy actors that share this replay - buffer. - """ - - def __init__( - self, - *, - config, - max_num_policies_to_train: int, - replay_actor_class: Type[ActorHandle], - replay_actor_args: List[Any], - num_learner_shards: Optional[int] = None, - ): - """Initializes a DistributedLearners instance. - - Args: - config: The Algorithm's config dict. - max_num_policies_to_train: Maximum number of policies that will ever be - trainable. For these policies, we'll have to create remote - policy actors, distributed across n "learner shards". - num_learner_shards: Optional number of "learner shards" to reserve. - Each one consists of one multi-agent replay actor and - m policy actors that share this replay buffer. If None, - will infer this number automatically from the number of GPUs - and the max. number of learning policies. - replay_actor_class: The class to use to produce one multi-agent - replay buffer on each learner shard (shared by all policy actors - on that shard). - replay_actor_args: The args to pass to the remote replay buffer - actor's constructor. - """ - self.config = config - self.num_gpus = self.config.num_gpus - self.max_num_policies_to_train = max_num_policies_to_train - self.replay_actor_class = replay_actor_class - self.replay_actor_args = replay_actor_args - - # Auto-num-learner-shard detection: - # Examples: - # 4 GPUs + max. 10 policies to train -> 4 shards (0.4 GPU/pol). - # 8 GPUs + max. 3 policies to train -> 3 shards (2 GPUs/pol). - # 8 GPUs + max. 2 policies to train -> 2 shards (4 GPUs/pol). - # 2 GPUs + max. 5 policies to train -> 2 shards (0.4 GPUs/pol). 
- if num_learner_shards is None: - self.num_learner_shards = min( - self.num_gpus or self.max_num_policies_to_train, - self.max_num_policies_to_train, - ) - # - else: - self.num_learner_shards = num_learner_shards - - self.num_gpus_per_shard = self.num_gpus // self.num_learner_shards - if self.num_gpus_per_shard == 0: - self.num_gpus_per_shard = self.num_gpus / self.num_learner_shards - - num_policies_per_shard = ( - self.max_num_policies_to_train / self.num_learner_shards - ) - self.num_gpus_per_policy = self.num_gpus_per_shard / num_policies_per_shard - self.num_policies_per_shard = math.ceil(num_policies_per_shard) - - self.shards = [ - _Shard( - config=self.config, - max_num_policies=self.num_policies_per_shard, - num_gpus_per_policy=self.num_gpus_per_policy, - replay_actor_class=self.replay_actor_class, - replay_actor_args=self.replay_actor_args, - ) - for _ in range(self.num_learner_shards) - ] - - def add_policy(self, policy_id, policy_spec): - # Find first empty slot. - for shard in self.shards: - if shard.max_num_policies > len(shard.policy_actors): - pol_actor = shard.add_policy(policy_id, policy_spec) - return pol_actor - raise RuntimeError("All shards are full!") - - def remove_policy(self): - raise NotImplementedError - - def get_policy_actor(self, policy_id): - for shard in self.shards: - if policy_id in shard.policy_actors: - return shard.policy_actors[policy_id] - raise None - - def get_replay_and_policy_actors(self, policy_id): - for shard in self.shards: - if policy_id in shard.policy_actors: - return shard.replay_actor, shard.policy_actors[policy_id] - return None, None - - def get_policy_id(self, policy_actor): - for shard in self.shards: - for pid, act in shard.policy_actors.items(): - if act == policy_actor: - return pid - raise None - - def get_replay_actors(self): - return [shard.replay_actor for shard in self.shards] - - def stop(self) -> None: - """Terminates all ray actors.""" - for shard in self.shards: - shard.stop() - - def __len__(self): - """Returns the number of all Policy actors in all our shards.""" - return sum(len(s) for s in self.shards) - - def __iter__(self): - def _gen(): - for shard in self.shards: - for pid, policy_actor in shard.policy_actors.items(): - yield pid, policy_actor, shard.replay_actor - - return _gen() - - -class _Shard: - def __init__( - self, - config, - max_num_policies, - num_gpus_per_policy, - replay_actor_class, - replay_actor_args, - ): - # For now, remain in config dict-land (b/c we are dealing with Policy classes - # here which do NOT use AlgorithmConfig yet). - if isinstance(config, AlgorithmConfig): - config = config.to_dict() - self.config = config - self.has_replay_buffer = False - self.max_num_policies = max_num_policies - self.num_gpus_per_policy = num_gpus_per_policy - self.replay_actor_class = replay_actor_class - self.replay_actor_args = replay_actor_args - - self.replay_actor: Optional[ActorHandle] = None - self.policy_actors: Dict[str, ActorHandle] = {} - - def add_policy(self, policy_id: PolicyID, policy_spec: PolicySpec): - # Merge the policies config overrides with the main config. - # Also, adjust `num_gpus` (to indicate an individual policy's - # num_gpus, not the total number of GPUs). - cfg = Algorithm.merge_trainer_configs( - self.config, - dict(policy_spec.config, **{"num_gpus": self.num_gpus_per_policy}), - ) - - # Need to create the replay actor first. Then add the first policy. 
- if self.replay_actor is None: - return self._add_replay_buffer_and_policy(policy_id, policy_spec, cfg) - - # Replay actor already exists -> Just add a new policy here. - - assert len(self.policy_actors) < self.max_num_policies - - actual_policy_class = get_tf_eager_cls_if_necessary( - policy_spec.policy_class, cfg - ) - - colocated = create_colocated_actors( - actor_specs=[ - ( - ray.remote( - num_cpus=1, - num_gpus=self.num_gpus_per_policy - if not cfg["_fake_gpus"] - else 0, - )(actual_policy_class), - # Policy c'tor args. - (policy_spec.observation_space, policy_spec.action_space, cfg), - # Policy c'tor kwargs={}. - {}, - # Count=1, - 1, - ) - ], - # Force co-locate on the already existing replay actor's node. - node=ray.get(self.replay_actor.get_host.remote()), - ) - - self.policy_actors[policy_id] = colocated[0][0] - - return self.policy_actors[policy_id] - - def _add_replay_buffer_and_policy( - self, - policy_id: PolicyID, - policy_spec: PolicySpec, - config: AlgorithmConfigDict, - ): - assert self.replay_actor is None - assert len(self.policy_actors) == 0 - - actual_policy_class = get_tf_eager_cls_if_necessary( - policy_spec.policy_class, config - ) - - if isinstance(config, AlgorithmConfig): - config = config.to_dict() - - colocated = create_colocated_actors( - actor_specs=[ - (self.replay_actor_class, self.replay_actor_args, {}, 1), - ] - + [ - ( - ray.remote( - num_cpus=1, - num_gpus=self.num_gpus_per_policy - if not config["_fake_gpus"] - else 0, - )(actual_policy_class), - # Policy c'tor args. - (policy_spec.observation_space, policy_spec.action_space, config), - # Policy c'tor kwargs={}. - {}, - # Count=1, - 1, - ) - ], - node=None, - ) # None - - self.replay_actor = colocated[0][0] - self.policy_actors[policy_id] = colocated[1][0] - self.has_replay_buffer = True - - return self.policy_actors[policy_id] - - def stop(self): - """Terminates all ray actors (replay and n policy actors).""" - # Terminate the replay actor. - self.replay_actor.__ray_terminate__.remote() - - # Terminate all policy actors. - for pid, policy_actor in self.policy_actors.items(): - policy_actor.__ray_terminate__.remote() - - def __len__(self): - """Returns the number of Policy actors in this shard.""" - return len(self.policy_actors) diff --git a/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/league_builder.py b/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/league_builder.py deleted file mode 100644 index 5c7125c856bff..0000000000000 --- a/rllib_contrib/alpha_star/src/rllib_alpha_star/alpha_star/league_builder.py +++ /dev/null @@ -1,395 +0,0 @@ -import logging -import re -from abc import ABCMeta -from collections import defaultdict -from typing import Any, DefaultDict, Dict - -import numpy as np - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.examples.policy.random_policy import RandomPolicy -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.annotations import ExperimentalAPI, override -from ray.rllib.utils.numpy import softmax -from ray.rllib.utils.typing import PolicyID, ResultDict - -logger = logging.getLogger(__name__) - - -@ExperimentalAPI -class LeagueBuilder(metaclass=ABCMeta): - def __init__(self, algo: Algorithm, algo_config: AlgorithmConfig): - """Initializes a LeagueBuilder instance. - - Args: - algo: The Algorithm object by which this league builder is used. - Algorithm calls `build_league()` after each training step. 
- algo_config: The (not yet validated) config to be - used on the Algorithm. Child classes of `LeagueBuilder` - should preprocess this to add e.g. multiagent settings - to this config. - """ - self.algo = algo - self.config = algo_config - - def build_league(self, result: ResultDict) -> None: - """Method containing league-building logic. Called after train step. - - Args: - result: The most recent result dict with all necessary stats in - it (e.g. episode rewards) to perform league building - operations. - """ - raise NotImplementedError - - def __getstate__(self) -> Dict[str, Any]: - """Returns a state dict, mapping str keys to state variables. - - Returns: - The current state dict of this LeagueBuilder. - """ - return {} - - -@ExperimentalAPI -class NoLeagueBuilder(LeagueBuilder): - """A LeagueBuilder that does nothing. - - Useful for simple, non-league-building multi-agent setups. - See e.g. - `rllib/tuned_examples/alpha_star/multi-agent-cart-pole-alpha-star.yaml` - """ - - def build_league(self, result: ResultDict) -> None: - pass - - -@ExperimentalAPI -class AlphaStarLeagueBuilder(LeagueBuilder): - def __init__( - self, - algo: Algorithm, - algo_config: AlgorithmConfig, - num_random_policies: int = 2, - num_learning_league_exploiters: int = 4, - num_learning_main_exploiters: int = 4, - win_rate_threshold_for_new_snapshot: float = 0.8, - keep_new_snapshot_training_prob: float = 0.0, - prob_league_exploiter_match: float = 0.33, - prob_main_exploiter_match: float = 0.33, - prob_main_exploiter_playing_against_learning_main: float = 0.5, - ): - """Initializes a AlphaStarLeagueBuilder instance. - - The following match types are possible: - LE: A learning (not snapshot) league_exploiter vs any snapshot policy. - ME: A learning (not snapshot) main exploiter vs any main. - M: Main self-play (main vs main). - - Args: - algo: The Algorithm object by which this league builder is used. - Algorithm calls `build_league()` after each training step to reconfigure - the league structure (e.g. to add/remove policies). - algo_config: The (not yet validated) config to be - used on the Algorithm. Child classes of `LeagueBuilder` - should preprocess this to add e.g. multiagent settings - to this config. - num_random_policies: The number of random policies to add to the - league. This must be an even number (including 0) as these - will be evenly distributed amongst league- and main- exploiters. - num_learning_league_exploiters: The number of initially learning - league-exploiters to create. - num_learning_main_exploiters: The number of initially learning - main-exploiters to create. - win_rate_threshold_for_new_snapshot: The win-rate to be achieved - for a learning policy to get snapshot'd (forked into `self` + - a new learning or non-learning copy of `self`). - keep_new_snapshot_training_prob: The probability with which a new - snapshot should keep training. Note that the policy from which - this snapshot is taken will continue to train regardless. - prob_league_exploiter_match: Probability of an episode to become a - league-exploiter vs snapshot match. - prob_main_exploiter_match: Probability of an episode to become a - main-exploiter vs main match. - prob_main_exploiter_playing_against_learning_main: Probability of - a main-exploiter vs (training!) main match. 
- """ - super().__init__(algo, algo_config) - - self.win_rate_threshold_for_new_snapshot = win_rate_threshold_for_new_snapshot - self.keep_new_snapshot_training_prob = keep_new_snapshot_training_prob - self.prob_league_exploiter_match = prob_league_exploiter_match - self.prob_main_exploiter_match = prob_main_exploiter_match - self.prob_main_exploiter_playing_against_learning_main = ( - prob_main_exploiter_playing_against_learning_main - ) - # Store the win rates for league overview printouts. - self.win_rates: DefaultDict[PolicyID, float] = defaultdict(float) - - assert num_random_policies % 2 == 0, ( - "ERROR: `num_random_policies` must be even number (we'll distribute " - "these evenly amongst league- and main-exploiters)!" - ) - - # Build trainer's multiagent config. - self.config._is_frozen = False - # Make sure the multiagent config dict has no policies defined: - assert self.config.policies is None, ( - "ERROR: `config.policies` should be None (not pre-defined by user)! " - "AlphaStarLeagueBuilder will construct this itself." - ) - policies = {} - - self.main_policies = 1 - self.league_exploiters = ( - num_learning_league_exploiters + num_random_policies / 2 - ) - self.main_exploiters = num_learning_main_exploiters + num_random_policies / 2 - - # Add 1 initial (learning) main policy. - policies["main_0"] = PolicySpec() - - # Train all non-random policies that exist at beginning. - policies_to_train = ["main_0"] - - # Add random policies. - i = -1 - for i in range(num_random_policies // 2): - policies[f"league_exploiter_{i}"] = PolicySpec(policy_class=RandomPolicy) - policies[f"main_exploiter_{i}"] = PolicySpec(policy_class=RandomPolicy) - # Add initial (learning) league-exploiters. - for j in range(num_learning_league_exploiters): - pid = f"league_exploiter_{j + i + 1}" - policies[pid] = PolicySpec() - policies_to_train.append(pid) - # Add initial (learning) main-exploiters. - for j in range(num_learning_league_exploiters): - pid = f"main_exploiter_{j + i + 1}" - policies[pid] = PolicySpec() - policies_to_train.append(pid) - - # Build initial policy mapping function: main_0 vs main_exploiter_0. - self.config.policy_mapping_fn = ( - lambda agent_id, episode, worker, **kw: "main_0" - if episode.episode_id % 2 == agent_id - else "main_exploiter_0" - ) - self.config.policies = policies - self.config.policies_to_train = policies_to_train - self.config.freeze() - - @override(LeagueBuilder) - def build_league(self, result: ResultDict) -> None: - local_worker = self.algo.workers.local_worker() - - # If no evaluation results -> Use hist data gathered for training. - if "evaluation" in result: - hist_stats = result["evaluation"]["hist_stats"] - else: - hist_stats = result["hist_stats"] - - # TODO: Add example on how to use callable here, instead of updating - # policies_to_train via this simple set. - trainable_policies = local_worker.get_policies_to_train() - non_trainable_policies = ( - set(local_worker.policy_map.keys()) - trainable_policies - ) - - logger.info(f"League building after iter {self.algo.iteration}:") - - # Calculate current win-rates. - for policy_id, rew in hist_stats.items(): - mo = re.match("^policy_(.+)_reward$", policy_id) - if mo is None: - continue - policy_id = mo.group(1) - - # Calculate this policy's win rate. - won = 0 - for r in rew: - if r > 0.0: # win = 1.0; loss = -1.0 - won += 1 - win_rate = won / len(rew) - # TODO: This should probably be a running average - # (instead of hard-overriding it with the most recent data). 
- self.win_rates[policy_id] = win_rate - - # Policy is a snapshot (frozen) -> Ignore. - if policy_id not in trainable_policies: - continue - - logger.info(f"\t{policy_id} win-rate={win_rate} -> ") - - # If win rate is good enough -> Snapshot current policy and decide, - # whether to freeze the new snapshot or not. - if win_rate >= self.win_rate_threshold_for_new_snapshot: - is_main = re.match("^main(_\\d+)?$", policy_id) - - # Probability that the new snapshot is trainable. - keep_training_p = self.keep_new_snapshot_training_prob - # For main, new snapshots are never trainable, for all others - # use `config.keep_new_snapshot_training_prob` (default: 0.0!). - keep_training = ( - False - if is_main - else np.random.choice( - [True, False], p=[keep_training_p, 1.0 - keep_training_p] - ) - ) - # New league-exploiter policy. - if policy_id.startswith("league_ex"): - new_pol_id = re.sub( - "_\\d+$", f"_{self.league_exploiters}", policy_id - ) - self.league_exploiters += 1 - # New main-exploiter policy. - elif policy_id.startswith("main_ex"): - new_pol_id = re.sub("_\\d+$", f"_{self.main_exploiters}", policy_id) - self.main_exploiters += 1 - # New main policy snapshot. - else: - new_pol_id = re.sub("_\\d+$", f"_{self.main_policies}", policy_id) - self.main_policies += 1 - - if keep_training: - trainable_policies.add(new_pol_id) - else: - non_trainable_policies.add(new_pol_id) - - logger.info( - f"adding new opponents to the mix ({new_pol_id}; " - f"trainable={keep_training})." - ) - - num_main_policies = self.main_policies - probs_match_types = [ - self.prob_league_exploiter_match, - self.prob_main_exploiter_match, - 1.0 - - self.prob_league_exploiter_match - - self.prob_main_exploiter_match, - ] - prob_playing_learning_main = ( - self.prob_main_exploiter_playing_against_learning_main - ) - - # Update our mapping function accordingly. - def policy_mapping_fn(agent_id, episode, worker, **kwargs): - - # Pick, whether this is: - # LE: league-exploiter vs snapshot. - # ME: main-exploiter vs (any) main. - # M: Learning main vs itself. - type_ = np.random.choice(["LE", "ME", "M"], p=probs_match_types) - - # Learning league exploiter vs a snapshot. - # Opponent snapshots should be selected based on a win-rate- - # derived probability. - if type_ == "LE": - if episode.episode_id % 2 == agent_id: - league_exploiter = np.random.choice( - [ - p - for p in trainable_policies - if p.startswith("league_ex") - ] - ) - logger.debug( - f"Episode {episode.episode_id}: AgentID " - f"{agent_id} played by {league_exploiter} (training)" - ) - return league_exploiter - # Play against any non-trainable policy (excluding itself). - else: - all_opponents = list(non_trainable_policies) - probs = softmax( - [ - worker.global_vars["win_rates"][pid] - for pid in all_opponents - ] - ) - opponent = np.random.choice(all_opponents, p=probs) - logger.debug( - f"Episode {episode.episode_id}: AgentID " - f"{agent_id} played by {opponent} (frozen)" - ) - return opponent - - # Learning main exploiter vs (learning main OR snapshot main). - elif type_ == "ME": - if episode.episode_id % 2 == agent_id: - main_exploiter = np.random.choice( - [ - p - for p in trainable_policies - if p.startswith("main_ex") - ] - ) - logger.debug( - f"Episode {episode.episode_id}: AgentID " - f"{agent_id} played by {main_exploiter} (training)" - ) - return main_exploiter - else: - # n% of the time, play against the learning main. - # Also always play againt learning main if no - # non-learning mains have been created yet. 
- if num_main_policies == 1 or ( - np.random.random() < prob_playing_learning_main - ): - main = "main_0" - training = "training" - # 100-n% of the time, play against a non-learning - # main. Opponent main snapshots should be selected - # based on a win-rate-derived probability. - else: - all_opponents = [ - f"main_{p}" - for p in list(range(1, num_main_policies)) - ] - probs = softmax( - [ - worker.global_vars["win_rates"][pid] - for pid in all_opponents - ] - ) - main = np.random.choice(all_opponents, p=probs) - training = "frozen" - logger.debug( - f"Episode {episode.episode_id}: AgentID " - f"{agent_id} played by {main} ({training})" - ) - return main - - # Main policy: Self-play. - else: - logger.debug(f"Episode {episode.episode_id}: main_0 vs main_0") - return "main_0" - - # Add and set the weights of the new polic(y/ies). - state = self.algo.get_policy(policy_id).get_state() - self.algo.add_policy( - policy_id=new_pol_id, - policy_cls=type(self.algo.get_policy(policy_id)), - policy_state=state, - policy_mapping_fn=policy_mapping_fn, - policies_to_train=trainable_policies, - ) - - else: - logger.info("not good enough; will keep learning ...") - - def __getstate__(self) -> Dict[str, Any]: - return { - "win_rates": self.win_rates, - "main_policies": self.main_policies, - "league_exploiters": self.league_exploiters, - "main_exploiters": self.main_exploiters, - } - - def __setstate__(self, state) -> None: - self.win_rates = state["win_rates"] - self.main_policies = state["main_policies"] - self.league_exploiters = state["league_exploiters"] - self.main_exploiters = state["main_exploiters"] diff --git a/rllib_contrib/alpha_star/tests/test_alpha_star.py b/rllib_contrib/alpha_star/tests/test_alpha_star.py deleted file mode 100644 index c443bc9ef4bd1..0000000000000 --- a/rllib_contrib/alpha_star/tests/test_alpha_star.py +++ /dev/null @@ -1,74 +0,0 @@ -import unittest - -from rllib_alpha_star.alpha_star import AlphaStarConfig - -import ray -from ray.rllib.env.utils import try_import_open_spiel, try_import_pyspiel -from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_train_results, - framework_iterator, -) -from ray.tune import register_env - -open_spiel = try_import_open_spiel(error=True) -pyspiel = try_import_pyspiel(error=True) - -# Connect-4 OpenSpiel env. 
-register_env("connect_four", lambda _: OpenSpielEnv(pyspiel.load_game("connect_four"))) - - -class TestAlphaStar(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_alpha_star_compilation(self): - """Test whether AlphaStar can be built with all frameworks.""" - config = ( - AlphaStarConfig() - .environment(env="connect_four") - .training( - gamma=1.0, - model={"fcnet_hiddens": [256, 256, 256]}, - vf_loss_coeff=0.01, - entropy_coeff=0.004, - league_builder_config={ - "win_rate_threshold_for_new_snapshot": 0.8, - "num_random_policies": 2, - "num_learning_league_exploiters": 1, - "num_learning_main_exploiters": 1, - }, - grad_clip=10.0, - replay_buffer_capacity=10, - replay_buffer_replay_ratio=0.0, - use_kl_loss=True, - ) - .rollouts(num_rollout_workers=1, num_envs_per_worker=5) - .resources(num_gpus=2, _fake_gpus=True) - ) - - num_iterations = 2 - - for _ in framework_iterator(config, with_eager_tracing=True): - config.policies = None - algo = config.build() - for i in range(num_iterations): - results = algo.train() - print(results) - check_train_results(results) - check_compute_single_action(algo) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/alpha_star/tuned_examples/__init__.py b/rllib_contrib/alpha_star/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/alpha_star/tuned_examples/multi-agent-cartpole-alpha-star.yaml b/rllib_contrib/alpha_star/tuned_examples/multi-agent-cartpole-alpha-star.yaml deleted file mode 100644 index 0ee00dc6e18b7..0000000000000 --- a/rllib_contrib/alpha_star/tuned_examples/multi-agent-cartpole-alpha-star.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -multi-agent-cartpole-alpha-star: - env: ray.rllib.examples.env.multi_agent.MultiAgentCartPole - run: AlphaStar - stop: - sampler_results/episode_reward_mean: 600 # 600 / 4 (==num_agents) = 150 - timesteps_total: 200000 - config: - # Works for both torch and tf. - framework: torch - - # 4-agent MA cartpole. - env_config: - config: - num_agents: 4 - - num_envs_per_worker: 5 - num_workers: 4 - num_gpus: 1 - _fake_gpus: true - - observation_filter: MeanStdFilter - num_sgd_iter: 1 - vf_loss_coeff: 0.005 - vtrace: true - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - - replay_buffer_replay_ratio: 0.0 - - # No league-building needed. - league_builder_config: - type: ray.rllib.algorithms.alpha_star.league_builder.NoLeagueBuilder - - multiagent: - policies: ["p0", "p1", "p2", "p3"] - # YAML-capable policy_mapping_fn definition via providing a callable class here. 
- policy_mapping_fn: - type: ray.rllib.examples.multi_agent_and_self_play.policy_mapping_fn.PolicyMappingFn diff --git a/rllib_contrib/alpha_zero/BUILD b/rllib_contrib/alpha_zero/BUILD deleted file mode 100644 index 5b6f8915437ec..0000000000000 --- a/rllib_contrib/alpha_zero/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -# Examples - -py_test( - name = "example_alpha_zero_cartpole_sparse_rewards", - main = "alpha_zero_cartpole_sparse_rewards.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/alpha_zero_cartpole_sparse_rewards.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_cartpole_sparse_rewards_alpha_zero", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], - main = "run_regression_tests.py", - size = "medium", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/cartpole-sparse-rewards-alpha-zero.yaml"], - args = ["--dir=alpha_zero/tuned_examples/", "--num-cpus=8"] -) - -# Compilation Tests - -py_test( - name = "test_alpha_zero", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_alpha_zero.py"] -) diff --git a/rllib_contrib/alpha_zero/README.md b/rllib_contrib/alpha_zero/README.md deleted file mode 100644 index ee5e3f9675cda..0000000000000 --- a/rllib_contrib/alpha_zero/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Alpha Zero - -[Alpha Zero](https://arxiv.org/abs/1712.01815) is a general reinforcement learning approach that achieved superhuman performance in the games of chess, shogi, and Go through tabula rasa learning from games of self-play, surpassing previous state-of-the-art programs that relied on handcrafted evaluation functions and domain-specific adaptations. - -## Installation - -``` -conda create -n rllib-alpha-zero python=3.10 -conda activate rllib-alpha-zero -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[AlphaZero Example]() \ No newline at end of file diff --git a/rllib_contrib/alpha_zero/examples/alpha_zero_cartpole_sparse_rewards.py b/rllib_contrib/alpha_zero/examples/alpha_zero_cartpole_sparse_rewards.py deleted file mode 100644 index 739effeec1781..0000000000000 --- a/rllib_contrib/alpha_zero/examples/alpha_zero_cartpole_sparse_rewards.py +++ /dev/null @@ -1,73 +0,0 @@ -import argparse - -from rllib_alpha_zero.alpha_zero import AlphaZero, AlphaZeroConfig -from rllib_alpha_zero.alpha_zero.custom_torch_models import DenseModel - -import ray -from ray import air, tune -from ray.rllib.examples.env.cartpole_sparse_rewards import CartPoleSparseRewards -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - AlphaZeroConfig() - .rollouts( - num_rollout_workers=6, - rollout_fragment_length=50, - ) - .framework("torch") - .environment(CartPoleSparseRewards) - .training( - train_batch_size=500, - sgd_minibatch_size=64, - lr=1e-4, - num_sgd_iter=1, - mcts_config={ - "puct_coefficient": 1.5, - "num_simulations": 100, - "temperature": 1.0, - "dirichlet_epsilon": 0.20, - "dirichlet_noise": 0.03, - "argmax_tree_policy": False, - "add_dirichlet_noise": True, - }, - ranked_rewards={ - "enable": True, - }, - model={ - "custom_model": DenseModel, 
- }, - ) - ) - - stop_reward = 30.0 - - tuner = tune.Tuner( - AlphaZero, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 100000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/alpha_zero/pyproject.toml b/rllib_contrib/alpha_zero/pyproject.toml deleted file mode 100644 index c9623f61b3ff0..0000000000000 --- a/rllib_contrib/alpha_zero/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-alpha-zero" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium==0.26.3", "ray[rllib]==2.5.1"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/alpha_zero/requirements.txt b/rllib_contrib/alpha_zero/requirements.txt deleted file mode 100644 index b07006a1b4ec6..0000000000000 --- a/rllib_contrib/alpha_zero/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/__init__.py b/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/__init__.py deleted file mode 100644 index 2a4d9cd92e08b..0000000000000 --- a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from rllib_alpha_zero.alpha_zero.alpha_zero import ( - AlphaZero, - AlphaZeroConfig, - AlphaZeroDefaultCallbacks, -) -from rllib_alpha_zero.alpha_zero.alpha_zero_policy import AlphaZeroPolicy - -from ray.tune.registry import register_trainable - -__all__ = [ - "AlphaZeroConfig", - "AlphaZero", - "AlphaZeroDefaultCallbacks", - "AlphaZeroPolicy", -] - -register_trainable("rllib-contrib-alpha-zero", AlphaZero) diff --git a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/alpha_zero.py b/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/alpha_zero.py deleted file mode 100644 index 46835247e3433..0000000000000 --- a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/alpha_zero.py +++ /dev/null @@ -1,406 +0,0 @@ -import logging -from typing import List, Optional, Type, Union - -from rllib_alpha_zero.alpha_zero.alpha_zero_policy import AlphaZeroPolicy -from rllib_alpha_zero.alpha_zero.mcts import MCTS -from rllib_alpha_zero.alpha_zero.ranked_rewards import get_r2_env_wrapper - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample -from ray.rllib.execution.train_ops import multi_gpu_train_one_step, train_one_step -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import restore_original_dimensions -from ray.rllib.models.torch.torch_action_dist import TorchCategorical -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import concat_samples -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE -from ray.rllib.utils.framework import 
try_import_torch -from ray.rllib.utils.metrics import ( - NUM_AGENT_STEPS_SAMPLED, - NUM_ENV_STEPS_SAMPLED, - SAMPLE_TIMER, - SYNCH_WORKER_WEIGHTS_TIMER, -) -from ray.rllib.utils.replay_buffers.utils import validate_buffer_config -from ray.rllib.utils.typing import ResultDict - -torch, nn = try_import_torch() - -logger = logging.getLogger(__name__) - - -class AlphaZeroDefaultCallbacks(DefaultCallbacks): - """AlphaZero callbacks. - - If you use custom callbacks, you must extend this class and call super() - for on_episode_start. - """ - - def on_episode_start(self, worker, base_env, policies, episode, **kwargs): - # Save environment's state when an episode starts. - env = base_env.get_sub_environments()[0] - state = env.get_state() - episode.user_data["initial_state"] = state - - -class AlphaZeroConfig(AlgorithmConfig): - """Defines a configuration class from which an AlphaZero Algorithm can be built. - - Example: - >>> from rllib_alpha_zero.alpha_zero import AlphaZeroConfig - >>> config = AlphaZeroConfig() # doctest: +SKIP - >>> config = config.training(sgd_minibatch_size=256) # doctest: +SKIP - >>> config = config..resources(num_gpus=0) # doctest: +SKIP - >>> config = config..rollouts(num_rollout_workers=4) # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from rllib_alpha_zero.alpha_zero import AlphaZeroConfig - >>> from ray import air - >>> from ray import tune - >>> config = AlphaZeroConfig() - >>> # Print out some default values. - >>> print(config.shuffle_sequences) # doctest: +SKIP - >>> # Update the config object. - >>> config.training(lr=tune.grid_search([0.001, 0.0001])) # doctest: +SKIP - >>> # Set the config object's env. - >>> config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "AlphaZero", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a PPOConfig instance.""" - super().__init__(algo_class=algo_class or AlphaZero) - - # fmt: off - # __sphinx_doc_begin__ - # AlphaZero specific config settings: - self.sgd_minibatch_size = 128 - self.shuffle_sequences = True - self.num_sgd_iter = 30 - self.replay_buffer_config = { - "type": "ReplayBuffer", - # Size of the replay buffer in batches (not timesteps!). - "capacity": 1000, - # Choosing `fragments` here makes it so that the buffer stores entire - # batches, instead of sequences, episodes or timesteps. - "storage_unit": "fragments", - } - # Number of timesteps to collect from rollout workers before we start - # sampling from replay buffers for learning. Whether we count this in agent - # steps or environment steps depends on config.multi_agent(count_steps_by=..). 
- self.num_steps_sampled_before_learning_starts = 1000 - self.lr_schedule = None - self.vf_share_layers = False - self.mcts_config = { - "puct_coefficient": 1.0, - "num_simulations": 30, - "temperature": 1.5, - "dirichlet_epsilon": 0.25, - "dirichlet_noise": 0.03, - "argmax_tree_policy": False, - "add_dirichlet_noise": True, - } - self.ranked_rewards = { - "enable": True, - "percentile": 75, - "buffer_max_length": 1000, - # add rewards obtained from random policy to - # "warm start" the buffer - "initialize_buffer": True, - "num_init_rewards": 100, - } - - # Override some of AlgorithmConfig's default values with AlphaZero-specific - # values. - self.framework_str = "torch" - self.callbacks_class = AlphaZeroDefaultCallbacks - self.lr = 5e-5 - self.num_rollout_workers = 2 - self.rollout_fragment_length = 200 - self.train_batch_size = 4000 - self.batch_mode = "complete_episodes" - # Extra configuration that disables exploration. - self.evaluation(evaluation_config={ - "mcts_config": { - "argmax_tree_policy": True, - "add_dirichlet_noise": False, - }, - }) - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). - } - # __sphinx_doc_end__ - # fmt: on - - self.buffer_size = DEPRECATED_VALUE - - @override(AlgorithmConfig) - def training( - self, - *, - sgd_minibatch_size: Optional[int] = NotProvided, - shuffle_sequences: Optional[bool] = NotProvided, - num_sgd_iter: Optional[int] = NotProvided, - replay_buffer_config: Optional[dict] = NotProvided, - lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, - vf_share_layers: Optional[bool] = NotProvided, - mcts_config: Optional[dict] = NotProvided, - ranked_rewards: Optional[dict] = NotProvided, - num_steps_sampled_before_learning_starts: Optional[int] = NotProvided, - **kwargs, - ) -> "AlphaZeroConfig": - """Sets the training related configuration. - - Args: - sgd_minibatch_size: Total SGD batch size across all devices for SGD. - shuffle_sequences: Whether to shuffle sequences in the batch when training - (recommended). - num_sgd_iter: Number of SGD iterations in each outer loop. - replay_buffer_config: Replay buffer config. - Examples: - { - "_enable_replay_buffer_api": True, - "type": "MultiAgentReplayBuffer", - "learning_starts": 1000, - "capacity": 50000, - "replay_sequence_length": 1, - } - - OR - - { - "_enable_replay_buffer_api": True, - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 50000, - "prioritized_replay_alpha": 0.6, - "prioritized_replay_beta": 0.4, - "prioritized_replay_eps": 1e-6, - "replay_sequence_length": 1, - } - - Where - - prioritized_replay_alpha: Alpha parameter controls the degree of - prioritization in the buffer. In other words, when a buffer sample has - a higher temporal-difference error, with how much more probability - should it drawn to use to update the parametrized Q-network. 0.0 - corresponds to uniform probability. Setting much above 1.0 may quickly - result as the sampling distribution could become heavily “pointy” with - low entropy. 
- prioritized_replay_beta: Beta parameter controls the degree of - importance sampling which suppresses the influence of gradient updates - from samples that have higher probability of being sampled via alpha - parameter and the temporal-difference error. - prioritized_replay_eps: Epsilon parameter sets the baseline probability - for sampling so that when the temporal-difference error of a sample is - zero, there is still a chance of drawing the sample. - lr_schedule: Learning rate schedule. In the format of - [[timestep, lr-value], [timestep, lr-value], ...] - Intermediary timesteps will be assigned to interpolated learning rate - values. A schedule should normally start from timestep 0. - vf_share_layers: Share layers for value function. If you set this to True, - it's important to tune vf_loss_coeff. - mcts_config: MCTS specific settings. - ranked_rewards: Settings for the ranked reward (r2) algorithm - from: https://arxiv.org/pdf/1807.01672.pdf - num_steps_sampled_before_learning_starts: Number of timesteps to collect - from rollout workers before we start sampling from replay buffers for - learning. Whether we count this in agent steps or environment steps - depends on config.multi_agent(count_steps_by=..). - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if sgd_minibatch_size is not NotProvided: - self.sgd_minibatch_size = sgd_minibatch_size - if shuffle_sequences is not NotProvided: - self.shuffle_sequences = shuffle_sequences - if num_sgd_iter is not NotProvided: - self.num_sgd_iter = num_sgd_iter - if replay_buffer_config is not NotProvided: - self.replay_buffer_config = replay_buffer_config - if lr_schedule is not NotProvided: - self.lr_schedule = lr_schedule - if vf_share_layers is not NotProvided: - self.vf_share_layers = vf_share_layers - if mcts_config is not NotProvided: - self.mcts_config = mcts_config - if ranked_rewards is not NotProvided: - self.ranked_rewards.update(ranked_rewards) - if num_steps_sampled_before_learning_starts is not NotProvided: - self.num_steps_sampled_before_learning_starts = ( - num_steps_sampled_before_learning_starts - ) - - return self - - @override(AlgorithmConfig) - def update_from_dict(self, config_dict) -> "AlphaZeroConfig": - config_dict = config_dict.copy() - - if "ranked_rewards" in config_dict: - value = config_dict.pop("ranked_rewards") - self.training(ranked_rewards=value) - - return super().update_from_dict(config_dict) - - @override(AlgorithmConfig) - def validate(self) -> None: - """Checks and updates the config based on settings.""" - # Call super's validation method. 
- super().validate() - validate_buffer_config(self) - - -def alpha_zero_loss(policy, model, dist_class, train_batch): - # get inputs unflattened inputs - input_dict = restore_original_dimensions( - train_batch["obs"], policy.observation_space, "torch" - ) - # forward pass in model - model_out = model.forward(input_dict, None, [1]) - logits, _ = model_out - values = model.value_function() - logits, values = torch.squeeze(logits), torch.squeeze(values) - priors = nn.Softmax(dim=-1)(logits) - # compute actor and critic losses - policy_loss = torch.mean( - -torch.sum(train_batch["mcts_policies"] * torch.log(priors), dim=-1) - ) - value_loss = torch.mean(torch.pow(values - train_batch["value_label"], 2)) - # compute total loss - total_loss = (policy_loss + value_loss) / 2 - return total_loss, policy_loss, value_loss - - -class AlphaZeroPolicyWrapperClass(AlphaZeroPolicy): - def __init__(self, obs_space, action_space, config): - model = ModelCatalog.get_model_v2( - obs_space, action_space, action_space.n, config["model"], "torch" - ) - _, env_creator = Algorithm._get_env_id_and_creator(config["env"], config) - if config["ranked_rewards"]["enable"]: - # if r2 is enabled, tne env is wrapped to include a rewards buffer - # used to normalize rewards - env_cls = get_r2_env_wrapper(env_creator, config["ranked_rewards"]) - - # the wrapped env is used only in the mcts, not in the - # rollout workers - def _env_creator(): - return env_cls(config["env_config"]) - - else: - - def _env_creator(): - return env_creator(config["env_config"]) - - def mcts_creator(): - return MCTS(model, config["mcts_config"]) - - super().__init__( - obs_space, - action_space, - config, - model, - alpha_zero_loss, - TorchCategorical, - mcts_creator, - _env_creator, - ) - - -class AlphaZero(Algorithm): - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return AlphaZeroConfig() - - @classmethod - @override(Algorithm) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - return AlphaZeroPolicyWrapperClass - - @override(Algorithm) - def training_step(self) -> ResultDict: - """TODO: - - Returns: - The results dict from executing the training iteration. - """ - - # Sample n MultiAgentBatches from n workers. - with self._timers[SAMPLE_TIMER]: - new_sample_batches = synchronous_parallel_sample( - worker_set=self.workers, concat=False - ) - - for batch in new_sample_batches: - # Update sampling step counters. - self._counters[NUM_ENV_STEPS_SAMPLED] += batch.env_steps() - self._counters[NUM_AGENT_STEPS_SAMPLED] += batch.agent_steps() - # Store new samples in the replay buffer - if self.local_replay_buffer is not None: - self.local_replay_buffer.add(batch) - - if self.local_replay_buffer is not None: - # Update target network every `target_network_update_freq` sample steps. - cur_ts = self._counters[ - NUM_AGENT_STEPS_SAMPLED - if self.config.count_steps_by == "agent_steps" - else NUM_ENV_STEPS_SAMPLED - ] - - if cur_ts > self.config.num_steps_sampled_before_learning_starts: - train_batch = self.local_replay_buffer.sample( - self.config.train_batch_size - ) - else: - train_batch = None - else: - train_batch = concat_samples(new_sample_batches) - - # Learn on the training batch. 
- # Use simple optimizer (only for multi-agent or tf-eager; all other - # cases should use the multi-GPU optimizer, even if only using 1 GPU) - train_results = {} - if train_batch is not None: - if self.config.get("simple_optimizer") is True: - train_results = train_one_step(self, train_batch) - else: - train_results = multi_gpu_train_one_step(self, train_batch) - - # TODO: Move training steps counter update outside of `train_one_step()` method. - # # Update train step counters. - # self._counters[NUM_ENV_STEPS_TRAINED] += train_batch.env_steps() - # self._counters[NUM_AGENT_STEPS_TRAINED] += train_batch.agent_steps() - - # Update weights and global_vars - after learning on the local worker - on all - # remote workers. - global_vars = { - "timestep": self._counters[NUM_ENV_STEPS_SAMPLED], - } - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - self.workers.sync_weights(global_vars=global_vars) - - # Return all collected metrics for the iteration. - return train_results diff --git a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/alpha_zero_policy.py b/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/alpha_zero_policy.py deleted file mode 100644 index a0f4c9cd63b7c..0000000000000 --- a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/alpha_zero_policy.py +++ /dev/null @@ -1,158 +0,0 @@ -import numpy as np -from rllib_alpha_zero.alpha_zero.mcts import Node, RootParentNode - -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.torch_policy import TorchPolicy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY - -torch, _ = try_import_torch() - - -class AlphaZeroPolicy(TorchPolicy): - def __init__( - self, - observation_space, - action_space, - config, - model, - loss, - action_distribution_class, - mcts_creator, - env_creator, - **kwargs - ): - super().__init__( - observation_space, - action_space, - config, - model=model, - loss=loss, - action_distribution_class=action_distribution_class, - ) - # we maintain an env copy in the policy that is used during mcts - # simulations - self.env_creator = env_creator - self.mcts = mcts_creator() - self.env = self.env_creator() - self.env.reset() - self.obs_space = observation_space - - @override(TorchPolicy) - def compute_actions( - self, - obs_batch, - state_batches=None, - prev_action_batch=None, - prev_reward_batch=None, - info_batch=None, - episodes=None, - **kwargs - ): - - input_dict = {"obs": obs_batch} - if prev_action_batch is not None: - input_dict["prev_actions"] = prev_action_batch - if prev_reward_batch is not None: - input_dict["prev_rewards"] = prev_reward_batch - - return self.compute_actions_from_input_dict( - input_dict=input_dict, - episodes=episodes, - state_batches=state_batches, - ) - - @override(Policy) - def compute_actions_from_input_dict( - self, input_dict, explore=None, timestep=None, episodes=None, **kwargs - ): - with torch.no_grad(): - actions = [] - for i, episode in enumerate(episodes): - if episode.length == 0: - # if first time step of episode, get initial env state - env_state = episode.user_data["initial_state"] - # verify if env has been wrapped for ranked rewards - if self.env.__class__.__name__ == "RankedRewardsEnvWrapper": - # r2 env state contains also the rewards buffer state - env_state = {"env_state": env_state, "buffer_state": None} - # create tree root node - obs = self.env.set_state(env_state) - tree_node = Node( - state=env_state, - obs=obs, - 
reward=0, - done=False, - action=None, - parent=RootParentNode(env=self.env), - mcts=self.mcts, - ) - else: - # otherwise get last root node from previous time step - tree_node = episode.user_data["tree_node"] - - # run monte carlo simulations to compute the actions - # and record the tree - mcts_policy, action, tree_node = self.mcts.compute_action(tree_node) - # record action - actions.append(action) - # store new node - episode.user_data["tree_node"] = tree_node - - # store mcts policies vectors and current tree root node - if episode.length == 0: - episode.user_data["mcts_policies"] = [mcts_policy] - else: - episode.user_data["mcts_policies"].append(mcts_policy) - - return ( - np.array(actions), - [], - self.extra_action_out( - input_dict, kwargs.get("state_batches", []), self.model, None - ), - ) - - @override(Policy) - def postprocess_trajectory( - self, sample_batch, other_agent_batches=None, episode=None - ): - # add mcts policies to sample batch - sample_batch["mcts_policies"] = np.array(episode.user_data["mcts_policies"])[ - sample_batch["t"] - ] - # final episode reward corresponds to the value (if not discounted) - # for all transitions in episode - final_reward = sample_batch["rewards"][-1] - # if r2 is enabled, then add the reward to the buffer and normalize it - if self.env.__class__.__name__ == "RankedRewardsEnvWrapper": - self.env.r2_buffer.add_reward(final_reward) - final_reward = self.env.r2_buffer.normalize(final_reward) - sample_batch["value_label"] = final_reward * np.ones_like(sample_batch["t"]) - return sample_batch - - @override(TorchPolicy) - def learn_on_batch(self, postprocessed_batch): - train_batch = self._lazy_tensor_dict(postprocessed_batch) - - loss_out, policy_loss, value_loss = self._loss( - self, self.model, self.dist_class, train_batch - ) - self._optimizers[0].zero_grad() - loss_out.backward() - - grad_process_info = self.extra_grad_process(self._optimizers[0], loss_out) - self._optimizers[0].step() - - grad_info = self.extra_grad_info(train_batch) - grad_info.update(grad_process_info) - grad_info.update( - { - "total_loss": loss_out.detach().cpu().numpy(), - "policy_loss": policy_loss.detach().cpu().numpy(), - "value_loss": value_loss.detach().cpu().numpy(), - } - ) - - return {LEARNER_STATS_KEY: grad_info} diff --git a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/custom_torch_models.py b/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/custom_torch_models.py deleted file mode 100644 index 9fc7d1037b69c..0000000000000 --- a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/custom_torch_models.py +++ /dev/null @@ -1,116 +0,0 @@ -from abc import ABC - -import numpy as np - -from ray.rllib.models.modelv2 import restore_original_dimensions -from ray.rllib.models.preprocessors import get_preprocessor -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.utils.framework import try_import_torch - -torch, nn = try_import_torch() - - -def convert_to_tensor(arr): - tensor = torch.from_numpy(np.asarray(arr)) - if tensor.dtype == torch.double: - tensor = tensor.float() - return tensor - - -class ActorCriticModel(TorchModelV2, nn.Module, ABC): - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - TorchModelV2.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - nn.Module.__init__(self) - - self.preprocessor = get_preprocessor(obs_space.original_space)( - obs_space.original_space - ) - - self.shared_layers = None - self.actor_layers = None - self.critic_layers 
= None - - self._value_out = None - - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"] - x = self.shared_layers(x) - # actor outputs - logits = self.actor_layers(x) - - # compute value - self._value_out = self.critic_layers(x) - return logits, None - - def value_function(self): - return self._value_out - - def compute_priors_and_value(self, obs): - obs = convert_to_tensor([self.preprocessor.transform(obs)]) - input_dict = restore_original_dimensions(obs, self.obs_space, "torch") - - with torch.no_grad(): - model_out = self.forward(input_dict, None, [1]) - logits, _ = model_out - value = self.value_function() - logits, value = torch.squeeze(logits), torch.squeeze(value) - priors = nn.Softmax(dim=-1)(logits) - - priors = priors.cpu().numpy() - value = value.cpu().numpy() - - return priors, value - - -class Flatten(nn.Module): - def forward(self, input): - return input.view(input.size(0), -1) - - -class ConvNetModel(ActorCriticModel): - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - ActorCriticModel.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - - in_channels = model_config["custom_model_config"]["in_channels"] - feature_dim = model_config["custom_model_config"]["feature_dim"] - - self.shared_layers = nn.Sequential( - nn.Conv2d(in_channels, 32, kernel_size=4, stride=2), - nn.Conv2d(32, 64, kernel_size=2, stride=1), - nn.Conv2d(64, 64, kernel_size=2, stride=1), - Flatten(), - nn.Linear(1024, feature_dim), - ) - - self.actor_layers = nn.Sequential( - nn.Linear(in_features=feature_dim, out_features=action_space.n) - ) - - self.critic_layers = nn.Sequential( - nn.Linear(in_features=feature_dim, out_features=1) - ) - - self._value_out = None - - -class DenseModel(ActorCriticModel): - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - ActorCriticModel.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - - self.shared_layers = nn.Sequential( - nn.Linear( - in_features=obs_space.original_space["obs"].shape[0], out_features=256 - ), - nn.Linear(in_features=256, out_features=256), - ) - self.actor_layers = nn.Sequential( - nn.Linear(in_features=256, out_features=action_space.n) - ) - self.critic_layers = nn.Sequential(nn.Linear(in_features=256, out_features=1)) - self._value_out = None diff --git a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/mcts.py b/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/mcts.py deleted file mode 100644 index 72f9712bbf3a4..0000000000000 --- a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/mcts.py +++ /dev/null @@ -1,157 +0,0 @@ -""" -Mcts implementation modified from -https://github.com/brilee/python_uct/blob/master/numpy_impl.py -""" -import collections -import math - -import numpy as np - - -class Node: - def __init__(self, action, obs, done, reward, state, mcts, parent=None): - self.env = parent.env - self.action = action # Action used to go to this state - - self.is_expanded = False - self.parent = parent - self.children = {} - - self.action_space_size = self.env.action_space.n - self.child_total_value = np.zeros( - [self.action_space_size], dtype=np.float32 - ) # Q - self.child_priors = np.zeros([self.action_space_size], dtype=np.float32) # P - self.child_number_visits = np.zeros( - [self.action_space_size], dtype=np.float32 - ) # N - self.valid_actions = obs["action_mask"].astype(np.bool_) - - self.reward = reward - self.done = done - self.state = state - self.obs = obs - - self.mcts = mcts - 
- @property - def number_visits(self): - return self.parent.child_number_visits[self.action] - - @number_visits.setter - def number_visits(self, value): - self.parent.child_number_visits[self.action] = value - - @property - def total_value(self): - return self.parent.child_total_value[self.action] - - @total_value.setter - def total_value(self, value): - self.parent.child_total_value[self.action] = value - - def child_Q(self): - # TODO (weak todo) add "softmax" version of the Q-value - return self.child_total_value / (1 + self.child_number_visits) - - def child_U(self): - return ( - math.sqrt(self.number_visits) - * self.child_priors - / (1 + self.child_number_visits) - ) - - def best_action(self): - """ - :return: action - """ - child_score = self.child_Q() + self.mcts.c_puct * self.child_U() - masked_child_score = child_score - masked_child_score[~self.valid_actions] = -np.inf - return np.argmax(masked_child_score) - - def select(self): - current_node = self - while current_node.is_expanded: - best_action = current_node.best_action() - current_node = current_node.get_child(best_action) - return current_node - - def expand(self, child_priors): - self.is_expanded = True - self.child_priors = child_priors - - def get_child(self, action): - if action not in self.children: - self.env.set_state(self.state) - obs, reward, terminated, truncated, _ = self.env.step(action) - next_state = self.env.get_state() - self.children[action] = Node( - state=next_state, - action=action, - parent=self, - reward=reward, - done=terminated, - obs=obs, - mcts=self.mcts, - ) - return self.children[action] - - def backup(self, value): - current = self - while current.parent is not None: - current.number_visits += 1 - current.total_value += value - current = current.parent - - -class RootParentNode: - def __init__(self, env): - self.parent = None - self.child_total_value = collections.defaultdict(float) - self.child_number_visits = collections.defaultdict(float) - self.env = env - - -class MCTS: - def __init__(self, model, mcts_param): - self.model = model - self.temperature = mcts_param["temperature"] - self.dir_epsilon = mcts_param["dirichlet_epsilon"] - self.dir_noise = mcts_param["dirichlet_noise"] - self.num_sims = mcts_param["num_simulations"] - self.exploit = mcts_param["argmax_tree_policy"] - self.add_dirichlet_noise = mcts_param["add_dirichlet_noise"] - self.c_puct = mcts_param["puct_coefficient"] - - def compute_action(self, node): - for _ in range(self.num_sims): - leaf = node.select() - if leaf.done: - value = leaf.reward - else: - child_priors, value = self.model.compute_priors_and_value(leaf.obs) - if self.add_dirichlet_noise: - child_priors = (1 - self.dir_epsilon) * child_priors - child_priors += self.dir_epsilon * np.random.dirichlet( - [self.dir_noise] * child_priors.size - ) - - leaf.expand(child_priors) - leaf.backup(value) - - # Tree policy target (TPT) - tree_policy = node.child_number_visits / node.number_visits - tree_policy = tree_policy / np.max( - tree_policy - ) # to avoid overflows when computing softmax - tree_policy = np.power(tree_policy, self.temperature) - tree_policy = tree_policy / np.sum(tree_policy) - if self.exploit: - # if exploit then choose action that has the maximum - # tree policy probability - action = np.argmax(tree_policy) - else: - # otherwise sample an action according to tree policy probabilities - action = np.random.choice(np.arange(node.action_space_size), p=tree_policy) - return tree_policy, action, node.children[action] diff --git 
a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/ranked_rewards.py b/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/ranked_rewards.py deleted file mode 100644 index 198571a06d76e..0000000000000 --- a/rllib_contrib/alpha_zero/src/rllib_alpha_zero/alpha_zero/ranked_rewards.py +++ /dev/null @@ -1,78 +0,0 @@ -from copy import deepcopy - -import numpy as np - - -class RankedRewardsBuffer: - def __init__(self, buffer_max_length, percentile): - self.buffer_max_length = buffer_max_length - self.percentile = percentile - self.buffer = [] - - def add_reward(self, reward): - if len(self.buffer) < self.buffer_max_length: - self.buffer.append(reward) - else: - self.buffer = self.buffer[1:] + [reward] - - def normalize(self, reward): - reward_threshold = np.percentile(self.buffer, self.percentile) - if reward < reward_threshold: - return -1.0 - else: - return 1.0 - - def get_state(self): - return np.array(self.buffer) - - def set_state(self, state): - if state is not None: - self.buffer = list(state) - - -def get_r2_env_wrapper(env_creator, r2_config): - class RankedRewardsEnvWrapper: - def __init__(self, env_config): - self.env = env_creator(env_config) - self.action_space = self.env.action_space - self.observation_space = self.env.observation_space - max_buffer_length = r2_config["buffer_max_length"] - percentile = r2_config["percentile"] - self.r2_buffer = RankedRewardsBuffer(max_buffer_length, percentile) - if r2_config["initialize_buffer"]: - self._initialize_buffer(r2_config["num_init_rewards"]) - - def _initialize_buffer(self, num_init_rewards=100): - # initialize buffer with random policy - for _ in range(num_init_rewards): - obs, info = self.env.reset() - terminated = truncated = False - while not terminated and not truncated: - mask = obs["action_mask"] - probs = mask / mask.sum() - action = np.random.choice(np.arange(mask.shape[0]), p=probs) - obs, reward, terminated, truncated, _ = self.env.step(action) - self.r2_buffer.add_reward(reward) - - def step(self, action): - obs, reward, terminated, truncated, info = self.env.step(action) - if terminated or truncated: - reward = self.r2_buffer.normalize(reward) - return obs, reward, terminated, truncated, info - - def get_state(self): - state = { - "env_state": self.env.get_state(), - "buffer_state": self.r2_buffer.get_state(), - } - return deepcopy(state) - - def reset(self, *, seed=None, options=None): - return self.env.reset() - - def set_state(self, state): - obs = self.env.set_state(state["env_state"]) - self.r2_buffer.set_state(state["buffer_state"]) - return obs - - return RankedRewardsEnvWrapper diff --git a/rllib_contrib/alpha_zero/tests/test_alpha_zero.py b/rllib_contrib/alpha_zero/tests/test_alpha_zero.py deleted file mode 100644 index 4579b44dd028f..0000000000000 --- a/rllib_contrib/alpha_zero/tests/test_alpha_zero.py +++ /dev/null @@ -1,44 +0,0 @@ -import unittest - -import rllib_alpha_zero.alpha_zero as az -from rllib_alpha_zero.alpha_zero.custom_torch_models import DenseModel - -import ray -from ray.rllib.examples.env.cartpole_sparse_rewards import CartPoleSparseRewards -from ray.rllib.utils.test_utils import check_train_results, framework_iterator - - -class TestAlphaZero(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_alpha_zero_compilation(self): - """Test whether AlphaZero can be built with all frameworks.""" - config = ( - az.AlphaZeroConfig() - .environment(env=CartPoleSparseRewards) - 
.training(model={"custom_model": DenseModel}) - ) - num_iterations = 1 - - # Only working for torch right now. - for _ in framework_iterator(config, frameworks="torch"): - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/alpha_zero/tuned_examples/__init__.py b/rllib_contrib/alpha_zero/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/alpha_zero/tuned_examples/cartpole-sparse-rewards-alpha-zero.yaml b/rllib_contrib/alpha_zero/tuned_examples/cartpole-sparse-rewards-alpha-zero.yaml deleted file mode 100644 index 5d0f99244b990..0000000000000 --- a/rllib_contrib/alpha_zero/tuned_examples/cartpole-sparse-rewards-alpha-zero.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-sparse-rewards-alpha-zero: - env: ray.rllib.examples.env.cartpole_sparse_rewards.CartPoleSparseRewards - run: AlphaZero - stop: - sampler_results/episode_reward_mean: 30.0 - timesteps_total: 100000 - config: - # Only supported for torch right now. - framework: torch - num_workers: 6 - rollout_fragment_length: 50 - train_batch_size: 500 - sgd_minibatch_size: 64 - lr: 0.0001 - num_sgd_iter: 1 - mcts_config: - puct_coefficient: 1.5 - num_simulations: 100 - temperature: 1.0 - dirichlet_epsilon: 0.20 - dirichlet_noise: 0.03 - argmax_tree_policy: false - add_dirichlet_noise: true - ranked_rewards: - enable: true - model: - custom_model: ray.rllib.algorithms.alpha_zero.models.custom_torch_models.DenseModel diff --git a/rllib_contrib/apex_ddpg/BUILD b/rllib_contrib/apex_ddpg/BUILD deleted file mode 100644 index c3bc3c8f04028..0000000000000 --- a/rllib_contrib/apex_ddpg/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -# Examples - -py_test( - name = "example_apex_ddpg_pendulum_v1", - main = "apex_ddpg_pendulum_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/apex_ddpg_pendulum_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - - -# Compilation Tests - -py_test( - name = "test_apex_ddpg", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_apex_ddpg.py"] -) diff --git a/rllib_contrib/apex_ddpg/README.md b/rllib_contrib/apex_ddpg/README.md deleted file mode 100644 index b36c45f85c316..0000000000000 --- a/rllib_contrib/apex_ddpg/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# APEX DDPG (Distributed Prioritized Experience Replay) - -[APEX DDPG](https://arxiv.org/pdf/1803.00933.pdf) Distributed Prioritized Experience Replay is an algorithm that decouples -active learning from sampling. Actors interact with their own instances of the environment by selecting actions according -to a shared neural network, and accumulate the resulting experience in a shared experience replay memory; the learner replays samples of experience and updates the neural network. The architecture relies on prioritized experience replay to -focus only on the most significant data generated by the actors. 
- -## Installation - -``` -conda create -n rllib-apex-ddpg python=3.10 -conda activate rllib-apex-ddpg -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[APEX-DDPG Example]() \ No newline at end of file diff --git a/rllib_contrib/apex_ddpg/examples/apex_ddpg_pendulum_v1.py b/rllib_contrib/apex_ddpg/examples/apex_ddpg_pendulum_v1.py deleted file mode 100644 index 5c49a7f477756..0000000000000 --- a/rllib_contrib/apex_ddpg/examples/apex_ddpg_pendulum_v1.py +++ /dev/null @@ -1,49 +0,0 @@ -import argparse - -from rllib_apex_ddpg.apex_ddpg import ApexDDPG, ApexDDPGConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - ApexDDPGConfig() - .rollouts(num_rollout_workers=3) - .framework("torch") - .environment("Pendulum-v1", clip_rewards=False) - .training(n_step=1, target_network_update_freq=50000, tau=1.0, use_huber=True) - .evaluation(evaluation_interval=5, evaluation_duration=10) - ) - - stop_reward = -320 - - tuner = tune.Tuner( - ApexDDPG, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 1500000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/apex_ddpg/pyproject.toml b/rllib_contrib/apex_ddpg/pyproject.toml deleted file mode 100644 index 35f91d20e585d..0000000000000 --- a/rllib_contrib/apex_ddpg/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-apex-ddpg" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium[atari]", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/apex_ddpg/requirements.txt b/rllib_contrib/apex_ddpg/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/apex_ddpg/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/apex_ddpg/src/rllib_apex_ddpg/apex_ddpg/__init__.py b/rllib_contrib/apex_ddpg/src/rllib_apex_ddpg/apex_ddpg/__init__.py deleted file mode 100644 index e5149ae594c31..0000000000000 --- a/rllib_contrib/apex_ddpg/src/rllib_apex_ddpg/apex_ddpg/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from rllib_apex_ddpg.apex_ddpg.apex_ddpg import ApexDDPG, ApexDDPGConfig - -from ray.tune.registry import register_trainable - -__all__ = ["ApexDDPGConfig", "ApexDDPG"] - -register_trainable("rllib-contrib-apex-ddpg", ApexDDPG) diff --git a/rllib_contrib/apex_ddpg/src/rllib_apex_ddpg/apex_ddpg/apex_ddpg.py b/rllib_contrib/apex_ddpg/src/rllib_apex_ddpg/apex_ddpg/apex_ddpg.py deleted file mode 100644 index 06f4c82cf60e0..0000000000000 --- 
a/rllib_contrib/apex_ddpg/src/rllib_apex_ddpg/apex_ddpg/apex_ddpg.py +++ /dev/null @@ -1,147 +0,0 @@ -from typing import Optional - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.apex_dqn.apex_dqn import ApexDQN -from ray.rllib.algorithms.ddpg.ddpg import DDPG, DDPGConfig -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE -from ray.rllib.utils.typing import ResultDict - - -class ApexDDPGConfig(DDPGConfig): - """Defines a configuration class from which an ApexDDPG Trainer can be built. - - Example: - >>> from ray.rllib.algorithms.apex_ddpg.apex_ddpg import ApexDDPGConfig - >>> config = ApexDDPGConfig().training(lr=0.01).resources(num_gpus=1) - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Trainer object from the config and run one training iteration. - >>> algo = config.build(env="Pendulum-v1") - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.apex_ddpg.apex_ddpg import ApexDDPGConfig - >>> from ray import tune - >>> import ray.air as air - >>> config = ApexDDPGConfig() - >>> # Print out some default values. - >>> print(config.lr) # doctest: +SKIP - 0.0004 - >>> # Update the config object. - >>> config.training(lr=tune.grid_search([0.001, 0.0001])) - >>> # Set the config object's env. - >>> config.environment(env="Pendulum-v1") - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "APEX_DDPG", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes an ApexDDPGConfig instance.""" - super().__init__(algo_class=algo_class or ApexDDPG) - - # fmt: off - # __sphinx_doc_begin__ - # ApexDDPG-specific settings. - self.optimizer = { - "max_weight_sync_delay": 400, - "num_replay_buffer_shards": 4, - "debug": False, - } - # Overwrite the default max_requests_in_flight_per_replay_worker. - self.max_requests_in_flight_per_replay_worker = float("inf") - self.timeout_s_sampler_manager = 0.0 - self.timeout_s_replay_manager = 0.0 - - # Override some of Trainer/DDPG's default values with ApexDDPG-specific values. - self.n_step = 3 - self.exploration_config = {"type": "PerWorkerOrnsteinUhlenbeckNoise"} - self.num_gpus = 0 - self.num_rollout_workers = 32 - self.min_sample_timesteps_per_iteration = 25000 - self.min_time_s_per_iteration = 30 - self.train_batch_size = 512 - self.rollout_fragment_length = 50 - self.replay_buffer_config = { - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 2000000, - "no_local_replay_buffer": True, - # Alpha parameter for prioritized replay buffer. - "prioritized_replay_alpha": 0.6, - # Beta parameter for sampling from prioritized replay buffer. - "prioritized_replay_beta": 0.4, - # Epsilon to add to the TD errors when updating priorities. - "prioritized_replay_eps": 1e-6, - # Whether all shards of the replay buffer must be co-located - # with the learner process (running the execution plan). - # This is preferred b/c the learner process should have quick - # access to the data from the buffer shards, avoiding network - # traffic each time samples from the buffer(s) are drawn. - # Set this to False for relaxing this constraint and allowing - # replay shards to be created on node(s) other than the one - # on which the learner is located. 
- "replay_buffer_shards_colocated_with_driver": True, - # Whether to compute priorities on workers. - "worker_side_prioritization": True, - # Specify prioritized replay by supplying a buffer type that supports - # prioritization, for example: MultiAgentPrioritizedReplayBuffer. - "prioritized_replay": DEPRECATED_VALUE, - } - # Number of timesteps to collect from rollout workers before we start - # sampling from replay buffers for learning. Whether we count this in agent - # steps or environment steps depends on config.multi_agent(count_steps_by=..). - self.num_steps_sampled_before_learning_starts = 50000 - self.target_network_update_freq = 500000 - self.training_intensity = 1 - # __sphinx_doc_end__ - # fmt: on - - @override(DDPGConfig) - def training( - self, - *, - timeout_s_sampler_manager: Optional[float] = NotProvided, - timeout_s_replay_manager: Optional[float] = NotProvided, - **kwargs, - ) -> "ApexDDPGConfig": - """Sets the training related configuration. - - Args: - timeout_s_sampler_manager: The timeout for waiting for sampling results - for workers -- typically if this is too low, the manager won't be able - to retrieve ready sampling results. - timeout_s_replay_manager: The timeout for waiting for replay worker - results -- typically if this is too low, the manager won't be able to - retrieve ready replay requests. - - Returns: - This updated ApexDDPGConfig object. - """ - super().training(**kwargs) - - if timeout_s_sampler_manager is not NotProvided: - self.timeout_s_sampler_manager = timeout_s_sampler_manager - if timeout_s_replay_manager is not NotProvided: - self.timeout_s_replay_manager = timeout_s_replay_manager - - return self - - -class ApexDDPG(DDPG, ApexDQN): - @classmethod - @override(DDPG) - def get_default_config(cls) -> AlgorithmConfig: - return ApexDDPGConfig() - - @override(DDPG) - def setup(self, config: AlgorithmConfig): - return ApexDQN.setup(self, config) - - @override(DDPG) - def training_step(self) -> ResultDict: - """Use APEX-DQN's training iteration function.""" - return ApexDQN.training_step(self) diff --git a/rllib_contrib/apex_ddpg/tests/test_apex_ddpg.py b/rllib_contrib/apex_ddpg/tests/test_apex_ddpg.py deleted file mode 100644 index af2cb1863bb34..0000000000000 --- a/rllib_contrib/apex_ddpg/tests/test_apex_ddpg.py +++ /dev/null @@ -1,67 +0,0 @@ -import unittest - -import pytest -from rllib_apex_ddpg.apex_ddpg.apex_ddpg import ApexDDPGConfig - -import ray -from ray.rllib.utils.test_utils import ( - check, - check_compute_single_action, - check_train_results, - framework_iterator, -) - - -class TestApexDDPG(unittest.TestCase): - def setUp(self): - ray.init() - - def tearDown(self): - ray.shutdown() - - def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self): - """Test whether APEX-DDPG can be built on all frameworks.""" - config = ( - ApexDDPGConfig() - .environment(env="Pendulum-v1") - .rollouts(num_rollout_workers=2) - .reporting(min_sample_timesteps_per_iteration=100) - .training( - num_steps_sampled_before_learning_starts=0, - optimizer={"num_replay_buffer_shards": 1}, - ) - ) - - num_iterations = 1 - - for _ in framework_iterator(config, with_eager_tracing=True): - algo = config.build() - - # Test per-worker scale distribution. 
- infos = algo.workers.foreach_policy(lambda p, _: p.get_exploration_state()) - scale = [i["cur_scale"] for i in infos] - expected = [ - 0.4 ** (1 + (i + 1) / float(config.num_rollout_workers - 1) * 7) - for i in range(config.num_rollout_workers) - ] - check(scale, [0.0] + expected) - - for _ in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - - # Test again per-worker scale distribution - # (should not have changed). - infos = algo.workers.foreach_policy(lambda p, _: p.get_exploration_state()) - scale = [i["cur_scale"] for i in infos] - check(scale, [0.0] + expected) - - algo.stop() - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/apex_ddpg/tuned_examples/__init__.py b/rllib_contrib/apex_ddpg/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/apex_ddpg/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml b/rllib_contrib/apex_ddpg/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml deleted file mode 100644 index cd9bf73d5b8e8..0000000000000 --- a/rllib_contrib/apex_ddpg/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# This can be expected to reach 90 reward within ~1.5-2.5m timesteps / ~150-250 seconds on a K40 GPU -mountaincarcontinuous-apex-ddpg: - env: MountainCarContinuous-v0 - run: APEX_DDPG - stop: - sampler_results/episode_reward_mean: 90 - config: - # Works for both torch and tf. - framework: torch - clip_rewards: False - num_workers: 16 - exploration_config: - ou_base_scale: 1.0 - n_step: 3 - target_network_update_freq: 50000 - tau: 1.0 - evaluation_interval: 5 - evaluation_duration: 10 diff --git a/rllib_contrib/apex_ddpg/tuned_examples/pendulum-apex-ddpg.yaml b/rllib_contrib/apex_ddpg/tuned_examples/pendulum-apex-ddpg.yaml deleted file mode 100644 index 57160e9e3c732..0000000000000 --- a/rllib_contrib/apex_ddpg/tuned_examples/pendulum-apex-ddpg.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# This can be expected to reach -160 reward within 2.5 timesteps / ~250 seconds on a K40 GPU -pendulum-apex-ddpg: - env: Pendulum-v1 - run: APEX_DDPG - stop: - sampler_results/episode_reward_mean: -160 - config: - # Works for both torch and tf. 
- framework: torch - use_huber: True - clip_rewards: False - num_workers: 16 - n_step: 1 - target_network_update_freq: 50000 - tau: 1.0 - evaluation_interval: 5 - evaluation_duration: 10 diff --git a/rllib_contrib/apex_dqn/BUILD b/rllib_contrib/apex_dqn/BUILD deleted file mode 100644 index 1d5de3af1e23d..0000000000000 --- a/rllib_contrib/apex_dqn/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -# Examples - -py_test( - name = "example_apex_dqn_cartpole_v1", - main = "apex_dqn_cartpole_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/apex_dqn_cartpole_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -# Compilation Tests - -py_test( - name = "test_apex_dqn", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_apex_dqn.py"] -) diff --git a/rllib_contrib/apex_dqn/README.md b/rllib_contrib/apex_dqn/README.md deleted file mode 100644 index fa52a56cfc5c5..0000000000000 --- a/rllib_contrib/apex_dqn/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# APEX DQN (Distributed Prioritized Experience Replay) - -[APEX DQN](https://arxiv.org/pdf/1803.00933.pdf) Distributed Prioritized Experience Replay is an algorithm that decouples -active learning from sampling. Actors interact with their own instances of the environment by selecting actions according -to a shared neural network, and accumulate the resulting experience in a shared experience replay memory; the learner replays samples of experience and updates the neural network. The architecture relies on prioritized experience replay to -focus only on the most significant data generated by the actors. - -## Installation - -``` -conda create -n rllib-apex-dqn python=3.10 -conda activate rllib-apex-dqn -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[APEX-DQN Example]() \ No newline at end of file diff --git a/rllib_contrib/apex_dqn/examples/apex_dqn_cartpole_v1.py b/rllib_contrib/apex_dqn/examples/apex_dqn_cartpole_v1.py deleted file mode 100644 index 10e7cbcb53fe1..0000000000000 --- a/rllib_contrib/apex_dqn/examples/apex_dqn_cartpole_v1.py +++ /dev/null @@ -1,58 +0,0 @@ -import argparse - -from rllib_apex_dqn.apex_dqn import ApexDQN, ApexDQNConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - ApexDQNConfig() - .rollouts(num_rollout_workers=3) - .environment("CartPole-v1") - .training( - replay_buffer_config={ - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 20000, - }, - num_steps_sampled_before_learning_starts=1000, - optimizer={"num_replay_buffer_shards": 2}, - target_network_update_freq=500, - training_intensity=4, - ) - .resources(num_gpus=0) - .reporting(min_sample_timesteps_per_iteration=1000, min_time_s_per_iteration=5) - ) - - stop_reward = 150.0 - - tuner = tune.Tuner( - ApexDQN, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 250000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git 
a/rllib_contrib/apex_dqn/pyproject.toml b/rllib_contrib/apex_dqn/pyproject.toml deleted file mode 100644 index fdf271baf997d..0000000000000 --- a/rllib_contrib/apex_dqn/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-apex-dqn" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium[atari]", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/apex_dqn/requirements.txt b/rllib_contrib/apex_dqn/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/apex_dqn/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/apex_dqn/src/rllib_apex_dqn/apex_dqn/__init__.py b/rllib_contrib/apex_dqn/src/rllib_apex_dqn/apex_dqn/__init__.py deleted file mode 100644 index bd6a6f4ba71a2..0000000000000 --- a/rllib_contrib/apex_dqn/src/rllib_apex_dqn/apex_dqn/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from rllib_apex_dqn.apex_dqn.apex_dqn import ApexDQN, ApexDQNConfig - -from ray.tune.registry import register_trainable - -__all__ = ["ApexDQNConfig", "ApexDQN"] - -register_trainable("rllib-contrib-apex-dqn", ApexDQN) diff --git a/rllib_contrib/apex_dqn/src/rllib_apex_dqn/apex_dqn/apex_dqn.py b/rllib_contrib/apex_dqn/src/rllib_apex_dqn/apex_dqn/apex_dqn.py deleted file mode 100644 index 868292f81be03..0000000000000 --- a/rllib_contrib/apex_dqn/src/rllib_apex_dqn/apex_dqn/apex_dqn.py +++ /dev/null @@ -1,755 +0,0 @@ -""" -Distributed Prioritized Experience Replay (Ape-X) -================================================= - -This file defines a DQN algorithm using the Ape-X architecture. - -Ape-X uses a single GPU learner and many CPU workers for experience collection. -Experience collection can scale to hundreds of CPU workers due to the -distributed prioritization of experience prior to storage in replay buffers. 
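The paragraph above captures the split that gives Ape-X its scalability: actors only generate (and pre-prioritize) experience, while a single learner consumes it and periodically broadcasts fresh weights back. A deliberately simplified, sequential sketch of that division of labor is shown below; every name in it is invented for illustration, whereas the module that follows implements the same idea with Ray actors, sharded replay buffers, and a learner thread.

```python
# Conceptual sketch only -- unrelated to the classes defined in this module.
import collections
import random

replay = collections.deque(maxlen=10_000)  # stands in for the sharded replay buffer
weights_version = 0                        # stands in for the learner's network weights


def actor_step(env_steps=50):
    # Actors roll out with their own (possibly slightly stale) copy of the
    # weights and ship experience, never gradients.
    for _ in range(env_steps):
        replay.append(("obs", "action", random.random(), "next_obs"))


def learner_step(batch_size=32):
    global weights_version
    if len(replay) >= batch_size:
        batch = random.sample(list(replay), batch_size)  # prioritized in real Ape-X
        # ... compute TD errors on `batch`, update the network, refresh priorities ...
        weights_version += 1  # actors would periodically pull this new version


for _ in range(100):
    actor_step()    # in Ape-X, many such workers run concurrently on CPUs
    learner_step()  # while a single (typically GPU) learner trains
print(f"{len(replay)} transitions collected, {weights_version} learner updates")
```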
- -Detailed documentation: -https://docs.ray.io/en/master/rllib-algorithms.html#distributed-prioritized-experience-replay-ape-x -""" # noqa: E501 -import copy -import platform -import random -from collections import defaultdict -from typing import Any, Dict, List, Optional, Tuple, Union - -import ray -from ray._private.dict import merge_dicts -from ray.actor import ActorHandle -from ray.rllib.algorithms import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.dqn.dqn import DQN, DQNConfig -from ray.rllib.algorithms.dqn.learner_thread import LearnerThread -from ray.rllib.evaluation.rollout_worker import RolloutWorker -from ray.rllib.evaluation.worker_set import handle_remote_call_result_errors -from ray.rllib.utils.actor_manager import FaultTolerantActorManager -from ray.rllib.utils.actors import create_colocated_actors -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE -from ray.rllib.utils.metrics import ( - LAST_TARGET_UPDATE_TS, - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, - NUM_TARGET_UPDATES, - SAMPLE_TIMER, - SYNCH_WORKER_WEIGHTS_TIMER, - TARGET_NET_UPDATE_TIMER, -) -from ray.rllib.utils.typing import ( - PartialAlgorithmConfigDict, - ResultDict, - SampleBatchType, -) -from ray.tune.execution.placement_groups import PlacementGroupFactory -from ray.tune.trainable import Trainable - - -class ApexDQNConfig(DQNConfig): - """Defines a configuration class from which an ApexDQN Algorithm can be built. - - Example: - >>> from ray.rllib.algorithms.apex_dqn.apex_dqn import ApexDQNConfig - >>> config = ApexDQNConfig() - >>> print(config.replay_buffer_config) # doctest: +SKIP - >>> replay_config = config.replay_buffer_config.update( # doctest: +SKIP - ... { - ... "capacity": 100000, - ... "prioritized_replay_alpha": 0.45, - ... "prioritized_replay_beta": 0.55, - ... "prioritized_replay_eps": 3e-6, - ... } - ... ) - >>> config = config.training(replay_buffer_config=replay_config) #doctest: +SKIP - >>> config = config.resources(num_gpus=1) # doctest: +SKIP - >>> config = config.rollouts(num_rollout_workers=30) # doctest: +SKIP - >>> config = config.environment("CartPole-v1") # doctest: +SKIP - >>> algo = config.build() # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.apex_dqn.apex_dqn import ApexDQNConfig - >>> from ray import air - >>> from ray import tune - >>> config = ApexDQNConfig() - >>> config.training( # doctest: +SKIP - ... num_atoms=tune.grid_search(list(range(1, 11))) - >>> config.environment(env="CartPole-v1") # doctest: +SKIP - >>> tune.Tuner( # doctest: +SKIP - ... "APEX", - ... run_config=air.RunConfig(stop={"episode_reward_mean":200}), - ... param_space=config.to_dict() - ... ).fit() - - Example: - >>> from ray.rllib.algorithms.apex_dqn.apex_dqn import ApexDQNConfig - >>> config = ApexDQNConfig() - >>> print(config.exploration_config) # doctest: +SKIP - >>> explore_config = config.exploration_config.update( # doctest: +SKIP - ... { - ... "type": "EpsilonGreedy", - ... "initial_epsilon": 0.96, - ... "final_epsilon": 0.01, - ... "epsilone_timesteps": 5000, - ... } - ... ) - >>> config = config.training( # doctest: +SKIP - ... lr_schedule=[[1, 1e-3, [500, 5e-3]] - ... ) - >>> config = config.exploration( # doctest: +SKIP - ... exploration_config=explore_config - ... 
) - - Example: - >>> from ray.rllib.algorithms.apex_dqn.apex_dqn import ApexDQNConfig - >>> config = ApexDQNConfig() - >>> print(config.exploration_config) # doctest: +SKIP - >>> explore_config = config.exploration_config.update( # doctest: +SKIP - ... { - ... "type": "SoftQ", - ... "temperature": [1.0], - ... } - ... ) - >>> config = config.training( # doctest: +SKIP - ... lr_schedule=[[1, 1e-3, [500, 5e-3]] - ... ) - >>> config = config.exploration( # doctest: +SKIP - ... exploration_config=explore_config - ... ) - """ - - def __init__(self, algo_class=None): - """Initializes a ApexConfig instance.""" - super().__init__(algo_class=algo_class or ApexDQN) - - # fmt: off - # __sphinx_doc_begin__ - # APEX-DQN settings overriding DQN ones: - # .training() - self.optimizer = merge_dicts( - DQNConfig().optimizer, { - "max_weight_sync_delay": 400, - "num_replay_buffer_shards": 4, - "debug": False - }) - self.n_step = 3 - self.train_batch_size = 512 - self.target_network_update_freq = 500000 - self.training_intensity = 1 - # Number of timesteps to collect from rollout workers before we start - # sampling from replay buffers for learning. Whether we count this in agent - # steps or environment steps depends on config.multi_agent(count_steps_by=..). - self.num_steps_sampled_before_learning_starts = 50000 - - self.max_requests_in_flight_per_replay_worker = float("inf") - self.timeout_s_sampler_manager = 0.0 - self.timeout_s_replay_manager = 0.0 - # APEX-DQN is using a distributed (non local) replay buffer. - self.replay_buffer_config = { - "no_local_replay_buffer": True, - # Specify prioritized replay by supplying a buffer type that supports - # prioritization - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 2000000, - # Alpha parameter for prioritized replay buffer. - "prioritized_replay_alpha": 0.6, - # Beta parameter for sampling from prioritized replay buffer. - "prioritized_replay_beta": 0.4, - # Epsilon to add to the TD errors when updating priorities. - "prioritized_replay_eps": 1e-6, - # Whether all shards of the replay buffer must be co-located - # with the learner process (running the execution plan). - # This is preferred b/c the learner process should have quick - # access to the data from the buffer shards, avoiding network - # traffic each time samples from the buffer(s) are drawn. - # Set this to False for relaxing this constraint and allowing - # replay shards to be created on node(s) other than the one - # on which the learner is located. - "replay_buffer_shards_colocated_with_driver": True, - "worker_side_prioritization": True, - # Deprecated key. - "prioritized_replay": DEPRECATED_VALUE, - } - - # .rollouts() - self.num_rollout_workers = 32 - self.rollout_fragment_length = 50 - self.exploration_config = { - "type": "PerWorkerEpsilonGreedy", - } - - # .resources() - self.num_gpus = 1 - - # .reporting() - self.min_time_s_per_iteration = 30 - self.min_sample_timesteps_per_iteration = 25000 - - # fmt: on - # __sphinx_doc_end__ - - def training( - self, - *, - max_requests_in_flight_per_replay_worker: Optional[int] = NotProvided, - timeout_s_sampler_manager: Optional[float] = NotProvided, - timeout_s_replay_manager: Optional[float] = NotProvided, - **kwargs, - ) -> "ApexDQNConfig": - """Sets the training related configuration. - - Args: - num_atoms: Number of atoms for representing the distribution of return. - When this is greater than 1, distributional Q-learning is used. 
-            v_min: Minimum value estimation.
-            v_max: Maximum value estimation.
-            noisy: Whether to use a noisy network to aid exploration. This adds
-                parametric noise to the model weights.
-            sigma0: Controls the initial parameter noise for noisy nets.
-            dueling: Whether to use dueling DQN.
-            hiddens: Dense-layer setup for each of the advantage branch and the value
-                branch.
-            double_q: Whether to use double DQN.
-            n_step: N-step for Q-learning.
-            before_learn_on_batch: Callback to run before learning on a multi-agent
-                batch of experiences.
-            training_intensity: The intensity with which to update the model (vs
-                collecting samples from the env).
-                If None, uses the "natural" value of:
-                `train_batch_size` / (`rollout_fragment_length` x `num_workers` x
-                `num_envs_per_worker`).
-                If not None, will make sure that the ratio between timesteps inserted
-                into and sampled from the buffer matches the given value.
-                Example:
-                training_intensity=1000.0
-                train_batch_size=250
-                rollout_fragment_length=1
-                num_workers=1 (or 0)
-                num_envs_per_worker=1
-                -> natural value = 250 / 1 = 250.0
-                -> will make sure that the replay+train op is executed 4x as often as
-                the rollout+insert op (4 * 250 = 1000).
-                See: rllib/algorithms/dqn/dqn.py::calculate_rr_weights for further
-                details.
-            replay_buffer_config: Replay buffer config.
-                Examples:
-                {
-                "_enable_replay_buffer_api": True,
-                "type": "MultiAgentReplayBuffer",
-                "capacity": 50000,
-                "replay_sequence_length": 1,
-                }
-
-                OR
-
-                {
-                "_enable_replay_buffer_api": True,
-                "type": "MultiAgentPrioritizedReplayBuffer",
-                "capacity": 50000,
-                "prioritized_replay_alpha": 0.6,
-                "prioritized_replay_beta": 0.4,
-                "prioritized_replay_eps": 1e-6,
-                "replay_sequence_length": 1,
-                }
-
-                Where
-
-                prioritized_replay_alpha: Alpha parameter controls the degree of
-                prioritization in the buffer. In other words, when a buffer sample has
-                a higher temporal-difference error, with how much more probability
-                should it be drawn and used to update the parametrized Q-network. 0.0
-                corresponds to uniform probability. Setting this much above 1.0 may
-                quickly make the sampling distribution heavily "pointy" with low
-                entropy.
-                prioritized_replay_beta: Beta parameter controls the degree of
-                importance sampling which suppresses the influence of gradient updates
-                from samples that have a higher probability of being sampled via the
-                alpha parameter and the temporal-difference error.
-                prioritized_replay_eps: Epsilon parameter sets the baseline probability
-                for sampling so that when the temporal-difference error of a sample is
-                zero, there is still a chance of drawing the sample.
-            max_requests_in_flight_per_replay_worker: Max number of inflight requests
-                to each replay (shard) worker. See the FaultTolerantActorManager class
-                for more details.
-                Tuning this value is important when running experiments
-                with large sample batches, where there is the risk that the object
-                store may fill up, causing spilling of objects to disk. This can make
-                any asynchronous requests very slow, slowing down your experiment as
-                well. You can inspect the object store during your experiment via a
-                call to `ray memory` on your head node, and by using the Ray
-                dashboard. If you see that the object store is filling up, turn down
-                the number of remote requests in flight, or enable compression of the
-                collected timesteps in your experiment.
- timeout_s_sampler_manager: The timeout for waiting for sampling results - for workers -- typically if this is too low, the manager won't be able - to retrieve ready sampling results. - timeout_s_replay_manager: The timeout for waiting for replay worker - results -- typically if this is too low, the manager won't be able to - retrieve ready replay requests. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if max_requests_in_flight_per_replay_worker is not NotProvided: - self.max_requests_in_flight_per_replay_worker = ( - max_requests_in_flight_per_replay_worker - ) - if timeout_s_sampler_manager is not NotProvided: - self.timeout_s_sampler_manager = timeout_s_sampler_manager - if timeout_s_replay_manager is not NotProvided: - self.timeout_s_replay_manager = timeout_s_replay_manager - - return self - - @override(DQNConfig) - def validate(self) -> None: - if self.num_gpus > 1: - raise ValueError("`num_gpus` > 1 not yet supported for APEX-DQN!") - # Call DQN's validation method. - super().validate() - - -class ApexDQN(DQN): - @override(Trainable) - def setup(self, config: AlgorithmConfig): - super().setup(config) - - num_replay_buffer_shards = self.config.optimizer["num_replay_buffer_shards"] - - # Create copy here so that we can modify without breaking other logic - replay_actor_config = copy.deepcopy(self.config.replay_buffer_config) - - replay_actor_config["capacity"] = ( - self.config.replay_buffer_config["capacity"] // num_replay_buffer_shards - ) - - ReplayActor = ray.remote(num_cpus=0, max_restarts=-1)( - replay_actor_config["type"] - ) - - # Place all replay buffer shards on the same node as the learner - # (driver process that runs this execution plan). - if replay_actor_config["replay_buffer_shards_colocated_with_driver"]: - _replay_actors = create_colocated_actors( - actor_specs=[ # (class, args, kwargs={}, count) - ( - ReplayActor, - None, - replay_actor_config, - num_replay_buffer_shards, - ) - ], - node=platform.node(), # localhost - )[ - 0 - ] # [0]=only one item in `actor_specs`. - # Place replay buffer shards on any node(s). - else: - _replay_actors = [ - ReplayActor.remote(*replay_actor_config) - for _ in range(num_replay_buffer_shards) - ] - self._replay_actor_manager = FaultTolerantActorManager( - _replay_actors, - max_remote_requests_in_flight_per_actor=( - self.config.max_requests_in_flight_per_replay_worker - ), - ) - self._replay_req_timeout_s = self.config.timeout_s_replay_manager - self._sample_req_tiemeout_s = self.config.timeout_s_sampler_manager - self.learner_thread = LearnerThread(self.workers.local_worker()) - self.learner_thread.start() - self.steps_since_update = defaultdict(int) - weights = self.workers.local_worker().get_weights() - self.curr_learner_weights = ray.put(weights) - self.curr_num_samples_collected = 0 - self._num_ts_trained_since_last_target_update = 0 - - @classmethod - @override(DQN) - def get_default_config(cls) -> AlgorithmConfig: - return ApexDQNConfig() - - @override(Algorithm) - def _remote_worker_ids_for_metrics(self) -> List[int]: - # Tag those workers (top 1/3rd indices) that we should collect episodes from - # for metrics due to `PerWorkerEpsilonGreedy` exploration strategy. 
- num_remote_workers_for_metrics = self.config["num_workers"] // 3 - return self.workers.healthy_worker_ids()[-num_remote_workers_for_metrics:] - - @override(DQN) - def training_step(self) -> ResultDict: - num_samples_ready = self.get_samples_and_store_to_replay_buffers() - num_worker_samples_collected = defaultdict(int) - - for worker_id, samples_info in num_samples_ready: - self._counters[NUM_AGENT_STEPS_SAMPLED] += samples_info["agent_steps"] - self._counters[NUM_ENV_STEPS_SAMPLED] += samples_info["env_steps"] - num_worker_samples_collected[worker_id] += samples_info["agent_steps"] - - # Update the weights of the workers that returned samples. - # Only do this if there are remote workers (config["num_workers"] > 1). - # Also, only update those policies that were actually trained. - if self.workers.num_remote_workers() > 0: - self.update_workers(num_worker_samples_collected) - - # Update target network every `target_network_update_freq` sample steps. - cur_ts = self._counters[ - NUM_AGENT_STEPS_SAMPLED - if self.config.count_steps_by == "agent_steps" - else NUM_ENV_STEPS_SAMPLED - ] - - if cur_ts > self.config.num_steps_sampled_before_learning_starts: - # trigger a sample from the replay actors and enqueue operation to the - # learner thread. - self.sample_from_replay_buffer_place_on_learner_queue_non_blocking( - num_worker_samples_collected - ) - self.update_replay_sample_priority() - - # Training step done. Try to bring replay actors back to life if necessary. - # Replay actors can start fresh, so we do not need to restore any state. - self._replay_actor_manager.probe_unhealthy_actors( - timeout_seconds=self.config.worker_health_probe_timeout_s, - mark_healthy=True, - ) - - return copy.deepcopy(self.learner_thread.learner_info) - - def get_samples_and_store_to_replay_buffers(self): - # in the case the num_workers = 0 - if self.workers.num_remote_workers() <= 0: - with self._timers[SAMPLE_TIMER]: - local_sampling_worker = self.workers.local_worker() - batch = local_sampling_worker.sample() - actor_id = random.choice(self._replay_actor_manager.healthy_actor_ids()) - self._replay_actor_manager.foreach_actor( - lambda actor: actor.add(batch), - remote_actor_ids=[actor_id], - timeout_seconds=0, - ) - batch_statistics = [ - ( - 0, - { - "agent_steps": batch.agent_steps(), - "env_steps": batch.env_steps(), - }, - ) - ] - return batch_statistics - - replay_actor_manager = self._replay_actor_manager - - def remote_worker_sample_and_store(worker: RolloutWorker): - # This function is run as a remote function on sampling workers, - # and should only be used with the RolloutWorker's apply function ever. - # It is used to gather samples, and trigger the operation to store them to - # replay actors from the rollout worker instead of returning the obj - # refs for the samples to the driver process and doing the sampling - # operation on there. - _batch = worker.sample() - _actor = random.choice(replay_actor_manager.healthy_actor_ids()) - replay_actor_manager.foreach_actor( - lambda actor: actor.add(_batch), - remote_actor_ids=[_actor], - timeout_seconds=0, - ) - _batch_statistics = { - "agent_steps": _batch.agent_steps(), - "env_steps": _batch.env_steps(), - } - return _batch_statistics - - # Sample and Store in the Replay Actors on the sampling workers. 
- with self._timers[SAMPLE_TIMER]: - self.workers.foreach_worker_async( - func=remote_worker_sample_and_store, - healthy_only=True, - ) - num_samples_ready = self.workers.fetch_ready_async_reqs( - timeout_seconds=self._sample_req_tiemeout_s - ) - return num_samples_ready - - def update_workers(self, _num_samples_ready: Dict[ActorHandle, int]) -> int: - """Update the remote workers that have samples ready. - - Args: - _num_samples_ready: A mapping from ActorHandle (RolloutWorker) to - the number of samples returned by the remote worker. - - Returns: - The number of remote workers whose weights were updated. - """ - max_steps_weight_sync_delay = self.config.optimizer["max_weight_sync_delay"] - # Update our local copy of the weights if the learner thread has updated - # the learner worker's weights - policy_ids_updated = self.learner_thread.policy_ids_updated.copy() - self.learner_thread.policy_ids_updated.clear() - if policy_ids_updated: - weights = self.workers.local_worker().get_weights( - policies=policy_ids_updated - ) - self.curr_learner_weights = ray.put(weights) - - num_workers_updated = 0 - - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - curr_weights = self.curr_learner_weights - timestep = self._counters[ - NUM_AGENT_STEPS_TRAINED - if self.config.count_steps_by == "agent_steps" - else NUM_ENV_STEPS_TRAINED - ] - for ( - remote_sampler_worker_id, - num_samples_collected, - ) in _num_samples_ready.items(): - self.steps_since_update[ - remote_sampler_worker_id - ] += num_samples_collected - if ( - self.steps_since_update[remote_sampler_worker_id] - >= max_steps_weight_sync_delay - ): - self.workers.foreach_worker( - func=lambda w: w.set_weights( - ray.get(curr_weights), {"timestep": timestep} - ), - healthy_only=True, - local_worker=False, - timeout_seconds=0, # Do not wait for results. - ) - self.steps_since_update[remote_sampler_worker_id] = 0 - num_workers_updated += 1 - - self._counters["num_weight_syncs"] += 1 - - return num_workers_updated - - def sample_from_replay_buffer_place_on_learner_queue_non_blocking( - self, num_worker_samples_collected: Dict[ActorHandle, int] - ) -> None: - """Get samples from the replay buffer and place them on the learner queue. - - Args: - num_worker_samples_collected: A mapping from ActorHandle (RolloutWorker) to - number of samples returned by the remote worker. This is used to - implement training intensity which is the concept of triggering a - certain amount of training based on the number of samples that have - been collected since the last time that training was triggered. - - """ - - def wait_on_replay_actors() -> List[Tuple[int, SampleBatchType]]: - """Wait for the replay actors to finish sampling for timeout seconds. - - If the timeout is None, then block on the actors indefinitely. - """ - results = self._replay_actor_manager.fetch_ready_async_reqs( - timeout_seconds=self._replay_req_timeout_s - ) - handle_remote_call_result_errors( - results, self.config["ignore_worker_failures"] - ) - return [(r.actor_id, r.get()) for r in results.ignore_errors()] - - num_samples_collected = sum(num_worker_samples_collected.values()) - self.curr_num_samples_collected += num_samples_collected - # Fetch replayed batched from last round. - replay_sample_batches = wait_on_replay_actors() - if ( - self.curr_num_samples_collected >= self.config.train_batch_size - and - # There are at least 1 healthy replay actor. 
- self._replay_actor_manager.num_healthy_actors() > 0 - ): - training_intensity = int(self.config.training_intensity or 1) - num_requests_to_launch = ( - self.curr_num_samples_collected / self.config.train_batch_size - ) * training_intensity - num_requests_to_launch = max(1, round(num_requests_to_launch)) - - self.curr_num_samples_collected = 0 - train_batch_size = self.config.train_batch_size - healthy_worker_ids = self._replay_actor_manager.healthy_actor_ids() - - # Make num_requests_to_launch calls to the underlying replay actors. - worker_ids_to_call = [ - random.choice(healthy_worker_ids) for _ in range(num_requests_to_launch) - ] - self._replay_actor_manager.foreach_actor_async( - func=lambda actor: actor.sample(train_batch_size), - remote_actor_ids=worker_ids_to_call, - ) - # Fetch anything that is already ready. - replay_sample_batches.extend(wait_on_replay_actors()) - - # Add all the tuples of (ActorHandle, SampleBatchType) to the learner queue. - for item in replay_sample_batches: - # Setting block = True prevents the learner thread, - # the main thread, and the gpu loader threads from - # thrashing when there are more samples than the - # learner can reasonable process. - # see https://github.com/ray-project/ray/pull/26581#issuecomment-1187877674 # noqa - self.learner_thread.inqueue.put(item, block=True) - del replay_sample_batches - - def update_replay_sample_priority(self) -> None: - """Update the priorities of the sample batches with new priorities that are - computed by the learner thread. - """ - num_samples_trained_this_itr = 0 - for _ in range(self.learner_thread.outqueue.qsize()): - if self.learner_thread.is_alive(): - ( - replay_actor_id, - priority_dict, - env_steps, - agent_steps, - ) = self.learner_thread.outqueue.get(timeout=0.001) - if self.config.replay_buffer_config.get("prioritized_replay_alpha") > 0: - self._replay_actor_manager.foreach_actor( - func=lambda actor: actor.update_priorities(priority_dict), - remote_actor_ids=[replay_actor_id], - timeout_seconds=0, # Do not wait for results. - ) - num_samples_trained_this_itr += env_steps - self.update_target_networks(env_steps) - self._counters[NUM_ENV_STEPS_TRAINED] += env_steps - self._counters[NUM_AGENT_STEPS_TRAINED] += agent_steps - self.workers.local_worker().set_global_vars( - {"timestep": self._counters[NUM_ENV_STEPS_TRAINED]} - ) - else: - raise RuntimeError("The learner thread died while training") - - self._timers["learner_dequeue"] = self.learner_thread.queue_timer - self._timers["learner_grad"] = self.learner_thread.grad_timer - self._timers["learner_overall"] = self.learner_thread.overall_timer - - def update_target_networks(self, num_new_trained_samples) -> None: - """Update the target networks.""" - self._num_ts_trained_since_last_target_update += num_new_trained_samples - if ( - self._num_ts_trained_since_last_target_update - >= self.config.target_network_update_freq - ): - self._num_ts_trained_since_last_target_update = 0 - with self._timers[TARGET_NET_UPDATE_TIMER]: - to_update = self.workers.local_worker().get_policies_to_train() - self.workers.local_worker().foreach_policy_to_train( - lambda p, pid: pid in to_update and p.update_target() - ) - self._counters[NUM_TARGET_UPDATES] += 1 - self._counters[LAST_TARGET_UPDATE_TS] = self._counters[ - NUM_AGENT_STEPS_TRAINED - if self.config.count_steps_by == "agent_steps" - else NUM_ENV_STEPS_TRAINED - ] - - def _get_shard0_replay_stats(self) -> Dict[str, Any]: - """Get replay stats from the replay actor shard 0. 
- - The first healthy replay actor is picked to fetch stats from. - TODO(jungong) : figure out why not collecting data from all - replay actors? - - Returns: - A dictionary of replay stats. - """ - healthy_actor_ids = self._replay_actor_manager.healthy_actor_ids() - if not healthy_actor_ids: - return {} - - healthy_actor_id = healthy_actor_ids[0] - debug = self.config.optimizer.get("debug") - results = list( - self._replay_actor_manager.foreach_actor( - func=lambda actor: actor.stats(debug), - remote_actor_ids=[healthy_actor_id], - ) - ) - if not results: - return {} - if not results[0].ok: - raise results[0].get() - return results[0].get() - - @override(Algorithm) - def _compile_iteration_results(self, *args, **kwargs): - result = super()._compile_iteration_results(*args, **kwargs) - replay_stats = self._get_shard0_replay_stats() - exploration_infos_list = self.workers.foreach_policy_to_train( - lambda p, pid: {pid: p.get_exploration_state()} - ) - exploration_infos = {} - for info in exploration_infos_list: - # we're guaranteed that each info has policy ids that are unique - exploration_infos.update(info) - other_results = { - "exploration_infos": exploration_infos, - "learner_queue": self.learner_thread.learner_queue_size.stats(), - "replay_shard_0": replay_stats, - } - - result["info"].update(other_results) - return result - - @classmethod - @override(Algorithm) - def default_resource_request( - cls, - config: Union[AlgorithmConfig, PartialAlgorithmConfigDict], - ): - if isinstance(config, AlgorithmConfig): - cf: ApexDQNConfig = config - else: - cf: ApexDQNConfig = cls.get_default_config().update_from_dict(config) - - eval_config = cf.get_evaluation_config_object() - - # Return PlacementGroupFactory containing all needed resources - # (already properly defined as device bundles). - return PlacementGroupFactory( - bundles=[ - { - # Local worker + replay buffer actors. - # Force replay buffers to be on same node to maximize - # data bandwidth between buffers and the learner (driver). - # Replay buffer actors each contain one shard of the total - # replay buffer and use 1 CPU each. - "CPU": cf.num_cpus_for_local_worker - + cf.optimizer["num_replay_buffer_shards"], - "GPU": 0 if cf._fake_gpus else cf.num_gpus, - } - ] - + [ - { - # RolloutWorkers. - "CPU": cf.num_cpus_per_worker, - "GPU": cf.num_gpus_per_worker, - **cf.custom_resources_per_worker, - } - for _ in range(cf.num_rollout_workers) - ] - + ( - [ - { - # Evaluation workers. - # Note: The local eval worker is located on the driver - # CPU. 
- "CPU": eval_config.num_cpus_per_worker, - "GPU": eval_config.num_gpus_per_worker, - **eval_config.custom_resources_per_worker, - } - for _ in range(cf.evaluation_num_workers) - ] - if cf.evaluation_interval - else [] - ), - strategy=cf.placement_strategy, - ) diff --git a/rllib_contrib/apex_dqn/tests/test_apex_dqn.py b/rllib_contrib/apex_dqn/tests/test_apex_dqn.py deleted file mode 100644 index f0185aefea37b..0000000000000 --- a/rllib_contrib/apex_dqn/tests/test_apex_dqn.py +++ /dev/null @@ -1,162 +0,0 @@ -import unittest - -import pytest -from rllib_apex_dqn.apex_dqn.apex_dqn import ApexDQNConfig - -import ray -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY -from ray.rllib.utils.test_utils import ( - check, - check_compute_single_action, - check_train_results, - framework_iterator, -) - - -class TestApexDQN(unittest.TestCase): - def setUp(self): - ray.init(num_cpus=6) - - def tearDown(self): - ray.shutdown() - - def test_apex_zero_workers(self): - config = ( - ApexDQNConfig() - .environment("CartPole-v1") - .rollouts(num_rollout_workers=0) - .resources(num_gpus=0) - .training( - num_steps_sampled_before_learning_starts=0, - optimizer={ - "num_replay_buffer_shards": 1, - }, - ) - .reporting( - min_sample_timesteps_per_iteration=100, - min_time_s_per_iteration=1, - ) - ) - - for _ in framework_iterator(config): - algo = config.build() - results = algo.train() - check_train_results(results) - print(results) - algo.stop() - - def test_apex_dqn_compilation_and_per_worker_epsilon_values(self): - """Test whether APEXDQN can be built on all frameworks.""" - config = ( - ApexDQNConfig() - .environment("CartPole-v1") - .rollouts(num_rollout_workers=3) - .resources(num_gpus=0) - .training( - num_steps_sampled_before_learning_starts=0, - optimizer={ - "num_replay_buffer_shards": 1, - }, - ) - .reporting( - min_sample_timesteps_per_iteration=100, - min_time_s_per_iteration=1, - ) - ) - - for _ in framework_iterator(config, with_eager_tracing=True): - algo = config.build() - - # Test per-worker epsilon distribution. - infos = algo.workers.foreach_policy(lambda p, _: p.get_exploration_state()) - expected = [0.4, 0.016190862, 0.00065536] - check([i["cur_epsilon"] for i in infos], [0.0] + expected) - - check_compute_single_action(algo) - - for i in range(2): - results = algo.train() - check_train_results(results) - print(results) - - # Test again per-worker epsilon distribution - # (should not have changed). - infos = algo.workers.foreach_policy(lambda p, _: p.get_exploration_state()) - check([i["cur_epsilon"] for i in infos], [0.0] + expected) - - algo.stop() - - def test_apex_lr_schedule(self): - config = ( - ApexDQNConfig() - .environment("CartPole-v1") - .rollouts( - num_rollout_workers=1, - rollout_fragment_length=5, - ) - .resources(num_gpus=0) - .training( - train_batch_size=10, - optimizer={ - "num_replay_buffer_shards": 1, - # This makes sure learning schedule is checked every 10 timesteps. - "max_weight_sync_delay": 10, - }, - replay_buffer_config={ - "no_local_replay_buffer": True, - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 100, - "prioritized_replay_alpha": 0.6, - # Beta parameter for sampling from prioritized replay buffer. - "prioritized_replay_beta": 0.4, - # Epsilon to add to the TD errors when updating priorities. - "prioritized_replay_eps": 1e-6, - }, - # Initial lr, doesn't really matter because of the schedule below. 
- lr=0.2, - lr_schedule=[[0, 0.2], [100, 0.001]], - # Number of timesteps to collect from rollout workers before we start - # sampling from replay buffers for learning. - num_steps_sampled_before_learning_starts=10, - ) - .reporting( - min_sample_timesteps_per_iteration=10, - # Make sure that results contain info on default policy - min_train_timesteps_per_iteration=10, - # 0 metrics reporting delay, this makes sure timestep, - # which lr depends on, is updated after each worker rollout. - min_time_s_per_iteration=0, - ) - ) - - def _step_n_times(algo, n: int): - """Step trainer n times. - - Returns: - learning rate at the end of the execution. - """ - for _ in range(n): - results = algo.train() - return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][ - "cur_lr" - ] - - for _ in framework_iterator(config, frameworks=("torch", "tf")): - algo = config.build() - - lr = _step_n_times(algo, 3) # 50 timesteps - # Close to 0.2 - self.assertLessEqual(lr, 0.2) - - lr = _step_n_times(algo, 20) # 200 timesteps - # LR Annealed to 0.001 - self.assertLessEqual(lr, 0.0011) - - algo.stop() - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/apex_dqn/tuned_examples/__init__.py b/rllib_contrib/apex_dqn/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/apex_dqn/tuned_examples/atari-apex-dqn.yaml b/rllib_contrib/apex_dqn/tuned_examples/atari-apex-dqn.yaml deleted file mode 100644 index c6e8254c2cf24..0000000000000 --- a/rllib_contrib/apex_dqn/tuned_examples/atari-apex-dqn.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -apex-breakoutnoframeskip-v5: - env: ALE/Breakout-v5 - run: APEX - # Minimum reward and total ts (in given time_total_s) to pass this test. - stop: - time_total_s: 7200 - sampler_results/episode_reward_mean: 20.0 - timesteps_total: 7000000 - config: - # Make analogous to old v4 + NoFrameskip. - env_config: - frameskip: 1 - full_action_space: false - repeat_action_probability: 0.0 - double_q: false - dueling: false - num_atoms: 1 - noisy: false - n_step: 3 - lr: .0001 - adam_epsilon: .00015 - hiddens: [512] - replay_buffer_config: - capacity: 1000000 - prioritized_replay_alpha: 0.5 - exploration_config: - epsilon_timesteps: 200000 - final_epsilon: 0.01 - num_gpus: 1 - num_workers: 8 - num_envs_per_worker: 8 - rollout_fragment_length: 20 - train_batch_size: 512 - target_network_update_freq: 50000 - min_sample_timesteps_per_iteration: 25000 diff --git a/rllib_contrib/apex_dqn/tuned_examples/cartpole-apex-dqn-fake-gpus.yaml b/rllib_contrib/apex_dqn/tuned_examples/cartpole-apex-dqn-fake-gpus.yaml deleted file mode 100644 index 48ccf00741a64..0000000000000 --- a/rllib_contrib/apex_dqn/tuned_examples/cartpole-apex-dqn-fake-gpus.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# Note here that with < 3 workers, APEX can behave a little unstably -# due to the (static) per-worker-epsilon distribution, which also makes -# evaluation w/o evaluation worker set harder. -# For an epsilon-free/greedy evaluation, use: -# evaluation_interval: 1 -# evaluation_config: -# explore: False -cartpole-apex-dqn: - env: CartPole-v1 - run: APEX - stop: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 250000 - config: - # Works for both torch and tf. 
- framework: torch - num_workers: 3 - optimizer: - num_replay_buffer_shards: 1 - - # Fake 2 GPUs. - num_gpus: 2 - _fake_gpus: true diff --git a/rllib_contrib/apex_dqn/tuned_examples/cartpole-apex-dqn.yaml b/rllib_contrib/apex_dqn/tuned_examples/cartpole-apex-dqn.yaml deleted file mode 100644 index 63682c766ad5d..0000000000000 --- a/rllib_contrib/apex_dqn/tuned_examples/cartpole-apex-dqn.yaml +++ /dev/null @@ -1,34 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# Note here that with < 3 workers, APEX can behave a little unstably -# due to the (static) per-worker-epsilon distribution, which also makes -# evaluation w/o evaluation worker set harder. -# For an epsilon-free/greedy evaluation, use: -# evaluation_interval: 1 -# evaluation_config: -# explore: False -cartpole-apex-dqn-training-itr: - env: CartPole-v1 - run: APEX - stop: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 250000 - config: - # Works for both torch and tf. - framework: torch - # Make this work with only 5 CPUs and 0 GPUs: - num_workers: 3 - optimizer: - num_replay_buffer_shards: 2 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - capacity: 20000 - num_steps_sampled_before_learning_starts: 1000 - - num_gpus: 0 - - min_time_s_per_iteration: 5 - target_network_update_freq: 500 - min_sample_timesteps_per_iteration: 1000 - training_intensity: 4 \ No newline at end of file diff --git a/rllib_contrib/apex_dqn/tuned_examples/pong-apex-dqn.yaml b/rllib_contrib/apex_dqn/tuned_examples/pong-apex-dqn.yaml deleted file mode 100644 index eb7b8e03d4b34..0000000000000 --- a/rllib_contrib/apex_dqn/tuned_examples/pong-apex-dqn.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# This reaches ~19 reward in < 40 minutes (3M env steps) on a p3.8xlarge AWS instance. -# See https://app.wandb.ai/zplizzi/test/runs/ayuuhixr?workspace=user-zplizzi -# for training curves. -pong-apex: - env: ALE/Pong-v5 - run: APEX - stop: - sampler_results/episode_reward_mean: 19.0 - timesteps_total: 4000000 - config: - # Works for both torch and tf. - framework: torch - # Make analogous to old v4 + NoFrameskip. 
-        env_config:
-            frameskip: 1
-            full_action_space: false
-            repeat_action_probability: 0.0
-        target_network_update_freq: 20000
-        num_workers: 4
-        num_envs_per_worker: 8
-        lr: .00005
-        train_batch_size: 64
-        replay_buffer_config:
-            type: MultiAgentPrioritizedReplayBuffer
-            capacity: 1000000
-        # We should set compress_observations to True, because few machines
-        # would be able to hold the replay buffers in memory otherwise.
-        compress_observations: True
-        gamma: 0.99
-        training_intensity: 16
diff --git a/rllib_contrib/ars/BUILD b/rllib_contrib/ars/BUILD
deleted file mode 100644
index a82158cbc9ac3..0000000000000
--- a/rllib_contrib/ars/BUILD
+++ /dev/null
@@ -1,31 +0,0 @@
-# Examples
-
-py_test(
-    name = "example_ars_cartpole_v1",
-    main = "ars_cartpole_v1.py",
-    tags = ["team:rllib", "example"],
-    size = "large",
-    srcs = ["examples/ars_cartpole_v1.py"],
-    args = ["--run-as-test"]
-)
-
-# Learning Tests
-
-py_test(
-    name = "learning_tests_cartpole_ars",
-    main = "run_regression_tests.py",
-    tags = ["team:rllib", "learning_tests", "rllib_contrib"],
-    size = "medium",
-    srcs = ["run_regression_tests.py"],
-    data = ["tuned_examples/cartpole-ars.yaml"],
-    args = ["--dir=ars/tuned_examples/"]
-)
-
-# Compilation Tests
-
-py_test(
-    name = "test_ars",
-    tags = ["team:rllib", "algorithms_dir"],
-    size = "large",
-    srcs = ["tests/test_ars.py"]
-)
diff --git a/rllib_contrib/ars/README.md b/rllib_contrib/ars/README.md
deleted file mode 100644
index 0f587c9e1fde9..0000000000000
--- a/rllib_contrib/ars/README.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# ARS (Augmented Random Search)
-
-[ARS](https://arxiv.org/abs/1803.07055) is a random search method for training linear policies for continuous control problems. Code here is adapted from https://github.com/modestyachts/ARS to integrate with RLlib APIs.
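A minimal usage sketch (an illustration only; it assumes the `rllib-ars` package from this directory is installed as described under Installation below, and the CartPole-v1 setup with its hyperparameter values is a placeholder rather than a tuned configuration):

```python
import ray
from rllib_ars.ars import ARSConfig

ray.init()

# Illustrative CartPole-v1 config; see tuned_examples/ for vetted hyperparameters.
config = (
    ARSConfig()
    .environment("CartPole-v1")
    .framework("torch")
    .rollouts(num_rollout_workers=2)
    .training(noise_stdev=0.02, num_rollouts=50, rollouts_used=25, eval_prob=0.5)
)

algo = config.build()
print(algo.train())
```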
- - -## Installation - -``` -conda create -n rllib-ars python=3.10 -conda activate rllib-ars -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[ARS Example]() \ No newline at end of file diff --git a/rllib_contrib/ars/examples/ars_cartpole_v1.py b/rllib_contrib/ars/examples/ars_cartpole_v1.py deleted file mode 100644 index bf635a5f074ea..0000000000000 --- a/rllib_contrib/ars/examples/ars_cartpole_v1.py +++ /dev/null @@ -1,57 +0,0 @@ -import argparse - -from rllib_ars.ars import ARS, ARSConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - ARSConfig() - .rollouts(num_rollout_workers=2) - .framework("torch") - .environment("CartPole-v1") - .training( - noise_stdev=0.02, - rollouts_used=23, - num_rollouts=50, - sgd_stepsize=0.01, - noise_size=250000000, - eval_prob=0.5, - ) - ) - - stop_reward = 150 - - tuner = tune.Tuner( - ARS, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 1000000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved( - results, stop_reward, metric="sampler_results/episode_reward_mean" - ) diff --git a/rllib_contrib/ars/pyproject.toml b/rllib_contrib/ars/pyproject.toml deleted file mode 100644 index c5a44d58d468e..0000000000000 --- a/rllib_contrib/ars/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-ars" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/ars/requirements.txt b/rllib_contrib/ars/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/ars/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/ars/src/rllib_ars/ars/__init__.py b/rllib_contrib/ars/src/rllib_ars/ars/__init__.py deleted file mode 100644 index 096f174a1014c..0000000000000 --- a/rllib_contrib/ars/src/rllib_ars/ars/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from rllib_ars.ars.ars import ARS, ARSConfig -from rllib_ars.ars.ars_tf_policy import ARSTFPolicy -from rllib_ars.ars.ars_torch_policy import ARSTorchPolicy - -from ray.tune.registry import register_trainable - -__all__ = ["ARSConfig", "ARS", "ARSTorchPolicy", "ARSTFPolicy"] - -register_trainable("rllib-contrib-ars", ARS) diff --git a/rllib_contrib/ars/src/rllib_ars/ars/ars.py b/rllib_contrib/ars/src/rllib_ars/ars/ars.py deleted file mode 100644 index 3999269285990..0000000000000 --- a/rllib_contrib/ars/src/rllib_ars/ars/ars.py +++ /dev/null @@ -1,624 +0,0 @@ -# Code in this file 
is copied and adapted from -# https://github.com/openai/evolution-strategies-starter and from -# https://github.com/modestyachts/ARS - -import logging -import random -import time -from collections import namedtuple -from typing import Optional - -import numpy as np -from rllib_ars.ars.ars_tf_policy import ARSTFPolicy - -import ray -from ray.rllib.algorithms import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.es import optimizers, utils -from ray.rllib.algorithms.es.es_tf_policy import rollout -from ray.rllib.env.env_context import EnvContext -from ray.rllib.evaluation.worker_set import WorkerSet -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils import FilterManager -from ray.rllib.utils.actor_manager import FaultAwareApply -from ray.rllib.utils.annotations import override -from ray.rllib.utils.metrics import ( - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, -) -from ray.rllib.utils.torch_utils import set_torch_seed - -logger = logging.getLogger(__name__) - -Result = namedtuple( - "Result", - [ - "noise_indices", - "noisy_returns", - "sign_noisy_returns", - "noisy_lengths", - "eval_returns", - "eval_lengths", - ], -) - - -class ARSConfig(AlgorithmConfig): - """Defines a configuration class from which an ARS Algorithm can be built. - - Example: - >>> from ray.rllib.algorithms.ars import ARSConfig - >>> config = ARSConfig() # doctest: +SKIP - >>> config = config.training(report_length=20) # doctest: +SKIP - >>> config = config.resources(num_gpus=0) # doctest: +SKIP - >>> config = config.rollouts(num_rollout_workers=4) # doctest: +SKIP - >>> config = config.environment("CartPole-v1") # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build() # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.ars import ARSConfig - >>> from ray import air - >>> from ray import tune - >>> config = ARSConfig() - >>> # Print out some default values. - >>> print(config.action_noise_std) # doctest: +SKIP - >>> # Update the config object. - >>> config = config.training( # doctest: +SKIP - ... rollouts_used=tune.grid_search([32, 64]), eval_prob=0.5) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "ARS", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self): - """Initializes a ARSConfig instance.""" - super().__init__(algo_class=ARS) - - # fmt: off - # __sphinx_doc_begin__ - - # ARS specific settings: - self.action_noise_std = 0.0 - self.noise_stdev = 0.02 - self.num_rollouts = 32 - self.rollouts_used = 32 - self.sgd_stepsize = 0.01 - self.noise_size = 250000000 - self.eval_prob = 0.03 - self.report_length = 10 - self.offset = 0 - self.tf_single_threaded = True - - # Override some of AlgorithmConfig's default values with ARS-specific values. - self.num_rollout_workers = 2 - self.observation_filter = "MeanStdFilter" - - # ARS will use Algorithm's evaluation WorkerSet (if evaluation_interval > 0). 
- # Therefore, we must be careful not to use more than 1 env per eval worker - # (would break ARSPolicy's compute_single_action method) and to not do - # obs-filtering. - self.evaluation( - evaluation_config=AlgorithmConfig.overrides( - num_envs_per_worker=1, - observation_filter="NoFilter", - ) - ) - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). - } - # __sphinx_doc_end__ - # fmt: on - - @override(AlgorithmConfig) - def training( - self, - *, - action_noise_std: Optional[float] = NotProvided, - noise_stdev: Optional[float] = NotProvided, - num_rollouts: Optional[int] = NotProvided, - rollouts_used: Optional[int] = NotProvided, - sgd_stepsize: Optional[float] = NotProvided, - noise_size: Optional[int] = NotProvided, - eval_prob: Optional[float] = NotProvided, - report_length: Optional[int] = NotProvided, - offset: Optional[int] = NotProvided, - tf_single_threaded: Optional[bool] = NotProvided, - **kwargs, - ) -> "ARSConfig": - """Sets the training related configuration. - - Args: - action_noise_std: Std. deviation to be used when adding (standard normal) - noise to computed actions. Action noise is only added, if - `compute_actions` is called with the `add_noise` arg set to True. - noise_stdev: Std. deviation of parameter noise. - num_rollouts: Number of perturbs to try. - rollouts_used: Number of perturbs to keep in gradient estimate. - sgd_stepsize: SGD step-size used for the Adam optimizer. - noise_size: Number of rows in the noise table (shared across workers). - Each row contains a gaussian noise value for each model parameter. - eval_prob: Probability of evaluating the parameter rewards. - report_length: How many of the last rewards we average over. - offset: Value to subtract from the reward (e.g. survival bonus - from humanoid) during rollouts. - tf_single_threaded: Whether the tf-session should be generated without any - parallelism options. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if action_noise_std is not NotProvided: - self.action_noise_std = action_noise_std - if noise_stdev is not NotProvided: - self.noise_stdev = noise_stdev - if num_rollouts is not NotProvided: - self.num_rollouts = num_rollouts - if rollouts_used is not NotProvided: - self.rollouts_used = rollouts_used - if sgd_stepsize is not NotProvided: - self.sgd_stepsize = sgd_stepsize - if noise_size is not NotProvided: - self.noise_size = noise_size - if eval_prob is not NotProvided: - self.eval_prob = eval_prob - if report_length is not NotProvided: - self.report_length = report_length - if offset is not NotProvided: - self.offset = offset - if tf_single_threaded is not NotProvided: - self.tf_single_threaded = tf_single_threaded - - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call super's validation method. 
- super().validate() - - if self.num_gpus > 1: - raise ValueError("`num_gpus` > 1 not yet supported for ARS!") - if self.num_rollout_workers <= 0: - raise ValueError("`num_rollout_workers` must be > 0 for ARS!") - if ( - self.evaluation_config is not None - and self.evaluation_config.get("num_envs_per_worker") != 1 - ): - raise ValueError( - "`evaluation_config.num_envs_per_worker` must always be 1 for " - "ARS! To parallelize evaluation, increase " - "`evaluation_num_workers` to > 1." - ) - if ( - self.evaluation_config is not None - and self.evaluation_config.get("observation_filter") != "NoFilter" - ): - raise ValueError( - "`evaluation_config.observation_filter` must always be " - "`NoFilter` for ARS!" - ) - - -@ray.remote -def create_shared_noise(count): - """Create a large array of noise to be shared by all workers.""" - seed = 123 - noise = np.random.RandomState(seed).randn(count).astype(np.float32) - return noise - - -class SharedNoiseTable: - def __init__(self, noise): - self.noise = noise - assert self.noise.dtype == np.float32 - - def get(self, i, dim): - return self.noise[i : i + dim] - - def sample_index(self, dim): - return np.random.randint(0, len(self.noise) - dim + 1) - - def get_delta(self, dim): - idx = self.sample_index(dim) - return idx, self.get(idx, dim) - - -@ray.remote(max_restarts=-1) -class Worker(FaultAwareApply): - def __init__( - self, - config: AlgorithmConfig, - env_creator, - noise, - worker_index, - min_task_runtime=0.2, - ): - # Set Python random, numpy, env, and torch/tf seeds. - seed = config.seed - if seed is not None: - # Python random module. - random.seed(seed) - # Numpy. - np.random.seed(seed) - # Torch. - if config.framework_str == "torch": - set_torch_seed(seed) - - self.min_task_runtime = min_task_runtime - self.config = config - self.noise = SharedNoiseTable(noise) - - env_context = EnvContext(self.config.env_config, worker_index) - self.env = env_creator(env_context) - # Seed the env, if gym.Env. - if not hasattr(self.env, "seed"): - logger.info("Env doesn't support env.seed(): {}".format(self.env)) - # Gym.env. - else: - self.env.seed(seed) - - from ray.rllib import models - - self.preprocessor = models.ModelCatalog.get_preprocessor(self.env) - - policy_cls = get_policy_class(self.config) - self.policy = policy_cls( - self.env.observation_space, self.env.action_space, config.to_dict() - ) - - @property - def filters(self): - return {DEFAULT_POLICY_ID: self.policy.observation_filter} - - def sync_filters(self, new_filters): - for k in self.filters: - self.filters[k].sync(new_filters[k]) - - def get_filters(self, flush_after=False): - return_filters = {} - for k, f in self.filters.items(): - return_filters[k] = f.as_serializable() - if flush_after: - f.reset_buffer() - return return_filters - - def rollout(self, timestep_limit, add_noise=False): - rollout_rewards, rollout_fragment_length = rollout( - self.policy, - self.env, - timestep_limit=timestep_limit, - add_noise=add_noise, - offset=self.config.offset, - ) - return rollout_rewards, rollout_fragment_length - - def do_rollouts(self, params, timestep_limit=None): - # Set the network weights. - self.policy.set_flat_weights(params) - - noise_indices, returns, sign_returns, lengths = [], [], [], [] - eval_returns, eval_lengths = [], [] - - # Perform some rollouts with noise. - while len(noise_indices) == 0: - if np.random.uniform() < self.config.eval_prob: - # Do an evaluation run with no perturbation. 
- self.policy.set_flat_weights(params) - rewards, length = self.rollout(timestep_limit, add_noise=False) - eval_returns.append(rewards.sum()) - eval_lengths.append(length) - else: - # Do a regular run with parameter perturbations. - noise_index = self.noise.sample_index(self.policy.num_params) - - perturbation = self.config.noise_stdev * self.noise.get( - noise_index, self.policy.num_params - ) - - # These two sampling steps could be done in parallel on - # different actors letting us update twice as frequently. - self.policy.set_flat_weights(params + perturbation) - rewards_pos, lengths_pos = self.rollout(timestep_limit) - - self.policy.set_flat_weights(params - perturbation) - rewards_neg, lengths_neg = self.rollout(timestep_limit) - - noise_indices.append(noise_index) - returns.append([rewards_pos.sum(), rewards_neg.sum()]) - sign_returns.append( - [np.sign(rewards_pos).sum(), np.sign(rewards_neg).sum()] - ) - lengths.append([lengths_pos, lengths_neg]) - - return Result( - noise_indices=noise_indices, - noisy_returns=returns, - sign_noisy_returns=sign_returns, - noisy_lengths=lengths, - eval_returns=eval_returns, - eval_lengths=eval_lengths, - ) - - def stop(self): - """Releases all resources used by this RolloutWorker.""" - pass - - -def get_policy_class(config: AlgorithmConfig): - if config.framework_str == "torch": - from rllib_ars.ars.ars_torch_policy import ARSTorchPolicy - - policy_cls = ARSTorchPolicy - else: - policy_cls = ARSTFPolicy - return policy_cls - - -class ARS(Algorithm): - """Large-scale implementation of Augmented Random Search in Ray.""" - - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return ARSConfig() - - @override(Algorithm) - def setup(self, config: AlgorithmConfig): - # Setup our config: Merge the user-supplied config (which could - # be a partial config dict with the class' default). - if isinstance(config, dict): - self.config = self.get_default_config().update_from_dict(config) - - # Validate our config dict. - self.config.validate() - - # Generate the local env. - env_context = EnvContext(self.config.env_config or {}, worker_index=0) - env = self.env_creator(env_context) - - self.callbacks = self.config.callbacks_class() - - self._policy_class = get_policy_class(self.config) - self.policy = self._policy_class( - env.observation_space, env.action_space, self.config - ) - self.optimizer = optimizers.SGD(self.policy, self.config.sgd_stepsize) - - self.rollouts_used = self.config.rollouts_used - self.num_rollouts = self.config.num_rollouts - self.report_length = self.config.report_length - - # Create the shared noise table. - logger.info("Creating shared noise table.") - noise_id = create_shared_noise.remote(self.config.noise_size) - self.noise = SharedNoiseTable(ray.get(noise_id)) - - # Create the actors. - logger.info("Creating actors.") - - remote_workers = [ - Worker.remote(self.config, self.env_creator, noise_id, idx + 1) - for idx in range(self.config.num_rollout_workers) - ] - self.workers = WorkerSet._from_existing( - local_worker=None, - remote_workers=remote_workers, - ) - - self.episodes_so_far = 0 - self.reward_list = [] - self.tstart = time.time() - - @override(Algorithm) - def get_policy(self, policy=DEFAULT_POLICY_ID): - if policy != DEFAULT_POLICY_ID: - raise ValueError( - "ARS has no policy '{}'! 
Use {} " - "instead.".format(policy, DEFAULT_POLICY_ID) - ) - return self.policy - - @override(Algorithm) - def step(self): - config = self.config - - theta = self.policy.get_flat_weights() - assert theta.dtype == np.float32 - assert len(theta.shape) == 1 - - # Put the current policy weights in the object store. - theta_id = ray.put(theta) - # Use the actors to do rollouts, note that we pass in the ID of the - # policy weights. - results, num_episodes, num_timesteps = self._collect_results( - theta_id, config["num_rollouts"] - ) - # Update our sample steps counters. - self._counters[NUM_AGENT_STEPS_SAMPLED] += num_timesteps - self._counters[NUM_ENV_STEPS_SAMPLED] += num_timesteps - - all_noise_indices = [] - all_training_returns = [] - all_training_lengths = [] - all_eval_returns = [] - all_eval_lengths = [] - - # Loop over the results. - for result in results: - all_eval_returns += result.eval_returns - all_eval_lengths += result.eval_lengths - - all_noise_indices += result.noise_indices - all_training_returns += result.noisy_returns - all_training_lengths += result.noisy_lengths - - assert len(all_eval_returns) == len(all_eval_lengths) - assert ( - len(all_noise_indices) - == len(all_training_returns) - == len(all_training_lengths) - ) - - self.episodes_so_far += num_episodes - - # Assemble the results. - eval_returns = np.array(all_eval_returns) - eval_lengths = np.array(all_eval_lengths) - noise_indices = np.array(all_noise_indices) - noisy_returns = np.array(all_training_returns) - noisy_lengths = np.array(all_training_lengths) - - # keep only the best returns - # select top performing directions if rollouts_used < num_rollouts - max_rewards = np.max(noisy_returns, axis=1) - if self.rollouts_used > self.num_rollouts: - self.rollouts_used = self.num_rollouts - - percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts)) - idx = np.arange(max_rewards.size)[ - max_rewards >= np.percentile(max_rewards, percentile) - ] - noise_idx = noise_indices[idx] - noisy_returns = noisy_returns[idx, :] - - # Compute and take a step. - g, count = utils.batched_weighted_sum( - noisy_returns[:, 0] - noisy_returns[:, 1], - (self.noise.get(index, self.policy.num_params) for index in noise_idx), - batch_size=min(500, noisy_returns[:, 0].size), - ) - g /= noise_idx.size - # scale the returns by their standard deviation - if not np.isclose(np.std(noisy_returns), 0.0): - g /= np.std(noisy_returns) - assert g.shape == (self.policy.num_params,) and g.dtype == np.float32 - # Compute the new weights theta. - theta, update_ratio = self.optimizer.update(-g) - - # Update our train steps counters. - self._counters[NUM_AGENT_STEPS_TRAINED] += num_timesteps - self._counters[NUM_ENV_STEPS_TRAINED] += num_timesteps - - # Set the new weights in the local copy of the policy. - self.policy.set_flat_weights(theta) - # update the reward list - if len(all_eval_returns) > 0: - self.reward_list.append(eval_returns.mean()) - - # Bring restored workers back if necessary. 
- self.restore_workers(self.workers) - - # Now sync the filters - FilterManager.synchronize( - {DEFAULT_POLICY_ID: self.policy.observation_filter}, self.workers - ) - - info = { - "weights_norm": np.square(theta).sum(), - "weights_std": np.std(theta), - "grad_norm": np.square(g).sum(), - "update_ratio": update_ratio, - "episodes_this_iter": noisy_lengths.size, - "episodes_so_far": self.episodes_so_far, - } - - reward_mean = np.mean(self.reward_list[-self.report_length :]) - result = { - "sampler_results": { - "episode_reward_mean": reward_mean, - "episode_len_mean": eval_lengths.mean(), - }, - "timesteps_this_iter": noisy_lengths.sum(), - "info": info, - } - - return result - - @override(Algorithm) - def cleanup(self): - self.workers.stop() - - @override(Algorithm) - def restore_workers(self, workers: WorkerSet): - restored = self.workers.probe_unhealthy_workers() - if restored: - self._sync_weights_to_workers(worker_set=self.workers, worker_ids=restored) - - @override(Algorithm) - def compute_single_action(self, observation, *args, **kwargs): - action, _, _ = self.policy.compute_actions([observation], update=True) - if kwargs.get("full_fetch"): - return action[0], [], {} - return action[0] - - @override(Algorithm) - def _sync_weights_to_workers(self, *, worker_set=None, worker_ids=None): - # Broadcast the new policy weights to all evaluation workers. - assert worker_set is not None - logger.info("Synchronizing weights to evaluation workers.") - weights = ray.put(self.policy.get_flat_weights()) - worker_set.foreach_worker( - lambda w: w.foreach_policy( - lambda p, _: p.set_flat_weights(ray.get(weights)) - ), - local_worker=False, - remote_worker_ids=worker_ids, - ) - - def _collect_results(self, theta_id, min_episodes): - num_episodes, num_timesteps = 0, 0 - results = [] - while num_episodes < min_episodes: - logger.debug( - "Collected {} episodes {} timesteps so far this iter".format( - num_episodes, num_timesteps - ) - ) - rollout_ids = self.workers.foreach_worker( - func=lambda w: w.do_rollouts(ray.get(theta_id)), - local_worker=False, - ) - # Get the results of the rollouts. - for result in rollout_ids: - results.append(result) - # Update the number of episodes and the number of timesteps - # keeping in mind that result.noisy_lengths is a list of lists, - # where the inner lists have length 2. - num_episodes += sum(len(pair) for pair in result.noisy_lengths) - num_timesteps += sum(sum(pair) for pair in result.noisy_lengths) - - return results, num_episodes, num_timesteps - - def __getstate__(self): - return { - "weights": self.policy.get_flat_weights(), - "filter": self.policy.observation_filter, - "episodes_so_far": self.episodes_so_far, - } - - def __setstate__(self, state): - self.episodes_so_far = state["episodes_so_far"] - self.policy.set_flat_weights(state["weights"]) - self.policy.observation_filter = state["filter"] - FilterManager.synchronize( - {DEFAULT_POLICY_ID: self.policy.observation_filter}, self.workers - ) diff --git a/rllib_contrib/ars/src/rllib_ars/ars/ars_tf_policy.py b/rllib_contrib/ars/src/rllib_ars/ars/ars_tf_policy.py deleted file mode 100644 index fc5be5301f874..0000000000000 --- a/rllib_contrib/ars/src/rllib_ars/ars/ars_tf_policy.py +++ /dev/null @@ -1,144 +0,0 @@ -# Code in this file is copied and adapted from -# https://github.com/openai/evolution-strategies-starter. 
- -import gymnasium as gym -import numpy as np -import tree # pip install dm_tree - -import ray -import ray.experimental.tf_utils -from ray.rllib.algorithms.es.es_tf_policy import make_session -from ray.rllib.models import ModelCatalog -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils import deprecation_warning -from ray.rllib.utils.filter import get_filter -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.spaces.space_utils import unbatch - -tf1, tf, tfv = try_import_tf() - - -class ARSTFPolicy(Policy): - def __init__(self, obs_space, action_space, config): - super().__init__(obs_space, action_space, config) - self.action_noise_std = self.config["action_noise_std"] - self.preprocessor = ModelCatalog.get_preprocessor_for_space( - self.observation_space - ) - self.observation_filter = get_filter( - self.config["observation_filter"], self.preprocessor.shape - ) - - if self.config["framework"] == "tf": - self.sess = make_session( - single_threaded=self.config.get("tf_single_threaded", True) - ) - - # Set graph-level seed. - if config.get("seed") is not None: - with self.sess.as_default(): - tf1.set_random_seed(config["seed"]) - - self.inputs = tf1.placeholder( - tf.float32, [None] + list(self.preprocessor.shape) - ) - else: - if not tf1.executing_eagerly(): - tf1.enable_eager_execution() - self.sess = self.inputs = None - if config.get("seed") is not None: - # Non-static-graph TF. - if config.get("framework") == "tf2": - # Tf1.x. - if tf1: - tf1.set_random_seed(config["seed"]) - else: - tf.random.set_seed(config["seed"]) - - # Policy network. - self.dist_class, dist_dim = ModelCatalog.get_action_dist( - self.action_space, self.config["model"], dist_type="deterministic" - ) - - self.model = ModelCatalog.get_model_v2( - obs_space=self.preprocessor.observation_space, - action_space=self.action_space, - num_outputs=dist_dim, - model_config=self.config["model"], - ) - - self.sampler = None - if self.sess: - dist_inputs, _ = self.model({SampleBatch.CUR_OBS: self.inputs}) - dist = self.dist_class(dist_inputs, self.model) - self.sampler = dist.sample() - self.variables = ray.experimental.tf_utils.TensorFlowVariables( - dist_inputs, self.sess - ) - self.sess.run(tf1.global_variables_initializer()) - else: - self.variables = ray.experimental.tf_utils.TensorFlowVariables( - [], None, self.model.variables() - ) - - self.num_params = sum( - np.prod(variable.shape.as_list()) - for _, variable in self.variables.variables.items() - ) - - def compute_actions(self, obs_batch=None, add_noise=False, update=True, **kwargs): - if "observation" in kwargs: - assert obs_batch is None, ( - "You can not use both arguments, " - "`observation` and `obs_batch`. `observation` " - "is deprecated." - ) - deprecation_warning( - old="ARSTFPolicy.compute_actions(observation=...)`", - new="ARSTFPolicy.compute_actions(obs_batch=...)", - ) - obs_batch = kwargs["observation"] - else: - assert obs_batch is not None - # Squeeze batch dimension (we always calculate actions for only a - # single obs). - observation = obs_batch[0] - observation = self.preprocessor.transform(observation) - observation = self.observation_filter(observation[None], update=update) - - # `actions` is a list of (component) batches. - # Eager mode. 
- if not self.sess: - dist_inputs, _ = self.model({SampleBatch.CUR_OBS: observation}) - dist = self.dist_class(dist_inputs, self.model) - actions = dist.sample() - actions = tree.map_structure(lambda a: a.numpy(), actions) - # Graph mode. - else: - actions = self.sess.run(self.sampler, feed_dict={self.inputs: observation}) - - actions = unbatch(actions) - if add_noise and isinstance(self.action_space, gym.spaces.Box): - actions += np.random.randn(*actions.shape) * self.action_noise_std - return actions, [], {} - - def compute_single_action( - self, observation, add_noise=False, update=True, **kwargs - ): - action, state_outs, extra_fetches = self.compute_actions( - [observation], add_noise=add_noise, update=update, **kwargs - ) - return action[0], state_outs, extra_fetches - - def get_state(self): - return {"state": self.get_flat_weights()} - - def set_state(self, state): - return self.set_flat_weights(state["state"]) - - def set_flat_weights(self, x): - self.variables.set_flat(x) - - def get_flat_weights(self): - return self.variables.get_flat() diff --git a/rllib_contrib/ars/src/rllib_ars/ars/ars_torch_policy.py b/rllib_contrib/ars/src/rllib_ars/ars/ars_torch_policy.py deleted file mode 100644 index b5d6497894f11..0000000000000 --- a/rllib_contrib/ars/src/rllib_ars/ars/ars_torch_policy.py +++ /dev/null @@ -1,20 +0,0 @@ -# Code in this file is adapted from: -# https://github.com/openai/evolution-strategies-starter. - -import ray -from ray.rllib.algorithms.es.es_torch_policy import ( - after_init, - before_init, - make_model_and_action_dist, -) -from ray.rllib.policy.policy_template import build_policy_class - -ARSTorchPolicy = build_policy_class( - name="ARSTorchPolicy", - framework="torch", - loss_fn=None, - get_default_config=lambda: ray.rllib.algorithms.ars.ars.ARSConfig(), - before_init=before_init, - after_init=after_init, - make_model_and_action_dist=make_model_and_action_dist, -) diff --git a/rllib_contrib/ars/tests/test_ars.py b/rllib_contrib/ars/tests/test_ars.py deleted file mode 100644 index b4ffaebf43358..0000000000000 --- a/rllib_contrib/ars/tests/test_ars.py +++ /dev/null @@ -1,51 +0,0 @@ -import unittest - -from rllib_ars.ars.ars import ARSConfig - -import ray -from ray.rllib.utils.test_utils import check_compute_single_action, framework_iterator - - -class TestARS(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_ars_compilation(self): - """Test whether an ARSAlgorithm can be built on all frameworks.""" - config = ARSConfig() - - # Keep it simple. - config.training( - model={ - "fcnet_hiddens": [10], - "fcnet_activation": None, - }, - noise_size=2500000, - ) - # Test eval workers ("normal" WorkerSet, unlike ARS' list of - # RolloutWorkers used for collecting train batches). 
- config.evaluation(evaluation_interval=1, evaluation_num_workers=1) - - num_iterations = 2 - - for _ in framework_iterator(config): - algo = config.build(env="CartPole-v1") - for i in range(num_iterations): - results = algo.train() - print(results) - - check_compute_single_action(algo) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/ars/tuned_examples/__init__.py b/rllib_contrib/ars/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/ars/tuned_examples/cartpole-ars.yaml b/rllib_contrib/ars/tuned_examples/cartpole-ars.yaml deleted file mode 100644 index 53ed748cd0031..0000000000000 --- a/rllib_contrib/ars/tuned_examples/cartpole-ars.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-ars: - env: CartPole-v1 - run: ARS - stop: - sampler_results/episode_reward_mean: 150 - timesteps_total: 1000000 - config: - # Works for both torch and tf. - framework: torch - noise_stdev: 0.02 - num_rollouts: 50 - rollouts_used: 25 - num_workers: 2 - sgd_stepsize: 0.01 - noise_size: 25000000 - eval_prob: 0.5 diff --git a/rllib_contrib/ars/tuned_examples/swimmer-ars.yaml b/rllib_contrib/ars/tuned_examples/swimmer-ars.yaml deleted file mode 100644 index e7cfc87b663f7..0000000000000 --- a/rllib_contrib/ars/tuned_examples/swimmer-ars.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# can expect improvement to -140 reward in ~300-500k timesteps -swimmer-ars: - env: Swimmer-v2 - run: ARS - config: - # Works for both torch and tf. - framework: torch - noise_stdev: 0.01 - num_rollouts: 1 - rollouts_used: 1 - num_workers: 1 - sgd_stepsize: 0.02 - noise_size: 250000000 - eval_prob: 0.2 - offset: 0 - observation_filter: NoFilter - report_length: 3 - model: - fcnet_hiddens: [] # a linear policy diff --git a/rllib_contrib/bandit/BUILD b/rllib_contrib/bandit/BUILD deleted file mode 100644 index 917ec4d6959f3..0000000000000 --- a/rllib_contrib/bandit/BUILD +++ /dev/null @@ -1,54 +0,0 @@ -# Examples - -py_test( - name = "example_bandit_linucb_interest_evolution_recsim", - main = "bandit_linucb_interest_evolution_recsim.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/bandit_linucb_interest_evolution_recsim.py"], - args = ["--run-as-test"] -) - -py_test( - name = "examples/lin_ts_train_wheel_env", - main = "examples/lin_ts_train_wheel_env.py", - tags = ["team:rllib", "examples"], - size = "small", - srcs = ["examples/lin_ts_train_wheel_env.py"], -) - -py_test( - name = "examples/tune_lin_ts_train_wheel_env", - main = "examples/tune_lin_ts_train_wheel_env.py", - tags = ["team:rllib", "examples"], - size = "small", - srcs = ["examples/tune_lin_ts_train_wheel_env.py"], -) - -py_test( - name = "examples/tune_lin_ucb_train_recommendation", - main = "examples/tune_lin_ucb_train_recommendation.py", - tags = ["team:rllib", "examples"], - size = "small", - srcs = ["examples/tune_lin_ucb_train_recommendation.py"], -) - -py_test( - name = "examples/tune_lin_ucb_train_recsim_env", - main = "examples/tune_lin_ucb_train_recsim_env.py", - tags = ["team:rllib", "examples"], - size = "small", - srcs = ["examples/tune_lin_ucb_train_recsim_env.py"], -) - -# Learning Tests - - -# Compilation Tests - -py_test( - name = "test_bandit", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_bandit.py"] -) 
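The ARS tuned examples above (`cartpole-ars.yaml`, `swimmer-ars.yaml`) express the same hyperparameters that the contrib `ARSConfig` exposes in Python. Below is a minimal programmatic sketch of the CartPole setup; it is illustrative, not one of the removed files, and it assumes the YAML keys map one-to-one onto `ARSConfig.training()` kwargs (only `noise_size` is confirmed by `tests/test_ars.py` above).

```python
from rllib_ars.ars.ars import ARSConfig

import ray

if __name__ == "__main__":
    ray.init()

    # Mirror tuned_examples/cartpole-ars.yaml programmatically. The YAML keys
    # (noise_stdev, num_rollouts, ...) are assumed to map 1:1 onto
    # ARSConfig.training() kwargs, as `noise_size` does in tests/test_ars.py.
    config = (
        ARSConfig()
        .environment("CartPole-v1")
        .framework("torch")
        .rollouts(num_rollout_workers=2)
        .training(
            noise_stdev=0.02,
            num_rollouts=50,
            rollouts_used=25,
            sgd_stepsize=0.01,
            noise_size=25000000,
            eval_prob=0.5,
        )
    )

    algo = config.build()
    for _ in range(3):
        result = algo.train()
        print(result["sampler_results"]["episode_reward_mean"])
    algo.stop()
```

Equivalently, `rllib train file tuned_examples/cartpole-ars.yaml` runs the same experiment straight from the YAML, as noted in that file's header comment.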
diff --git a/rllib_contrib/bandit/README.md b/rllib_contrib/bandit/README.md deleted file mode 100644 index 95f7a971bba1b..0000000000000 --- a/rllib_contrib/bandit/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# Contextual Bandits - -The Multi-armed bandit (MAB) problem provides a simplified RL setting that -involves learning to act under one situation only, i.e. the context (observation/state) and arms (actions/items-to-select) are both fixed. -Contextual bandit is an extension of the MAB problem, where at each -round the agent has access not only to a set of bandit arms/actions but also -to a context (state) associated with this iteration. The context changes -with each iteration, but, is not affected by the action that the agent takes. -The objective of the agent is to maximize the cumulative rewards, by -collecting enough information about how the context and the rewards of the -arms are related to each other. The agent does this by balancing the -trade-off between exploration and exploitation. - -Contextual bandit algorithms typically consist of an action-value model (Q -model) and an exploration strategy (epsilon-greedy, LinUCB, Thompson Sampling etc.) - -RLlib supports the following online contextual bandit algorithms, -named after the exploration strategies that they employ: - - -## Linear Upper Confidence Bound (BanditLinUCB) -[paper](http://rob.schapire.net/papers/www10.pdf) - -LinUCB assumes a linear dependency between the expected reward of an action and -its context. It estimates the Q value of each action using ridge regression. -It constructs a confidence region around the weights of the linear -regression model and uses this confidence ellipsoid to estimate the -uncertainty of action values. - - -## Linear Thompson Sampling (BanditLinTS) -[paper](http://proceedings.mlr.press/v28/agrawal13.pdf) - -Like LinUCB, LinTS also assumes a linear dependency between the expected -reward of an action and its context and uses online ridge regression to -estimate the Q values of actions given the context. It assumes a Gaussian -prior on the weights and a Gaussian likelihood function. For deciding which -action to take, the agent samples weights for each arm, using -the posterior distributions, and plays the arm that produces the highest reward. 
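The two paragraphs above describe the same underlying machinery: an online ridge-regression estimate of each arm's reward weights, scored either with an upper confidence bound (LinUCB) or with a posterior sample (LinTS). The toy NumPy sketch below illustrates that idea only; it is not taken from the removed `rllib_bandit` model code, and the class and variable names are made up for this example (only `alpha`/`lambda_` mirror the model-config keys used later in this diff).

```python
import numpy as np


class ToyLinearArm:
    """Per-arm online ridge regression with UCB and Thompson-sampling scoring."""

    def __init__(self, feature_dim, alpha=1.0, lambda_=1.0):
        self.alpha = alpha
        # Precision A = lambda * I + sum_t x_t x_t^T, and f = sum_t r_t * x_t.
        self.precision = lambda_ * np.eye(feature_dim)
        self.f = np.zeros(feature_dim)

    def update(self, x, reward):
        self.precision += np.outer(x, x)
        self.f += reward * x

    def _posterior(self):
        covariance = np.linalg.inv(self.precision)
        theta = covariance @ self.f  # ridge-regression point estimate
        return theta, covariance

    def ucb_score(self, x):
        # LinUCB: point estimate plus a bonus proportional to the width of the
        # confidence ellipsoid in the direction of the context x.
        theta, covariance = self._posterior()
        return x @ theta + self.alpha * np.sqrt(x @ covariance @ x)

    def thompson_score(self, x, rng):
        # LinTS: score with weights sampled from the Gaussian posterior.
        theta, covariance = self._posterior()
        sampled_theta = rng.multivariate_normal(theta, covariance)
        return x @ sampled_theta


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    arms = [ToyLinearArm(feature_dim=2) for _ in range(3)]
    true_thetas = rng.normal(size=(3, 2))
    for _ in range(500):
        context = rng.normal(size=2)
        chosen = int(np.argmax([arm.ucb_score(context) for arm in arms]))
        reward = float(true_thetas[chosen] @ context) + 0.01 * rng.normal()
        arms[chosen].update(context, reward)
    print("learned weights:", [np.round(a._posterior()[0], 2) for a in arms])
```

The removed `bandit_tf_model.py` and `bandit_torch_model.py` further down in this diff implement the same per-arm update (an `OnlineLinearRegression` holding `precision`, `f`, `theta`, and `covariance` buffers), just batched and wrapped as RLlib `ModelV2` classes.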
- -## Installation - -``` -conda create -n rllib-bandit python=3.10 -conda activate rllib-bandit -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage diff --git a/rllib_contrib/bandit/examples/bandit_linucb_interest_evolution_recsim.py b/rllib_contrib/bandit/examples/bandit_linucb_interest_evolution_recsim.py deleted file mode 100644 index b9852013ba1af..0000000000000 --- a/rllib_contrib/bandit/examples/bandit_linucb_interest_evolution_recsim.py +++ /dev/null @@ -1,65 +0,0 @@ -import argparse - -from rllib_bandit.bandit import BanditLinUCB, BanditLinUCBConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init(local_mode=True) - - config = ( - BanditLinUCBConfig() - .framework("torch") - .environment( - "ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv", # noqa - env_config={ - "config": { - # Each step, sample `num_candidates` documents using the - # env-internal - # document sampler model (a logic that creates n documents to select - # the slate from). - "resample_documents": True, - "num_candidates": 100, - # How many documents to recommend (out of `num_candidates`) each - # timestep - "slate_size": 2, - "convert_to_discrete_action_space": True, - "wrap_for_bandits": True, - } - }, - ) - .reporting(metrics_num_episodes_for_smoothing=500) - .debugging(seed=0) - ) - - stop_reward = 180 - - tuner = tune.Tuner( - BanditLinUCB, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 1500000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/bandit/examples/lin_ts_train_wheel_env.py b/rllib_contrib/bandit/examples/lin_ts_train_wheel_env.py deleted file mode 100644 index e40a57326829a..0000000000000 --- a/rllib_contrib/bandit/examples/lin_ts_train_wheel_env.py +++ /dev/null @@ -1,69 +0,0 @@ -""" Example of using Linear Thompson Sampling on WheelBandit environment. - For more information on WheelBandit, see https://arxiv.org/abs/1802.09127 . 
-""" - -import argparse - -import numpy as np -from matplotlib import pyplot as plt -from rllib_bandit.bandit import BanditLinTSConfig - -from ray.rllib.examples.env.bandit_envs_discrete import WheelBanditEnv -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO - - -def plot_model_weights(means, covs): - fmts = ["bo", "ro", "yx", "k+", "gx"] - labels = ["arm{}".format(i) for i in range(5)] - - fig, ax = plt.subplots(figsize=(6, 4)) - - ax.set_title("Weights distributions of arms") - - for i in range(0, 5): - x, y = np.random.multivariate_normal(means[i] / 30, covs[i], 5000).T - ax.plot(x, y, fmts[i], label=labels[i]) - - ax.grid(True, which="both") - ax.axhline(y=0, color="k") - ax.axvline(x=0, color="k") - ax.legend(loc="best") - plt.show() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--framework", - choices=["tf2", "torch"], - default="torch", - help="The DL framework specifier.", - ) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - - num_iter = 10 - print("Running training for %s time steps" % num_iter) - config = BanditLinTSConfig().environment(WheelBanditEnv).framework(args.framework) - algo = config.build() - - policy = algo.get_policy() - model = policy.model - - print("Using exploration strategy:", policy.exploration) - print("Using model:", model) - - for i in range(num_iter): - algo.train() - - info = algo.train() - print(info["info"][LEARNER_INFO]) - - # Get model parameters - means = [model.arms[i].theta.numpy() for i in range(5)] - covs = [model.arms[i].covariance.numpy() for i in range(5)] - - # Plot weight distributions for different arms - plot_model_weights(means, covs) - - algo.stop() diff --git a/rllib_contrib/bandit/examples/tune_lin_ts_train_wheel_env.py b/rllib_contrib/bandit/examples/tune_lin_ts_train_wheel_env.py deleted file mode 100644 index e36d68a817577..0000000000000 --- a/rllib_contrib/bandit/examples/tune_lin_ts_train_wheel_env.py +++ /dev/null @@ -1,101 +0,0 @@ -""" Example of using Linear Thompson Sampling on WheelBandit environment. - For more information on WheelBandit, see https://arxiv.org/abs/1802.09127 . 
-""" - -import argparse -import time - -import numpy as np -from matplotlib import pyplot as plt -from rllib_bandit.bandit import BanditLinTSConfig - -import ray -from ray import air, tune -from ray.rllib.examples.env.bandit_envs_discrete import WheelBanditEnv - - -def plot_model_weights(means, covs, ax): - fmts = ["bo", "ro", "yx", "k+", "gx"] - labels = ["arm{}".format(i) for i in range(5)] - - ax.set_title("Weights distributions of arms") - - for i in range(0, 5): - x, y = np.random.multivariate_normal(means[i] / 30, covs[i], 5000).T - ax.plot(x, y, fmts[i], label=labels[i]) - - ax.set_aspect("equal") - ax.grid(True, which="both") - ax.axhline(y=0, color="k") - ax.axvline(x=0, color="k") - ax.legend(loc="best") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--framework", - choices=["tf2", "torch"], - default="torch", - help="The DL framework specifier.", - ) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - - ray.init(num_cpus=2) - - config = BanditLinTSConfig().environment(WheelBanditEnv).framework(args.framework) - - # Actual env steps per `train()` call will be - # 10 * `min_sample_timesteps_per_iteration` (100 by default) = 1,000 - training_iterations = 10 - - print("Running training for %s time steps" % training_iterations) - - start_time = time.time() - tuner = tune.Tuner( - "BanditLinTS", - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={"training_iteration": training_iterations}, - checkpoint_config=air.CheckpointConfig( - checkpoint_at_end=True, - ), - ), - tune_config=tune.TuneConfig( - num_samples=1, - ), - ) - results = tuner.fit() - - print("The trials took", time.time() - start_time, "seconds\n") - - # Analyze cumulative regrets of the trials. - # There is only one trial - result = results.get_best_result() - x = result.metrics_dataframe.groupby("agent_timesteps_total")[ - "episode_reward_mean" - ].aggregate(["mean", "max", "min", "std"]) - - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4)) - - ax1.plot(x["mean"]) - - ax1.set_title("Episode reward mean") - ax1.set_xlabel("Training steps") - - # Restore Algorithm from checkpoint - checkpoint = results.get_best_result().checkpoint - algo = config.build() - with checkpoint.as_directory() as checkpoint_dir: - algo.restore(checkpoint_dir) - - # Get model to plot arm weights distribution - model = algo.get_policy().model - means = [model.arms[i].theta.numpy() for i in range(5)] - covs = [model.arms[i].covariance.numpy() for i in range(5)] - - # Plot weight distributions for different arms - plot_model_weights(means, covs, ax2) - fig.tight_layout() - plt.show() diff --git a/rllib_contrib/bandit/examples/tune_lin_ucb_train_recommendation.py b/rllib_contrib/bandit/examples/tune_lin_ucb_train_recommendation.py deleted file mode 100644 index 4f4984d652738..0000000000000 --- a/rllib_contrib/bandit/examples/tune_lin_ucb_train_recommendation.py +++ /dev/null @@ -1,108 +0,0 @@ -""" Example of using LinUCB on a recommendation environment with parametric - actions. 
""" - -import argparse -import os -import time - -import pandas as pd -from matplotlib import pyplot as plt -from rllib_bandit.bandit import BanditLinUCBConfig - -import ray -from ray import air, tune -from ray.rllib.env.wrappers.recsim import ( - MultiDiscreteToDiscreteActionWrapper, - RecSimObservationBanditWrapper, -) -from ray.rllib.examples.env.bandit_envs_recommender_system import ParametricRecSys -from ray.tune import register_env - -# Because ParametricRecSys follows RecSim's API, we have to wrap it before -# it can work with our Bandits agent. -register_env( - "ParametricRecSysEnv", - lambda cfg: MultiDiscreteToDiscreteActionWrapper( - RecSimObservationBanditWrapper(ParametricRecSys(**cfg)) - ), -) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--framework", - choices=["tf2", "torch"], - default="torch", - help="The DL framework specifier.", - ) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - - # Temp fix to avoid OMP conflict. - os.environ["KMP_DUPLICATE_LIB_OK"] = "True" - - ray.init() - - config = ( - BanditLinUCBConfig() - .environment( - "ParametricRecSysEnv", - env_config={ - "embedding_size": 20, - "num_docs_to_select_from": 10, - "slate_size": 1, - "num_docs_in_db": 100, - "num_users_in_db": 1, - "user_time_budget": 1.0, - }, - ) - .framework(args.framework) - # Test with batched inference. - .rollouts(num_envs_per_worker=2) - .evaluation( - evaluation_interval=20, - evaluation_duration=100, - evaluation_duration_unit="episodes", - ) - ) - config.simple_optimizer = True - - # Actual env timesteps per `train()` call will be - # 10 * min_sample_timesteps_per_iteration (100 by default) = 1,000. - training_iterations = 10 - - print("Running training for %s time steps" % training_iterations) - - start_time = time.time() - tuner = tune.Tuner( - "BanditLinUCB", - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={"training_iteration": training_iterations}, - checkpoint_config=air.CheckpointConfig( - checkpoint_at_end=False, - ), - ), - tune_config=tune.TuneConfig( - num_samples=2, - ), - ) - results = tuner.fit() - - print("The trials took", time.time() - start_time, "seconds\n") - - # Analyze cumulative regrets of the trials - frame = pd.DataFrame() - for result in results: - frame = pd.concat([frame, result.metrics_dataframe], ignore_index=True) - x = frame.groupby("agent_timesteps_total")["episode_reward_mean"].aggregate( - ["mean", "max", "min", "std"] - ) - - plt.plot(x["mean"]) - plt.fill_between( - x.index, x["mean"] - x["std"], x["mean"] + x["std"], color="b", alpha=0.2 - ) - plt.title("Episode reward mean") - plt.xlabel("Training steps") - plt.show() diff --git a/rllib_contrib/bandit/examples/tune_lin_ucb_train_recsim_env.py b/rllib_contrib/bandit/examples/tune_lin_ucb_train_recsim_env.py deleted file mode 100644 index 51dcf42546a49..0000000000000 --- a/rllib_contrib/bandit/examples/tune_lin_ucb_train_recsim_env.py +++ /dev/null @@ -1,84 +0,0 @@ -"""Example of using LinUCB on a RecSim environment. 
""" - -import argparse -import time - -import pandas as pd -from matplotlib import pyplot as plt -from rllib_bandit.bandit import BanditLinUCBConfig - -import ray.rllib.examples.env.recommender_system_envs_with_recsim # noqa -from ray import air, tune - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--framework", - choices=["tf2", "torch"], - default="torch", - help="The DL framework specifier.", - ) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - - ray.init() - - config = ( - BanditLinUCBConfig() - # "RecSim-v1" is a pre-registered RecSim env. - # Alternatively, you can do: - # `from ray.rllib.examples.env.recommender_system_envs_with_recsim import ...` - # - LongTermSatisfactionRecSimEnv - # - InterestExplorationRecSimEnv - # - InterestEvolutionRecSimEnv - # Then: "env": [the imported RecSim class] - .environment( - "RecSim-v1", - env_config={ - "num_candidates": 10, - "slate_size": 1, - "convert_to_discrete_action_space": True, - "wrap_for_bandits": True, - }, - ).framework(args.framework) - ) - - # Actual env timesteps per `train()` call will be - # 10 * min_sample_timesteps_per_iteration (100 by default) = 1000 - training_iterations = 10 - - print("Running training for %s time steps" % training_iterations) - - start_time = time.time() - tuner = tune.Tuner( - "BanditLinUCB", - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={"training_iteration": training_iterations}, - checkpoint_config=air.CheckpointConfig( - checkpoint_at_end=False, - ), - ), - tune_config=tune.TuneConfig( - num_samples=1, - ), - ) - results = tuner.fit() - - print("The trials took", time.time() - start_time, "seconds\n") - - # Analyze cumulative regrets of the trials - frame = pd.DataFrame() - for result in results: - frame = pd.concat([frame, result.metrics_dataframe], ignore_index=True) - x = frame.groupby("agent_timesteps_total")["episode_reward_mean"].aggregate( - ["mean", "max", "min", "std"] - ) - - plt.plot(x["mean"]) - plt.fill_between( - x.index, x["mean"] - x["std"], x["mean"] + x["std"], color="b", alpha=0.2 - ) - plt.title("Episode reward mean") - plt.xlabel("Training steps") - plt.show() diff --git a/rllib_contrib/bandit/pyproject.toml b/rllib_contrib/bandit/pyproject.toml deleted file mode 100644 index bfe830e203934..0000000000000 --- a/rllib_contrib/bandit/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-bandit" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium", "ray[rllib]==2.5.0", "tensorflow-probability==0.20.1", "recsim"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.13.0", "torch==1.12.0", "matplotlib", "pandas"] diff --git a/rllib_contrib/bandit/requirements.txt b/rllib_contrib/bandit/requirements.txt deleted file mode 100644 index e934e21d8edeb..0000000000000 --- a/rllib_contrib/bandit/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -tensorflow==2.13.0 -torch==1.12.0 -tensorflow-probability==0.20.1 diff --git a/rllib_contrib/bandit/src/rllib_bandit/bandit/__init__.py b/rllib_contrib/bandit/src/rllib_bandit/bandit/__init__.py deleted file mode 100644 index 80244a7a2c57f..0000000000000 --- a/rllib_contrib/bandit/src/rllib_bandit/bandit/__init__.py +++ /dev/null @@ -1,13 
+0,0 @@ -from rllib_bandit.bandit.bandit import ( - BanditLinTS, - BanditLinTSConfig, - BanditLinUCB, - BanditLinUCBConfig, -) - -from ray.tune.registry import register_trainable - -__all__ = ["BanditLinTS", "BanditLinUCB", "BanditLinTSConfig", "BanditLinUCBConfig"] - -register_trainable("rllib-contrib-bandit-lin-ts", BanditLinTS) -register_trainable("rllib-contrib-bandit-lin-ucb", BanditLinUCB) diff --git a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit.py b/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit.py deleted file mode 100644 index e0d3f0d993011..0000000000000 --- a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit.py +++ /dev/null @@ -1,123 +0,0 @@ -import logging -from typing import Optional, Type, Union - -from rllib_bandit.bandit.bandit_tf_policy import BanditTFPolicy -from rllib_bandit.bandit.bandit_torch_policy import BanditTorchPolicy - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.policy.policy import Policy -from ray.rllib.utils.annotations import override - -logger = logging.getLogger(__name__) - - -class BanditConfig(AlgorithmConfig): - """Defines a contextual bandit configuration class from which - a contexual bandit algorithm can be built. Note this config is shared - between BanditLinUCB and BanditLinTS. You likely - want to use the child classes BanditLinTSConfig or BanditLinUCBConfig - instead. - """ - - def __init__(self, algo_class: Union["BanditLinTS", "BanditLinUCB"] = None): - super().__init__(algo_class=algo_class) - # fmt: off - # __sphinx_doc_begin__ - # Override some of AlgorithmConfig's default values with bandit-specific values. - self.framework_str = "torch" - self.rollout_fragment_length = 1 - self.train_batch_size = 1 - # Make sure, a `train()` call performs at least 100 env sampling - # timesteps, before reporting results. Not setting this (default is 0) - # would significantly slow down the Bandit Algorithm. - self.min_sample_timesteps_per_iteration = 100 - # __sphinx_doc_end__ - # fmt: on - - -class BanditLinTSConfig(BanditConfig): - """Defines a configuration class from which a Thompson-sampling bandit can be built. - - Example: - >>> from ray.rllib.algorithms.bandit import BanditLinTSConfig # doctest: +SKIP - >>> from ray.rllib.examples.env.bandit_envs_discrete import WheelBanditEnv - >>> config = BanditLinTSConfig().rollouts(num_rollout_workers=4)# doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env=WheelBanditEnv) # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - """ - - def __init__(self): - super().__init__(algo_class=BanditLinTS) - # fmt: off - # __sphinx_doc_begin__ - # Override some of AlgorithmConfig's default values with bandit-specific values. - self.exploration_config = {"type": "ThompsonSampling"} - # __sphinx_doc_end__ - # fmt: on - - -class BanditLinUCBConfig(BanditConfig): - """Defines a config class from which an upper confidence bound bandit can be built. - - Example: - >>> from ray.rllib.algorithms.bandit import BanditLinUCBConfig# doctest: +SKIP - >>> from ray.rllib.examples.env.bandit_envs_discrete import WheelBanditEnv - >>> config = BanditLinUCBConfig() # doctest: +SKIP - >>> config = config.rollouts(num_rollout_workers=4) # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. 
- >>> algo = config.build(env=WheelBanditEnv) # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - """ - - def __init__(self): - super().__init__(algo_class=BanditLinUCB) - # fmt: off - # __sphinx_doc_begin__ - # Override some of AlgorithmConfig's default values with bandit-specific values. - self.exploration_config = {"type": "UpperConfidenceBound"} - # __sphinx_doc_end__ - # fmt: on - - -class BanditLinTS(Algorithm): - """Bandit Algorithm using ThompsonSampling exploration.""" - - @classmethod - @override(Algorithm) - def get_default_config(cls) -> BanditLinTSConfig: - return BanditLinTSConfig() - - @classmethod - @override(Algorithm) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - return BanditTorchPolicy - elif config["framework"] == "tf2": - return BanditTFPolicy - else: - raise NotImplementedError("Only `framework=[torch|tf2]` supported!") - - -class BanditLinUCB(Algorithm): - @classmethod - @override(Algorithm) - def get_default_config(cls) -> BanditLinUCBConfig: - return BanditLinUCBConfig() - - @classmethod - @override(Algorithm) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - return BanditTorchPolicy - elif config["framework"] == "tf2": - return BanditTFPolicy - else: - raise NotImplementedError("Only `framework=[torch|tf2]` supported!") diff --git a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_tf_model.py b/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_tf_model.py deleted file mode 100644 index fe42758752f53..0000000000000 --- a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_tf_model.py +++ /dev/null @@ -1,282 +0,0 @@ -import gymnasium as gym - -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf, try_import_tfp -from ray.rllib.utils.typing import TensorType - -tf1, tf, tfv = try_import_tf() -tfp = try_import_tfp() - - -class OnlineLinearRegression(tf.Module if tf else object): - def __init__(self, feature_dim, alpha=1, lambda_=1): - super(OnlineLinearRegression, self).__init__() - - self.d = feature_dim - self.delta_f = tf.zeros(self.d) - self.delta_b = tf.zeros((self.d, self.d)) - self.update_schedule = 1 - self.time = 0 - self.alpha = alpha - self.precision = tf.Variable( - initial_value=lambda_ * tf.eye(self.d), name="precision" - ) - self.f = tf.Variable(initial_value=tf.zeros(self.d), name="f") - self.covariance = tf.Variable( - initial_value=tf.linalg.inv(self.precision), name="covariance" - ) - self.theta = tf.Variable( - initial_value=tf.linalg.matvec(self.covariance, self.f), name="theta" - ) - - self._init_params() - self.dist = self._make_dist() - - def _init_params(self): - self.covariance.assign(self.covariance * self.alpha) - - def _make_dist(self): - """Create a multivariate normal distribution with the current parameters""" - dist = tfp.distributions.MultivariateNormalTriL( - self.theta, scale_tril=tf.linalg.cholesky(self.covariance) - ) - return dist - - def partial_fit(self, x, y): - x, y = self._check_inputs(x, y) - x = tf.squeeze(x, axis=0) - y = y[0] - self.time += 1 - self.delta_f += tf.cast(y, tf.float32) * x - self.delta_b += tf.tensordot(x, x, axes=0) - # Can follow an update schedule if not doing sherman morison updates - if self.time % self.update_schedule == 0: - self.precision.assign_add(self.delta_b) - 
self.f.assign_add(self.delta_f) - self.delta_f = tf.zeros(self.d) - self.delta_b = tf.zeros((self.d, self.d)) - self.covariance.assign(tf.linalg.inv(self.precision)) - self.theta.assign(tf.linalg.matvec(self.covariance, self.f)) - self.covariance.assign(self.covariance * self.alpha) - # the multivariate dist needs to be reconstructed every time - # its parameters are updated.the parameters of the dist do not - # update every time the stored self.covariance and self.theta - # (the mean) are updated. - self.dist = self._make_dist() - - def sample_theta(self): - theta = self.dist.sample() - return theta - - def get_ucbs(self, x: TensorType): - """Calculate upper confidence bounds using covariance matrix according - to algorithm 1: LinUCB - (http://proceedings.mlr.press/v15/chu11a/chu11a.pdf). - - Args: - x: Input feature tensor of shape - (batch_size, [num_items]?, feature_dim) - """ - x = tf.cast(x, dtype=tf.float32) - # Fold batch and num-items dimensions into one dim. - if len(x.shape) == 3: - B, C, F = x.shape - x_folded_batch = tf.reshape(x, [-1, F]) - # Only batch and feature dims. - else: - x_folded_batch = x - - projections = tf.linalg.matmul( - a=self.covariance, b=x_folded_batch, transpose_b=True - ) - batch_dots = tf.math.reduce_sum( - x_folded_batch * tf.transpose(projections), axis=-1 - ) - batch_dots = tf.math.sqrt(batch_dots) - - # Restore original B and C dimensions. - if len(x.shape) == 3: - batch_dots = tf.reshape(batch_dots, [B, C]) - return batch_dots - - def __call__(self, x: TensorType, sample_theta=False): - """Predict scores on input batch using the underlying linear model. - - Args: - x: Input feature tensor of shape - (batch_size, feature_dim) - sample_theta: Whether to sample the weights from its - posterior distribution to perform Thompson Sampling as per - http://proceedings.mlr.press/v28/agrawal13.pdf . - """ - x = tf.cast(x, dtype=tf.float32) - x = self._check_inputs(x) - theta = self.sample_theta() if sample_theta else self.theta - scores = tf.linalg.matvec(x, theta) - return scores - - def _check_inputs(self, x, y=None): - assert len(x.shape) in [2, 3], ( - "Input context tensor must be 2 (no batch) or 3 dimensional (where the" - " first dimension is the batch size)." - ) - assert x.shape[-1] == self.d, ( - "Feature dimensions of weights ({}) and context ({}) do not " - "match!".format(self.d, x.shape[-1]) - ) - if y is not None: - assert tf.is_tensor(y), f"ERROR: Target should be a tensor, but is {y}!" 
- return x if y is None else (x, y) - - -class DiscreteLinearModel(TFModelV2): - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - TFModelV2.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - - alpha = model_config.get("alpha", 1) - lambda_ = model_config.get("lambda_", 1) - self.feature_dim = obs_space.sample().size - self.arms = [ - OnlineLinearRegression( - feature_dim=self.feature_dim, alpha=alpha, lambda_=lambda_ - ) - for i in range(self.num_outputs) - ] - self._cur_value = None - self._cur_ctx = None - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"] - scores = self.predict(x) - return scores, state - - def predict(self, x, sample_theta=False, use_ucb=False): - self._cur_ctx = x - scores = tf.stack( - [self.arms[i](x, sample_theta) for i in range(self.num_outputs)], axis=-1 - ) - if use_ucb: - ucbs = tf.stack( - [self.arms[i].get_ucbs(x) for i in range(self.num_outputs)], axis=-1 - ) - scores += scores + ucbs - self._cur_value = scores - return scores - - def partial_fit(self, x, y, arms): - for i, arm in enumerate(arms): - assert ( - 0 <= arm < len(self.arms) - ), "Invalid arm: {}. It should be 0 <= arm < {}".format(arm, len(self.arms)) - xi = tf.expand_dims(x[i], axis=0) - yi = tf.expand_dims(y[i], axis=0) - self.arms[arm].partial_fit(xi, yi) - - @override(ModelV2) - def value_function(self): - assert self._cur_value is not None, "must call forward() first" - return self._cur_value - - def current_obs(self): - assert self._cur_ctx is not None, "must call forward() first" - return self._cur_ctx - - -class DiscreteLinearModelUCB(DiscreteLinearModel): - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"] - scores = super(DiscreteLinearModelUCB, self).predict( - x, sample_theta=False, use_ucb=True - ) - return scores, state - - -class DiscreteLinearModelThompsonSampling(DiscreteLinearModel): - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"] - scores = super(DiscreteLinearModelThompsonSampling, self).predict( - x, sample_theta=True, use_ucb=False - ) - return scores, state - - -class ParametricLinearModel(TFModelV2): - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - TFModelV2.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - - alpha = model_config.get("alpha", 1) - lambda_ = model_config.get("lambda_", 0.1) - - # RLlib preprocessors will flatten the observation space and unflatten - # it later. Accessing the original space here. - original_space = obs_space.original_space - assert ( - isinstance(original_space, gym.spaces.Dict) - and "item" in original_space.spaces - ), "This model only supports gym.spaces.Dict observation spaces." - self.feature_dim = original_space["item"].shape[-1] - self.arm = OnlineLinearRegression( - feature_dim=self.feature_dim, alpha=alpha, lambda_=lambda_ - ) - self._cur_value = None - self._cur_ctx = None - - def _check_inputs(self, x): - assert ( - len(x.shape) == 3 - ), f"ERROR: Inputs ({x}) must have 3 dimensions (B x num-items x features)." 
- return x - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"]["item"] - x = self._check_inputs(x) - scores = self.predict(x) - return scores, state - - def predict(self, x, sample_theta=False, use_ucb=False): - self._cur_ctx = x - scores = self.arm(x, sample_theta) - if use_ucb: - scores += 0.3 * self.arm.get_ucbs(x) - self._cur_value = scores - return scores - - def partial_fit(self, x, y, arms): - x = x["item"] - for i, arm in enumerate(arms): - xi = tf.expand_dims(x[i, arm], axis=0) - yi = tf.expand_dims(y[i], axis=0) - self.arm.partial_fit(xi, yi) - - @override(ModelV2) - def value_function(self): - assert self._cur_value is not None, "Must call `forward()` first." - return self._cur_value - - def current_obs(self): - assert self._cur_ctx is not None, "Must call `forward()` first." - return self._cur_ctx - - -class ParametricLinearModelUCB(ParametricLinearModel): - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"]["item"] - x = self._check_inputs(x) - scores = super().predict(x, sample_theta=False, use_ucb=True) - return scores, state - - -class ParametricLinearModelThompsonSampling(ParametricLinearModel): - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"]["item"] - x = self._check_inputs(x) - scores = super().predict(x, sample_theta=True, use_ucb=False) - return scores, state diff --git a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_tf_policy.py b/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_tf_policy.py deleted file mode 100644 index 089aa6ae341cb..0000000000000 --- a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_tf_policy.py +++ /dev/null @@ -1,159 +0,0 @@ -import logging -import time -from typing import Dict - -import gymnasium as gym -from gymnasium import spaces -from rllib_bandit.bandit.bandit_tf_model import ( - DiscreteLinearModel, - DiscreteLinearModelThompsonSampling, - DiscreteLinearModelUCB, - ParametricLinearModelThompsonSampling, - ParametricLinearModelUCB, -) - -import ray -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import restore_original_dimensions -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_policy_template import build_tf_policy -from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY -from ray.rllib.utils.tf_utils import make_tf_callable -from ray.rllib.utils.typing import AlgorithmConfigDict, TensorType -from ray.util.debug import log_once - -logger = logging.getLogger(__name__) - - -class BanditPolicyOverrides: - def __init__(self): - @make_tf_callable(self.get_session()) - def learn_on_batch(postprocessed_batch) -> Dict[str, TensorType]: - # INFOS dict can't be converted to Tensor with the interceptor. 
- postprocessed_batch.set_get_interceptor(None) - - unflattened_obs = restore_original_dimensions( - postprocessed_batch[SampleBatch.CUR_OBS], - self.observation_space, - self.framework, - ) - - info = {} - - start = time.time() - self.model.partial_fit( - unflattened_obs, - postprocessed_batch[SampleBatch.REWARDS], - postprocessed_batch[SampleBatch.ACTIONS], - ) - - infos = postprocessed_batch[SampleBatch.INFOS] - if "regret" in infos[0]: - regret = sum( - row["infos"]["regret"] for row in postprocessed_batch.rows() - ) - self.regrets.append(regret) - info["cumulative_regret"] = sum(self.regrets) - else: - if log_once("no_regrets"): - logger.warning( - "The env did not report `regret` values in " - "its `info` return, ignoring." - ) - info["update_latency"] = time.time() - start - return {LEARNER_STATS_KEY: info} - - self.learn_on_batch = learn_on_batch - - -def validate_spaces( - policy: Policy, - observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> None: - """Validates the observation- and action spaces used for the Policy. - - Args: - policy: The policy, whose spaces are being validated. - observation_space: The observation space to validate. - action_space: The action space to validate. - config: The Policy's config dict. - - Raises: - UnsupportedSpaceException: If one of the spaces is not supported. - """ - # Only support single Box or single Discrete spaces. - if not isinstance(action_space, gym.spaces.Discrete): - msg = ( - f"Action space ({action_space}) of {policy} is not supported for " - f"Bandit algorithms. Must be `Discrete`." - ) - # Hint at using the MultiDiscrete to Discrete wrapper for Bandits. - if isinstance(action_space, gym.spaces.MultiDiscrete): - msg += ( - " Try to wrap your environment with the " - "`ray.rllib.env.wrappers.recsim::" - "MultiDiscreteToDiscreteActionWrapper` class: `tune.register_env(" - "[some str], lambda ctx: MultiDiscreteToDiscreteActionWrapper(" - "[your gym env])); config = {'env': [some str]}`" - ) - raise UnsupportedSpaceException(msg) - - -def make_model(policy, obs_space, action_space, config): - _, logit_dim = ModelCatalog.get_action_dist( - action_space, config["model"], framework="tf" - ) - - model_cls = DiscreteLinearModel - - if hasattr(obs_space, "original_space"): - original_space = obs_space.original_space - else: - original_space = obs_space - - exploration_config = config.get("exploration_config") - # Model is dependent on exploration strategy because of its implicitness - - # TODO: Have a separate model catalogue for bandits - if exploration_config: - if exploration_config["type"] == "ThompsonSampling": - if isinstance(original_space, spaces.Dict): - assert ( - "item" in original_space.spaces - ), "Cannot find 'item' key in observation space" - model_cls = ParametricLinearModelThompsonSampling - else: - model_cls = DiscreteLinearModelThompsonSampling - elif exploration_config["type"] == "UpperConfidenceBound": - if isinstance(original_space, spaces.Dict): - assert ( - "item" in original_space.spaces - ), "Cannot find 'item' key in observation space" - model_cls = ParametricLinearModelUCB - else: - model_cls = DiscreteLinearModelUCB - - model = model_cls( - obs_space, action_space, logit_dim, config["model"], name="LinearModel" - ) - return model - - -def after_init(policy, *args): - policy.regrets = [] - BanditPolicyOverrides.__init__(policy) - - -BanditTFPolicy = build_tf_policy( - name="BanditTFPolicy", - get_default_config=lambda: 
ray.rllib.algorithms.bandit.bandit.BanditConfig(), - validate_spaces=validate_spaces, - make_model=make_model, - loss_fn=None, - mixins=[BanditPolicyOverrides], - after_init=after_init, -) diff --git a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_torch_model.py b/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_torch_model.py deleted file mode 100644 index 5e3b81e1ff8bb..0000000000000 --- a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_torch_model.py +++ /dev/null @@ -1,288 +0,0 @@ -import gymnasium as gym - -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import TensorType - -torch, nn = try_import_torch() - - -class OnlineLinearRegression(nn.Module): - def __init__(self, feature_dim, alpha=1, lambda_=1): - super(OnlineLinearRegression, self).__init__() - self.d = feature_dim - self.alpha = alpha - # Diagonal matrix of size d (feature_dim). - # If lambda=1.0, this will be an identity matrix. - self.precision = nn.Parameter( - data=lambda_ * torch.eye(self.d), requires_grad=False - ) - # Inverse of the above diagnoal. If lambda=1.0, this is also an - # identity matrix. - self.covariance = nn.Parameter( - data=torch.inverse(self.precision), requires_grad=False - ) - # All-0s vector of size d (feature_dim). - self.f = nn.Parameter( - data=torch.zeros( - self.d, - ), - requires_grad=False, - ) - # Dot product between f and covariance matrix - # (batch dim stays intact; reduce dim 1). - self.theta = nn.Parameter( - data=self.covariance.matmul(self.f), requires_grad=False - ) - self._init_params() - self.dist = self._make_dist() - - def _init_params(self): - self.update_schedule = 1 - self.delta_f = 0 - self.delta_b = 0 - self.time = 0 - self.covariance.mul_(self.alpha) - - def _make_dist(self): - """Create a multivariate normal distribution from the current parameters.""" - dist = torch.distributions.multivariate_normal.MultivariateNormal( - loc=self.theta, precision_matrix=self.precision - ) - return dist - - def partial_fit(self, x, y): - x, y = self._check_inputs(x, y) - x = x.squeeze(0) - y = y.item() - self.time += 1 - self.delta_f += y * x - self.delta_b += torch.outer(x, x) - # Can follow an update schedule if not doing sherman morison updates - if self.time % self.update_schedule == 0: - self.precision += self.delta_b - self.f += self.delta_f - self.delta_b = 0 - self.delta_f = 0 - self.covariance.data = torch.inverse(self.precision) - self.theta.data = torch.matmul(self.covariance, self.f) - self.covariance.data *= self.alpha - # the multivariate dist needs to be reconstructed every time - # its parameters are updated.the parameters of the dist do not - # update every time the stored self.covariance and self.theta - # (the mean) are updated - self.dist = self._make_dist() - - def sample_theta(self): - theta = self.dist.sample() - return theta - - def get_ucbs(self, x: TensorType): - """Calculate upper confidence bounds using covariance matrix according - to algorithm 1: LinUCB - (http://proceedings.mlr.press/v15/chu11a/chu11a.pdf). - - Args: - x: Input feature tensor of shape - (batch_size, [num_items]?, feature_dim) - """ - # Fold batch and num-items dimensions into one dim. - if len(x.shape) == 3: - B, C, F = x.shape - x_folded_batch = x.reshape([-1, F]) - # Only batch and feature dims. 
- else: - x_folded_batch = x - - projections = self.covariance @ x_folded_batch.T - batch_dots = (x_folded_batch * projections.T).sum(dim=-1) - batch_dots = batch_dots.sqrt() - - # Restore original B and C dimensions. - if len(x.shape) == 3: - batch_dots = batch_dots.reshape([B, C]) - return batch_dots - - def forward(self, x: TensorType, sample_theta: bool = False): - """Predict scores on input batch using the underlying linear model. - - Args: - x: Input feature tensor of shape (batch_size, feature_dim) - sample_theta: Whether to sample the weights from its - posterior distribution to perform Thompson Sampling as per - http://proceedings.mlr.press/v28/agrawal13.pdf . - """ - x = self._check_inputs(x) - theta = self.sample_theta() if sample_theta else self.theta - scores = x @ theta - return scores - - def _check_inputs(self, x, y=None): - assert x.ndim in [2, 3], ( - "Input context tensor must be 2 (no batch) or 3 dimensional (where the" - " first dimension is the batch size)." - ) - assert x.shape[-1] == self.d, ( - "Feature dimensions of weights ({}) and context ({}) do not " - "match!".format(self.d, x.shape[-1]) - ) - if y is not None: - assert torch.is_tensor(y), f"ERROR: Target should be a tensor, but is {y}!" - return x if y is None else (x, y) - - -class DiscreteLinearModel(TorchModelV2, nn.Module): - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - TorchModelV2.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - nn.Module.__init__(self) - - alpha = model_config.get("alpha", 1) - lambda_ = model_config.get("lambda_", 1) - self.feature_dim = obs_space.sample().size - self.arms = nn.ModuleList( - [ - OnlineLinearRegression( - feature_dim=self.feature_dim, alpha=alpha, lambda_=lambda_ - ) - for i in range(self.num_outputs) - ] - ) - self._cur_value = None - self._cur_ctx = None - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"] - scores = self.predict(x) - return scores, state - - def predict(self, x, sample_theta=False, use_ucb=False): - self._cur_ctx = x - scores = torch.stack( - [self.arms[i](x, sample_theta) for i in range(self.num_outputs)], dim=-1 - ) - if use_ucb: - ucbs = torch.stack( - [self.arms[i].get_ucbs(x) for i in range(self.num_outputs)], dim=-1 - ) - scores += ucbs - self._cur_value = scores - return scores - - def partial_fit(self, x, y, arms): - for i, arm in enumerate(arms): - assert ( - 0 <= arm.item() < len(self.arms) - ), "Invalid arm: {}. 
It should be 0 <= arm < {}".format( - arm.item(), len(self.arms) - ) - self.arms[arm].partial_fit(x[[i]], y[[i]]) - - @override(ModelV2) - def value_function(self): - assert self._cur_value is not None, "must call forward() first" - return self._cur_value - - def current_obs(self): - assert self._cur_ctx is not None, "must call forward() first" - return self._cur_ctx - - -class DiscreteLinearModelUCB(DiscreteLinearModel): - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"] - scores = super(DiscreteLinearModelUCB, self).predict( - x, sample_theta=False, use_ucb=True - ) - return scores, state - - -class DiscreteLinearModelThompsonSampling(DiscreteLinearModel): - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"] - scores = super(DiscreteLinearModelThompsonSampling, self).predict( - x, sample_theta=True, use_ucb=False - ) - return scores, state - - -class ParametricLinearModel(TorchModelV2, nn.Module): - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - TorchModelV2.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - nn.Module.__init__(self) - - alpha = model_config.get("alpha", 1) - lambda_ = model_config.get("lambda_", 0.1) - - # RLlib preprocessors will flatten the observation space and unflatten - # it later. Accessing the original space here. - original_space = obs_space.original_space - assert ( - isinstance(original_space, gym.spaces.Dict) - and "item" in original_space.spaces - ), "This model only supports gym.spaces.Dict observation spaces." - self.feature_dim = original_space["item"].shape[-1] - self.arm = OnlineLinearRegression( - feature_dim=self.feature_dim, alpha=alpha, lambda_=lambda_ - ) - self._cur_value = None - self._cur_ctx = None - - def _check_inputs(self, x): - assert ( - x.ndim == 3 - ), f"ERROR: Inputs ({x}) must have 3 dimensions (B x num-items x features)." - return x - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"]["item"] - x = self._check_inputs(x) - scores = self.predict(x) - return scores, state - - def predict(self, x, sample_theta=False, use_ucb=False): - self._cur_ctx = x - scores = self.arm(x, sample_theta) - if use_ucb: - ucbs = self.arm.get_ucbs(x) - scores += 0.3 * ucbs - self._cur_value = scores - return scores - - def partial_fit(self, x, y, arms): - x = x["item"] - for i, arm in enumerate(arms): - action_id = arm.item() - self.arm.partial_fit(x[[i], action_id], y[[i]]) - - @override(ModelV2) - def value_function(self): - assert self._cur_value is not None, "Must call `forward()` first." - return self._cur_value - - def current_obs(self): - assert self._cur_ctx is not None, "Must call `forward()` first." 
- return self._cur_ctx - - -class ParametricLinearModelUCB(ParametricLinearModel): - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"]["item"] - x = self._check_inputs(x) - scores = super().predict(x, sample_theta=False, use_ucb=True) - return scores, state - - -class ParametricLinearModelThompsonSampling(ParametricLinearModel): - def forward(self, input_dict, state, seq_lens): - x = input_dict["obs"]["item"] - x = self._check_inputs(x) - scores = super().predict(x, sample_theta=True, use_ucb=False) - return scores, state diff --git a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_torch_policy.py b/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_torch_policy.py deleted file mode 100644 index a4315f5a417ab..0000000000000 --- a/rllib_contrib/bandit/src/rllib_bandit/bandit/bandit_torch_policy.py +++ /dev/null @@ -1,106 +0,0 @@ -import logging -import time - -from gymnasium import spaces -from rllib_bandit.bandit.bandit_torch_model import ( - DiscreteLinearModel, - DiscreteLinearModelThompsonSampling, - DiscreteLinearModelUCB, - ParametricLinearModelThompsonSampling, - ParametricLinearModelUCB, -) - -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import restore_original_dimensions -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY -from ray.util.debug import log_once - -logger = logging.getLogger(__name__) - - -class BanditTorchPolicy(TorchPolicyV2): - def __init__(self, observation_space, action_space, config): - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config["model"]["max_seq_len"], - ) - self.regrets = [] - - @override(TorchPolicyV2) - def make_model_and_action_dist(self): - dist_class, logit_dim = ModelCatalog.get_action_dist( - self.action_space, self.config["model"], framework="torch" - ) - model_cls = DiscreteLinearModel - - if hasattr(self.observation_space, "original_space"): - original_space = self.observation_space.original_space - else: - original_space = self.observation_space - - exploration_config = self.config.get("exploration_config") - # Model is dependent on exploration strategy because of its implicitness - - # TODO: Have a separate model catalogue for bandits - if exploration_config: - if exploration_config["type"] == "ThompsonSampling": - if isinstance(original_space, spaces.Dict): - assert ( - "item" in original_space.spaces - ), "Cannot find 'item' key in observation space" - model_cls = ParametricLinearModelThompsonSampling - else: - model_cls = DiscreteLinearModelThompsonSampling - elif exploration_config["type"] == "UpperConfidenceBound": - if isinstance(original_space, spaces.Dict): - assert ( - "item" in original_space.spaces - ), "Cannot find 'item' key in observation space" - model_cls = ParametricLinearModelUCB - else: - model_cls = DiscreteLinearModelUCB - - model = model_cls( - self.observation_space, - self.action_space, - logit_dim, - self.config["model"], - name="LinearModel", - ) - return model, dist_class - - @override(TorchPolicyV2) - def learn_on_batch(self, postprocessed_batch): - train_batch = self._lazy_tensor_dict(postprocessed_batch) - unflattened_obs = restore_original_dimensions( - train_batch[SampleBatch.CUR_OBS], self.observation_space, self.framework - ) - - info = {} - - start = time.time() - self.model.partial_fit( - unflattened_obs, - 
train_batch[SampleBatch.REWARDS], - train_batch[SampleBatch.ACTIONS], - ) - - infos = postprocessed_batch["infos"] - if "regret" in infos[0]: - regret = sum(row["infos"]["regret"] for row in postprocessed_batch.rows()) - self.regrets.append(regret) - info["cumulative_regret"] = sum(self.regrets) - else: - if log_once("no_regrets"): - logger.warning( - "The env did not report `regret` values in " - "its `info` return, ignoring." - ) - info["update_latency"] = time.time() - start - return {LEARNER_STATS_KEY: info} diff --git a/rllib_contrib/bandit/src/rllib_bandit/env/__init__.py b/rllib_contrib/bandit/src/rllib_bandit/env/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/bandit/src/rllib_bandit/env/bandit_envs_discrete.py b/rllib_contrib/bandit/src/rllib_bandit/env/bandit_envs_discrete.py deleted file mode 100644 index 1ad32366c6a0f..0000000000000 --- a/rllib_contrib/bandit/src/rllib_bandit/env/bandit_envs_discrete.py +++ /dev/null @@ -1,207 +0,0 @@ -import copy -import random - -import gymnasium as gym -import numpy as np -from gymnasium.spaces import Box, Discrete - - -class SimpleContextualBandit(gym.Env): - """Simple env w/ 2 states and 3 actions (arms): 0, 1, and 2. - - Episodes last only for one timestep, possible observations are: - [-1.0, 1.0] and [1.0, -1.0], where the first element is the "current context". - The highest reward (+10.0) is received for selecting arm 0 for context=1.0 - and arm 2 for context=-1.0. Action 1 always yields 0.0 reward. - """ - - def __init__(self, config=None): - self.action_space = Discrete(3) - self.observation_space = Box(low=-1.0, high=1.0, shape=(2,)) - self.cur_context = None - - def reset(self, *, seed=None, options=None): - self.cur_context = random.choice([-1.0, 1.0]) - return np.array([self.cur_context, -self.cur_context]), {} - - def step(self, action): - rewards_for_context = { - -1.0: [-10, 0, 10], - 1.0: [10, 0, -10], - } - reward = rewards_for_context[self.cur_context][action] - return ( - np.array([-self.cur_context, self.cur_context]), - reward, - True, - False, - {"regret": 10 - reward}, - ) - - -class LinearDiscreteEnv(gym.Env): - """Samples data from linearly parameterized arms. - - The reward for context X and arm i is given by X^T * theta_i, for some - latent set of parameters {theta_i : i = 1, ..., k}. - The thetas are sampled uniformly at random, the contexts are Gaussian, - and Gaussian noise is added to the rewards. 
- """ - - DEFAULT_CONFIG_LINEAR = { - "feature_dim": 8, - "num_actions": 4, - "reward_noise_std": 0.01, - } - - def __init__(self, config=None): - self.config = copy.copy(self.DEFAULT_CONFIG_LINEAR) - if config is not None and type(config) == dict: - self.config.update(config) - - self.feature_dim = self.config["feature_dim"] - self.num_actions = self.config["num_actions"] - self.sigma = self.config["reward_noise_std"] - - self.action_space = Discrete(self.num_actions) - self.observation_space = Box(low=-10, high=10, shape=(self.feature_dim,)) - - self.thetas = np.random.uniform(-1, 1, (self.num_actions, self.feature_dim)) - self.thetas /= np.linalg.norm(self.thetas, axis=1, keepdims=True) - - self._elapsed_steps = 0 - self._current_context = None - - def _sample_context(self): - return np.random.normal(scale=1 / 3, size=(self.feature_dim,)) - - def reset(self, *, seed=None, options=None): - self._current_context = self._sample_context() - return self._current_context, {} - - def step(self, action): - assert ( - self._elapsed_steps is not None - ), "Cannot call env.step() beforecalling reset()" - assert action < self.num_actions, "Invalid action." - - action = int(action) - context = self._current_context - rewards = self.thetas.dot(context) - - opt_action = rewards.argmax() - - regret = rewards.max() - rewards[action] - - # Add Gaussian noise - rewards += np.random.normal(scale=self.sigma, size=rewards.shape) - - reward = rewards[action] - self._current_context = self._sample_context() - return ( - self._current_context, - reward, - True, - False, - {"regret": regret, "opt_action": opt_action}, - ) - - def render(self, mode="human"): - raise NotImplementedError - - -class WheelBanditEnv(gym.Env): - """Wheel bandit environment for 2D contexts - (see https://arxiv.org/abs/1802.09127). 
- """ - - DEFAULT_CONFIG_WHEEL = { - "delta": 0.5, - "mu_1": 1.2, - "mu_2": 1, - "mu_3": 50, - "std": 0.01, - } - - feature_dim = 2 - num_actions = 5 - - def __init__(self, config=None): - self.config = copy.copy(self.DEFAULT_CONFIG_WHEEL) - if config is not None and type(config) == dict: - self.config.update(config) - - self.delta = self.config["delta"] - self.mu_1 = self.config["mu_1"] - self.mu_2 = self.config["mu_2"] - self.mu_3 = self.config["mu_3"] - self.std = self.config["std"] - - self.action_space = Discrete(self.num_actions) - self.observation_space = Box(low=-1, high=1, shape=(self.feature_dim,)) - - self.means = [self.mu_1] + 4 * [self.mu_2] - self._elapsed_steps = 0 - self._current_context = None - - def _sample_context(self): - while True: - state = np.random.uniform(-1, 1, self.feature_dim) - if np.linalg.norm(state) <= 1: - return state - - def reset(self, *, seed=None, options=None): - self._current_context = self._sample_context() - return self._current_context, {} - - def step(self, action): - assert ( - self._elapsed_steps is not None - ), "Cannot call env.step() before calling reset()" - - action = int(action) - self._elapsed_steps += 1 - rewards = [ - np.random.normal(self.means[j], self.std) for j in range(self.num_actions) - ] - context = self._current_context - r_big = np.random.normal(self.mu_3, self.std) - - if np.linalg.norm(context) >= self.delta: - if context[0] > 0: - if context[1] > 0: - # First quadrant - rewards[1] = r_big - opt_action = 1 - else: - # Fourth quadrant - rewards[4] = r_big - opt_action = 4 - else: - if context[1] > 0: - # Second quadrant - rewards[2] = r_big - opt_action = 2 - else: - # Third quadrant - rewards[3] = r_big - opt_action = 3 - else: - # Smaller region where action 0 is optimal - opt_action = 0 - - reward = rewards[action] - - regret = rewards[opt_action] - reward - - self._current_context = self._sample_context() - return ( - self._current_context, - reward, - True, - False, - {"regret": regret, "opt_action": opt_action}, - ) - - def render(self, mode="human"): - raise NotImplementedError diff --git a/rllib_contrib/bandit/src/rllib_bandit/env/bandit_envs_recommender_system.py b/rllib_contrib/bandit/src/rllib_bandit/env/bandit_envs_recommender_system.py deleted file mode 100644 index 38d73c596fe4a..0000000000000 --- a/rllib_contrib/bandit/src/rllib_bandit/env/bandit_envs_recommender_system.py +++ /dev/null @@ -1,228 +0,0 @@ -"""Examples for recommender system simulating envs ready to be used by - RLlib Trainers. - This env follows RecSim obs and action APIs. -""" -from typing import Optional - -import gymnasium as gym -import numpy as np - -from ray.rllib.utils.numpy import softmax - - -class ParametricRecSys(gym.Env): - """A recommendation environment which generates items with visible features - randomly (parametric actions). - The environment can be configured to be multi-user, i.e. different models - will be learned independently for each user, by setting num_users_in_db - parameter. - To enable slate recommendation, the `slate_size` config parameter can be - set as > 1. - """ - - def __init__( - self, - embedding_size: int = 20, - num_docs_to_select_from: int = 10, - slate_size: int = 1, - num_docs_in_db: Optional[int] = None, - num_users_in_db: Optional[int] = None, - user_time_budget: float = 60.0, - ): - """Initializes a ParametricRecSys instance. - - Args: - embedding_size: Embedding size for both users and docs. - Each value in the user/doc embeddings can have values between - -1.0 and 1.0. 
- num_docs_to_select_from: The number of documents to present to the - agent each timestep. The agent will then have to pick a slate - out of these. - slate_size: The size of the slate to recommend to the user at each - timestep. - num_docs_in_db: The total number of documents in the DB. Set this - to None, in case you would like to resample docs from an - infinite pool. - num_users_in_db: The total number of users in the DB. Set this to - None, in case you would like to resample users from an infinite - pool. - user_time_budget: The total time budget a user has throughout an - episode. Once this time budget is used up (through engagements - with clicked/selected documents), the episode ends. - """ - self.embedding_size = embedding_size - self.num_docs_to_select_from = num_docs_to_select_from - self.slate_size = slate_size - - self.num_docs_in_db = num_docs_in_db - self.docs_db = None - self.num_users_in_db = num_users_in_db - self.users_db = None - self.current_user = None - - self.user_time_budget = user_time_budget - self.current_user_budget = user_time_budget - - self.observation_space = gym.spaces.Dict( - { - # The D docs our agent sees at each timestep. - # It has to select a k-slate out of these. - "doc": gym.spaces.Dict( - { - str(i): gym.spaces.Box( - -1.0, 1.0, shape=(self.embedding_size,), dtype=np.float32 - ) - for i in range(self.num_docs_to_select_from) - } - ), - # The user engaging in this timestep/episode. - "user": gym.spaces.Box( - -1.0, 1.0, shape=(self.embedding_size,), dtype=np.float32 - ), - # For each item in the previous slate, was it clicked? - # If yes, how long was it being engaged with (e.g. watched)? - "response": gym.spaces.Tuple( - [ - gym.spaces.Dict( - { - # Clicked or not? - "click": gym.spaces.Discrete(2), - # Engagement time (how many minutes watched?). - "engagement": gym.spaces.Box( - 0.0, 100.0, shape=(), dtype=np.float32 - ), - } - ) - for _ in range(self.slate_size) - ] - ), - } - ) - # Our action space is - self.action_space = gym.spaces.MultiDiscrete( - [self.num_docs_to_select_from for _ in range(self.slate_size)] - ) - - def _get_embedding(self): - return np.random.uniform(-1, 1, size=(self.embedding_size,)).astype(np.float32) - - def reset(self, *, seed=None, options=None): - # Reset the current user's time budget. - self.current_user_budget = self.user_time_budget - - # Sample a user for the next episode/session. - # Pick from a only-once-sampled user DB. - if self.num_users_in_db is not None: - if self.users_db is None: - self.users_db = [ - self._get_embedding() for _ in range(self.num_users_in_db) - ] - self.current_user = self.users_db[np.random.choice(self.num_users_in_db)] - # Pick from an infinite pool of users. - else: - self.current_user = self._get_embedding() - - return self._get_obs(), {} - - def step(self, action): - # Action is the suggested slate (indices of the docs in the - # suggested ones). - - # We calculate scores as the dot product between document features and user - # features. The softmax ensures regret<1 further down. - scores = softmax( - [np.dot(self.current_user, doc) for doc in self.currently_suggested_docs] - ) - best_reward = np.max(scores) - - # User choice model: User picks a doc stochastically, - # where probs are dot products between user- and doc feature - # (categories) vectors (rewards). - # There is also a no-click doc whose weight is 0.0. 
- user_doc_overlaps = np.array([scores[a] for a in action] + [0.0]) - # We have to softmax again so that probabilities add up to 1 - probabilities = softmax(user_doc_overlaps) - which_clicked = np.random.choice( - np.arange(self.slate_size + 1), p=probabilities - ) - - reward = 0.0 - if which_clicked < self.slate_size: - # Reward is 1.0 - regret if clicked. 0.0 if not clicked. - regret = best_reward - user_doc_overlaps[which_clicked] - # The reward also represents the user engagement that we define to be - # withing the range [0...100]. - reward = (1 - regret) * 100 - # If anything clicked, deduct from the current user's time budget. - self.current_user_budget -= 1.0 - done = truncated = self.current_user_budget <= 0.0 - - # Compile response. - response = tuple( - { - "click": int(idx == which_clicked), - "engagement": reward if idx == which_clicked else 0.0, - } - for idx in range(len(user_doc_overlaps) - 1) - ) - - return self._get_obs(response=response), reward, done, truncated, {} - - def _get_obs(self, response=None): - # Sample D docs from infinity or our pre-existing docs. - # Pick from a only-once-sampled docs DB. - if self.num_docs_in_db is not None: - if self.docs_db is None: - self.docs_db = [ - self._get_embedding() for _ in range(self.num_docs_in_db) - ] - self.currently_suggested_docs = [ - self.docs_db[doc_idx].astype(np.float32) - for doc_idx in np.random.choice( - self.num_docs_in_db, - size=(self.num_docs_to_select_from,), - replace=False, - ) - ] - # Pick from an infinite pool of docs. - else: - self.currently_suggested_docs = [ - self._get_embedding() for _ in range(self.num_docs_to_select_from) - ] - - doc = {str(i): d for i, d in enumerate(self.currently_suggested_docs)} - - if not response: - response = self.observation_space["response"].sample() - - return { - "user": self.current_user.astype(np.float32), - "doc": doc, - "response": response, - } - - -if __name__ == "__main__": - """Test RecommSys env with random actions for baseline performance.""" - env = ParametricRecSys( - num_docs_in_db=100, - num_users_in_db=1, - ) - obs, info = env.reset() - num_episodes = 0 - episode_rewards = [] - episode_reward = 0.0 - - while num_episodes < 100: - action = env.action_space.sample() - obs, reward, done, truncated, _ = env.step(action) - - episode_reward += reward - if done: - print(f"episode reward = {episode_reward}") - env.reset() - num_episodes += 1 - episode_rewards.append(episode_reward) - episode_reward = 0.0 - - print(f"Avg reward={np.mean(episode_rewards)}") diff --git a/rllib_contrib/bandit/tests/test_bandit.py b/rllib_contrib/bandit/tests/test_bandit.py deleted file mode 100644 index e5ecb309781e6..0000000000000 --- a/rllib_contrib/bandit/tests/test_bandit.py +++ /dev/null @@ -1,147 +0,0 @@ -import unittest - -import gymnasium as gym -import numpy as np -from gymnasium.spaces import Box, Discrete -from rllib_bandit.bandit.bandit import BanditLinTSConfig, BanditLinUCBConfig -from rllib_bandit.env.bandit_envs_discrete import SimpleContextualBandit - -import ray -from ray.rllib.env import EnvContext -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.test_utils import check, check_train_results, framework_iterator - - -class NonContextualBanditEnv(gym.Env): - def __init__(self, config: EnvContext): - best_arm_prob = config.get("best_arm_prob", 0.5) - self.action_space = Discrete(2) - self.observation_space = Box(0.0, 1.0, shape=(1,), dtype=np.float32) - self.reset(seed=0) - self._arm_probs = {0: 0.1, 1: best_arm_prob} - - def reset(self, *, 
seed=0, options=None): - self._seed = seed - if seed is not None: - self.rng = np.random.default_rng(self._seed) - return [1.0], {} - - def step(self, action): - reward = self.rng.binomial(1, self._arm_probs[action]) - return [1.0], reward, True, False, {} - - -class TestBandits(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_bandit_lin_ts_compilation(self): - """Test whether BanditLinTS can be built on all frameworks.""" - config = ( - BanditLinTSConfig() - .environment(env=SimpleContextualBandit) - .rollouts(num_rollout_workers=2, num_envs_per_worker=2) - ) - num_iterations = 5 - - for _ in framework_iterator( - config, frameworks=("tf2", "torch"), with_eager_tracing=True - ): - for train_batch_size in [1, 10]: - config.training(train_batch_size=train_batch_size) - algo = config.build() - results = None - for _ in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - # Force good learning behavior (this is a very simple env). - self.assertTrue(results["episode_reward_mean"] == 10.0) - algo.stop() - - def test_bandit_lin_ucb_compilation(self): - """Test whether BanditLinUCB can be built on all frameworks.""" - config = ( - BanditLinUCBConfig() - .environment(env=SimpleContextualBandit) - .rollouts(num_envs_per_worker=2) - ) - - num_iterations = 5 - - for _ in framework_iterator( - config, frameworks=("tf2", "torch"), with_eager_tracing=True - ): - for train_batch_size in [1, 10]: - config.training(train_batch_size=train_batch_size) - algo = config.build() - results = None - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - # Force good learning behavior (this is a very simple env). - self.assertTrue(results["episode_reward_mean"] == 10.0) - algo.stop() - - def test_bandit_convergence(self): - # test whether in a simple bandit environment, the bandit algorithm - # distribution converge to the optimal distribution empirically - - std_threshold = 0.1 - best_arm_prob = 0.5 - - for config_cls in [BanditLinUCBConfig, BanditLinTSConfig]: - config = ( - config_cls() - .debugging(seed=0) - .environment( - env=NonContextualBanditEnv, - env_config={"best_arm_prob": best_arm_prob}, - ) - ) - for _ in framework_iterator( - config, frameworks=("tf2", "torch"), with_eager_tracing=True - ): - algo = config.build() - model = algo.get_policy().model - arm_means, arm_stds = [], [] - for _ in range(50): - # TODO the internals of the model is leaking here. - # We should revisit this once the RLModule is merged in. 
- samples = [model.arms[i].dist.sample((1000,)) for i in range(2)] - arm_means.append( - [float(convert_to_numpy(s).mean(0)) for s in samples] - ) - arm_stds.append( - [float(convert_to_numpy(s).std(0)) for s in samples] - ) - algo.train() - - best_arm = np.argmax(arm_means[-1]) - print( - f"best arm: {best_arm}, arm means: {arm_means[-1]}, " - f"arm stds: {arm_stds[-1]}" - ) - - # the better arm (according to the learned model) should be - # sufficiently exploited so it should have a low variance at convergence - self.assertLess(arm_stds[-1][best_arm], std_threshold) - - # best arm should also have a good estimate of its actual mean - # Note that this may not be true for non-optimal arms as they may not - # have been explored enough - check(arm_means[-1][best_arm], best_arm_prob, decimals=1) - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/bandit/tuned_examples/__init__.py b/rllib_contrib/bandit/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/bandit/tuned_examples/interest-evolution-recsim-env-bandit-linucb.yaml b/rllib_contrib/bandit/tuned_examples/interest-evolution-recsim-env-bandit-linucb.yaml deleted file mode 100644 index ab9530d922ce3..0000000000000 --- a/rllib_contrib/bandit/tuned_examples/interest-evolution-recsim-env-bandit-linucb.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -interest-evolution-recsim-env-bandit-linucb: - env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv - run: BanditLinUCB - stop: - sampler_results/episode_reward_mean: 180.0 - timesteps_total: 50000 - config: - framework: torch - - # RLlib/RecSim wrapper specific settings: - env_config: - # Env class specified above takes one `config` arg in its c'tor: - config: - # Each step, sample `num_candidates` documents using the env-internal - # document sampler model (a logic that creates n documents to select - # the slate from). - resample_documents: true - num_candidates: 100 - # How many documents to recommend (out of `num_candidates`) each - # timestep? - slate_size: 2 - # Should the action space be purely Discrete? Useful for algos that - # don't support MultiDiscrete (e.g. DQN or Bandits). - # SlateQ handles MultiDiscrete action spaces. - convert_to_discrete_action_space: true - wrap_for_bandits: true - seed: 0 - - metrics_num_episodes_for_smoothing: 500 diff --git a/rllib_contrib/contributing.md b/rllib_contrib/contributing.md deleted file mode 100644 index 9ebe67600d9dc..0000000000000 --- a/rllib_contrib/contributing.md +++ /dev/null @@ -1,29 +0,0 @@ -# Contributing Guidelines - -Any issues that are filed in `rllib_contrib` will be solved best-effort by the community and there is no expectation of maintenance by the RLlib team. - -**The api surface between algorithms in rllib_contrib and current versions of ray / rllib is not guaranteed. This means that any apis that are used in rllib_contrib could potentially become modified/removed in newer version of ray/rllib. You should check the version of ray that an algorithm is using before making any modifications, and refer to that documentation / release on github.** - -We will generally accept contributions to this repo that meet any of the following criteria: - -1. Updating dependencies. -2. Submitting community contributed algorithms that have been tested and are ready for use. -3. 
Enabling algorithms to be run in different environments (ex. adding support for a new type of gym environment). -4. Updating algorithms for use with the newer RLlib APIs. -5. General bug fixes. - -We will not accept contributions that generally add a significant maintenance burden. In this case users should instead make their own repo with their contribution, **using the same guidelines as this repo**, and the RLlib team can help to market/promote it in the ray docs. - -## Contributing new algorithms - -If users would like to contribute a new algorithm tor rllib_contrib, they should follow these steps: -1. Create a new directory with the same structure as the other algorithms. -2. Add a `README.md` file that describes the algorithm and its usecases. -3. Create unit tests/shorter learning tests and long learning tests for the algorithm. -4. Submit a PR, add the tag `rllib_contrib`, and then a RLlib maintainer will review it and help you set up your testing to integrate with the CI of this repo. - -Regarding unit tests and long running tests: - -- Unit tests are any tests that tests a sub component of an algorithm. For example tests that check the value of a loss function given some inputs. -- Short learning tests should run an algorithm on an easy to learn environment for a short amount of time (e.g. ~3 minutes) and check that the algorithm is achieving some learning threshold (e.g. reward mean or loss). -- Long learning tests should run an algorithm on a hard to learn environment (e.g.) for a long amount of time (e.g. ~1 hour) and check that the algorithm is achieving some learning threshold (e.g. reward mean or loss). diff --git a/rllib_contrib/crr/BUILD b/rllib_contrib/crr/BUILD deleted file mode 100644 index 8400eb82a4292..0000000000000 --- a/rllib_contrib/crr/BUILD +++ /dev/null @@ -1,36 +0,0 @@ -# Examples - -py_test( - name = "example_crr_cartpole_v1", - main = "crr_cartpole_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/crr_cartpole_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -# py_test( -# name = "learning_tests_pendulum_crr", -# main = "run_regression_tests.py", -# tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], -# size = "enormous", -# srcs = ["run_regression_tests.py"], -# data = [ -# "tuned_examples/pendulum-v1-crr.yaml", -# # Include the offline json data file as well. -# "tuned_examples/pendulum_replay_v1.1.0.zip", -# ], -# args = ["--dir=crr/tuned_examples/"] -# ) - - -# Compilation Tests - -py_test( - name = "test_crr", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_crr.py"] -) diff --git a/rllib_contrib/crr/README.md b/rllib_contrib/crr/README.md deleted file mode 100644 index dbdf1c734f987..0000000000000 --- a/rllib_contrib/crr/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# CRR (Critic Regularized Regression) - - -[CRR](https://arxiv.org/abs/2006.15134) is another offline RL algorithm based on Q-learning that can learn from an offline experience replay. The challenge in applying existing Q-learning algorithms to offline RL lies in the overestimation of the Q-function, as well as, the lack of exploration beyond the observed data. The latter becomes increasingly important during bootstrapping in the bellman equation, where the Q-function queried for the next state’s Q-value(s) does not have support in the observed data. To mitigate these issues, CRR implements a simple and yet powerful idea of “value-filtered regression”. 
The key idea is to use a learned critic to filter-out the non-promising transitions from the replay dataset. - - -## Installation - -``` -conda create -n rllib-crr python=3.10 -conda activate rllib-crr -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[CRR Example]() \ No newline at end of file diff --git a/rllib_contrib/crr/examples/crr_cartpole_v1.py b/rllib_contrib/crr/examples/crr_cartpole_v1.py deleted file mode 100644 index 3e47101973b51..0000000000000 --- a/rllib_contrib/crr/examples/crr_cartpole_v1.py +++ /dev/null @@ -1,84 +0,0 @@ -import argparse - -from rllib_crr.crr import CRR, CRRConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - config = ( - CRRConfig() - .environment(env="CartPole-v1", clip_actions=True) - .framework("torch") - .offline_data( - input_="dataset", - input_config={ - "format": "json", - "paths": ["s3://anonymous@air-example-data/rllib/cartpole/large.json"], - }, - actions_in_input_normalized=True, - ) - .training( - twin_q=True, - weight_type="exp", - advantage_type="mean", - n_action_sample=4, - target_network_update_freq=10000, - tau=0.0005, - gamma=0.99, - train_batch_size=2048, - critic_hidden_activation="tanh", - critic_hiddens=[128, 128, 128], - critic_lr=0.0003, - actor_hidden_activation="tanh", - actor_hiddens=[128, 128, 128], - actor_lr=0.0003, - temperature=1.0, - max_weight=20.0, - ) - .evaluation( - evaluation_interval=1, - evaluation_num_workers=1, - evaluation_duration=10, - evaluation_duration_unit="episodes", - evaluation_parallel_to_training=True, - evaluation_config=CRRConfig.overrides(input_="sampler", explore=False), - ) - .rollouts(num_rollout_workers=3) - ) - - stop_reward = 200 - - tuner = tune.Tuner( - CRR, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "evaluation/sampler_results/episode_reward_mean": stop_reward, - "training_iteration": 100, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved( - results, - stop_reward, - metric="evaluation/sampler_results/episode_reward_mean", - ) diff --git a/rllib_contrib/crr/pyproject.toml b/rllib_contrib/crr/pyproject.toml deleted file mode 100644 index 75efc151f3d75..0000000000000 --- a/rllib_contrib/crr/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-crr" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/crr/requirements.txt b/rllib_contrib/crr/requirements.txt deleted file mode 100644 index b07006a1b4ec6..0000000000000 --- a/rllib_contrib/crr/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch==1.12.0 -numpy<2 diff --git 
a/rllib_contrib/crr/src/rllib_crr/crr/__init__.py b/rllib_contrib/crr/src/rllib_crr/crr/__init__.py deleted file mode 100644 index b28530bcec15a..0000000000000 --- a/rllib_contrib/crr/src/rllib_crr/crr/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from rllib_crr.crr.crr import CRR, CRRConfig -from rllib_crr.crr.crr_torch_model import CRRModel -from rllib_crr.crr.crr_torch_policy import CRRTorchPolicy - -from ray.tune.registry import register_trainable - -__all__ = ["CRR", "CRRConfig", "CRRModel", "CRRTorchPolicy"] - -register_trainable("rllib-contrib-crr", CRR) diff --git a/rllib_contrib/crr/src/rllib_crr/crr/crr.py b/rllib_contrib/crr/src/rllib_crr/crr/crr.py deleted file mode 100644 index 6680fd737f37d..0000000000000 --- a/rllib_contrib/crr/src/rllib_crr/crr/crr.py +++ /dev/null @@ -1,281 +0,0 @@ -import logging -from typing import List, Optional, Type - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.execution import synchronous_parallel_sample -from ray.rllib.execution.train_ops import multi_gpu_train_one_step, train_one_step -from ray.rllib.policy import Policy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.metrics import ( - LAST_TARGET_UPDATE_TS, - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, - NUM_TARGET_UPDATES, - SAMPLE_TIMER, - TARGET_NET_UPDATE_TIMER, -) -from ray.rllib.utils.typing import ResultDict - -logger = logging.getLogger(__name__) - - -class CRRConfig(AlgorithmConfig): - def __init__(self, algo_class=None): - super().__init__(algo_class=algo_class or CRR) - - # fmt: off - # __sphinx_doc_begin__ - # CRR-specific settings. - self.weight_type = "bin" - self.temperature = 1.0 - self.max_weight = 20.0 - self.advantage_type = "mean" - self.n_action_sample = 4 - self.twin_q = True - self.train_batch_size = 128 - - # target_network_update_freq by default is 100 * train_batch_size - # if target_network_update_freq is not set. See self.setup for code. - self.target_network_update_freq = None - # __sphinx_doc_end__ - # fmt: on - self.actor_hiddens = [256, 256] - self.actor_hidden_activation = "relu" - self.critic_hiddens = [256, 256] - self.critic_hidden_activation = "relu" - self.critic_lr = 3e-4 - self.actor_lr = 3e-4 - self.tau = 5e-3 - - # Override the AlgorithmConfig default: - # Only PyTorch supported thus far. Make this the default framework. - self.framework_str = "torch" - # If data ingestion/sample_time is slow, increase this - self.num_rollout_workers = 4 - self.offline_sampling = True - self.min_time_s_per_iteration = 10.0 - - self.td_error_loss_fn = "mse" - self.categorical_distribution_temperature = 1.0 - - # TODO (Artur): CRR should not need an exploration config as an offline - # algorithm. However, the current implementation of the CRR algorithm - # requires it. Investigate. - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). 
- } - - def training( - self, - *, - weight_type: Optional[str] = NotProvided, - temperature: Optional[float] = NotProvided, - max_weight: Optional[float] = NotProvided, - advantage_type: Optional[str] = NotProvided, - n_action_sample: Optional[int] = NotProvided, - twin_q: Optional[bool] = NotProvided, - target_network_update_freq: Optional[int] = NotProvided, - actor_hiddens: Optional[List[int]] = NotProvided, - actor_hidden_activation: Optional[str] = NotProvided, - critic_hiddens: Optional[List[int]] = NotProvided, - critic_hidden_activation: Optional[str] = NotProvided, - tau: Optional[float] = NotProvided, - td_error_loss_fn: Optional[str] = NotProvided, - categorical_distribution_temperature: Optional[float] = NotProvided, - actor_lr: Optional[float] = NotProvided, - critic_lr: Optional[float] = NotProvided, - **kwargs, - ) -> "CRRConfig": - - r""" - CRR training configuration - - Args: - weight_type: weight type to use `bin` | `exp`. - temperature: the exponent temperature used in exp weight type. - max_weight: the max weight limit for exp weight type. - advantage_type: The way we reduce q values to v_t values - `max` | `mean` | `expectation`. `max` and `mean` work for both - discrete and continuous action spaces while `expectation` only - works for discrete action spaces. - `max`: Uses max over sampled actions to estimate the value. - - .. math:: - - A(s_t, a_t) = Q(s_t, a_t) - \max_{a^j} Q(s_t, a^j) - - where :math:`a^j` is `n_action_sample` times sampled from the - policy :math:`\pi(a | s_t)` - `mean`: Uses mean over sampled actions to estimate the value. - - .. math:: - - A(s_t, a_t) = Q(s_t, a_t) - \frac{1}{m}\sum_{j=1}^{m} - [Q(s_t, a^j)] - - where :math:`a^j` is `n_action_sample` times sampled from the - policy :math:`\pi(a | s_t)` - `expectation`: This uses categorical distribution to evaluate - the expectation of the q values directly to estimate the value. - - .. math:: - - A(s_t, a_t) = Q(s_t, a_t) - E_{a^j\sim \pi(a|s_t)}[Q(s_t,a^j)] - - n_action_sample: the number of actions to sample for v_t estimation. - twin_q: if True, uses pessimistic q estimation. - target_network_update_freq: The frequency at which we update the - target copy of the model in terms of the number of gradient updates - applied to the main model. - actor_hiddens: The number of hidden units in the actor's fc network. - actor_hidden_activation: The activation used in the actor's fc network. - critic_hiddens: The number of hidden units in the critic's fc network. - critic_hidden_activation: The activation used in the critic's fc network. - tau: Polyak averaging coefficient - (making it 1 is reduces it to a hard update). - td_error_loss_fn: "huber" or "mse". - Loss function for calculating critic error. - categorical_distribution_temperature: Set the temperature parameter used - by Categorical action distribution. A valid temperature is in the range - of [0, 1]. Note that this mostly affects evaluation since critic error - uses selected action for return calculation. - actor_lr: Learning rate for the actor. - critic_lr: Learning rate for the critic. - **kwargs: forward compatibility kwargs - - Returns: - This updated CRRConfig object. 
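Editor's note: the three `advantage_type` estimators documented in this docstring can be illustrated with a toy NumPy calculation. The sketch below is illustrative only, with made-up numbers; the actual computation lives in `CRRTorchPolicy._compute_adv_and_logps()` further down in this diff.

```python
# Toy NumPy illustration of the `advantage_type` options described above.
import numpy as np

q_sa = 1.3                                   # Q(s_t, a_t) for the action in the dataset

# "max" / "mean": m = n_action_sample actions a^j sampled from pi(a | s_t).
q_sampled = np.array([0.9, 1.1, 1.4, 0.7])   # Q(s_t, a^j), j = 1..4
adv_max = q_sa - q_sampled.max()             # A = Q(s,a) - max_j Q(s, a^j)
adv_mean = q_sa - q_sampled.mean()           # A = Q(s,a) - (1/m) * sum_j Q(s, a^j)

# "expectation" (discrete action spaces only): weight Q over *all* actions by pi.
q_all = np.array([1.3, 0.8, 1.0])            # Q(s_t, a) for each discrete action; index 0 = a_t
pi_all = np.array([0.5, 0.2, 0.3])           # pi(a | s_t)
adv_expectation = q_all[0] - (pi_all * q_all).sum()

print(adv_max, adv_mean, adv_expectation)
```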
- """ - super().training(**kwargs) - - if weight_type is not NotProvided: - self.weight_type = weight_type - if temperature is not NotProvided: - self.temperature = temperature - if max_weight is not NotProvided: - self.max_weight = max_weight - if advantage_type is not NotProvided: - self.advantage_type = advantage_type - if n_action_sample is not NotProvided: - self.n_action_sample = n_action_sample - if twin_q is not NotProvided: - self.twin_q = twin_q - if target_network_update_freq is not NotProvided: - self.target_network_update_freq = target_network_update_freq - if actor_hiddens is not NotProvided: - self.actor_hiddens = actor_hiddens - if actor_hidden_activation is not NotProvided: - self.actor_hidden_activation = actor_hidden_activation - if critic_hiddens is not NotProvided: - self.critic_hiddens = critic_hiddens - if critic_hidden_activation is not NotProvided: - self.critic_hidden_activation = critic_hidden_activation - if tau is not NotProvided: - self.tau = tau - if td_error_loss_fn is not NotProvided: - self.td_error_loss_fn = td_error_loss_fn - if categorical_distribution_temperature is not NotProvided: - self.categorical_distribution_temperature = ( - categorical_distribution_temperature - ) - if actor_lr is not NotProvided: - self.actor_lr = actor_lr - if critic_lr is not NotProvided: - self.critic_lr = critic_lr - - return self - - def validate(self) -> None: - # Call super's validation method. - super().validate() - - if self.td_error_loss_fn not in ["huber", "mse"]: - raise ValueError("`td_error_loss_fn` must be 'huber' or 'mse'!") - - -NUM_GRADIENT_UPDATES = "num_grad_updates" - - -class CRR(Algorithm): - - # TODO: we have a circular dependency for get - # default config. config -> Trainer -> config - # defining Config class in the same file for now as a workaround. - - def setup(self, config: AlgorithmConfig): - super().setup(config) - - self.target_network_update_freq = self.config.target_network_update_freq - if self.target_network_update_freq is None: - self.target_network_update_freq = self.config.train_batch_size * 100 - # added a counter key for keeping track of number of gradient updates - self._counters[NUM_GRADIENT_UPDATES] = 0 - # if I don't set this here to zero I won't see zero in the logs (defaultdict) - self._counters[NUM_TARGET_UPDATES] = 0 - - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return CRRConfig() - - @classmethod - @override(Algorithm) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - from rllib_crr.crr.crr_torch_policy import CRRTorchPolicy - - return CRRTorchPolicy - else: - raise ValueError("Non-torch frameworks are not supported yet!") - - @override(Algorithm) - def training_step(self) -> ResultDict: - with self._timers[SAMPLE_TIMER]: - train_batch = synchronous_parallel_sample(worker_set=self.workers) - train_batch = train_batch.as_multi_agent() - self._counters[NUM_AGENT_STEPS_SAMPLED] += train_batch.agent_steps() - self._counters[NUM_ENV_STEPS_SAMPLED] += train_batch.env_steps() - - # Postprocess batch before we learn on it. - post_fn = self.config.get("before_learn_on_batch") or (lambda b, *a: b) - train_batch = post_fn(train_batch, self.workers, self.config) - - # Learn on training batch. 
- # Use simple optimizer (only for multi-agent or tf-eager; all other - # cases should use the multi-GPU optimizer, even if only using 1 GPU) - if self.config.get("simple_optimizer", False): - train_results = train_one_step(self, train_batch) - else: - train_results = multi_gpu_train_one_step(self, train_batch) - - # update target every few gradient updates - # Update target network every `target_network_update_freq` training steps. - cur_ts = self._counters[ - NUM_AGENT_STEPS_TRAINED - if self.config.count_steps_by == "agent_steps" - else NUM_ENV_STEPS_TRAINED - ] - last_update = self._counters[LAST_TARGET_UPDATE_TS] - - if cur_ts - last_update >= self.target_network_update_freq: - with self._timers[TARGET_NET_UPDATE_TIMER]: - to_update = self.workers.local_worker().get_policies_to_train() - self.workers.local_worker().foreach_policy_to_train( - lambda p, pid: pid in to_update and p.update_target() - ) - self._counters[NUM_TARGET_UPDATES] += 1 - self._counters[LAST_TARGET_UPDATE_TS] = cur_ts - - self._counters[NUM_GRADIENT_UPDATES] += 1 - return train_results diff --git a/rllib_contrib/crr/src/rllib_crr/crr/crr_torch_model.py b/rllib_contrib/crr/src/rllib_crr/crr/crr_torch_model.py deleted file mode 100644 index d8f847ec9444f..0000000000000 --- a/rllib_contrib/crr/src/rllib_crr/crr/crr_torch_model.py +++ /dev/null @@ -1,195 +0,0 @@ -from typing import Dict, List, Union - -import gymnasium as gym -import numpy as np - -from ray.rllib.models.torch.misc import SlimFC -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.models.utils import get_activation_fn -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import ModelConfigDict, TensorType - -torch, nn = try_import_torch() - - -class CRRModel(TorchModelV2, nn.Module): - def __init__( - self, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - num_outputs: int, - model_config: ModelConfigDict, - name: str, - ): - TorchModelV2.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - nn.Module.__init__(self) - - self._is_action_discrete = isinstance(action_space, gym.spaces.Discrete) - - # TODO: I don't know why this is true yet? (in = num_outputs) - self.obs_ins = num_outputs - self.action_dim = np.prod(self.action_space.shape) - self.actor_model = self._build_actor_net("actor") - twin_q = self.model_config["twin_q"] - self.q_model = self._build_q_net("q") - if twin_q: - self.twin_q_model = self._build_q_net("twin_q") - else: - self.twin_q_model = None - - def _build_actor_net(self, name_): - actor_hidden_activation = self.model_config["actor_hidden_activation"] - actor_hiddens = self.model_config["actor_hiddens"] - - # Build the policy network. 
- actor_net = nn.Sequential() - - activation = get_activation_fn(actor_hidden_activation, framework="torch") - ins = self.obs_ins - for i, n in enumerate(actor_hiddens): - actor_net.add_module( - f"{name_}_hidden_{i}", - SlimFC( - ins, - n, - initializer=torch.nn.init.xavier_uniform_, - activation_fn=activation, - ), - ) - ins = n - - # also includes log_std in continuous case - n_act_out = ( - self.action_space.n if self._is_action_discrete else 2 * self.action_dim - ) - actor_net.add_module( - f"{name_}_out", - SlimFC( - ins, - n_act_out, - initializer=torch.nn.init.xavier_uniform_, - activation_fn=None, - ), - ) - - return actor_net - - def _build_q_net(self, name_): - # actions are concatenated with flattened obs - critic_hidden_activation = self.model_config["critic_hidden_activation"] - critic_hiddens = self.model_config["critic_hiddens"] - - activation = get_activation_fn(critic_hidden_activation, framework="torch") - q_net = nn.Sequential() - ins = ( - self.obs_ins if self._is_action_discrete else self.obs_ins + self.action_dim - ) - for i, n in enumerate(critic_hiddens): - q_net.add_module( - f"{name_}_hidden_{i}", - SlimFC( - ins, - n, - initializer=torch.nn.init.xavier_uniform_, - activation_fn=activation, - ), - ) - ins = n - - q_net.add_module( - f"{name_}_out", - SlimFC( - ins, - self.action_space.n if self._is_action_discrete else 1, - initializer=torch.nn.init.xavier_uniform_, - activation_fn=None, - ), - ) - return q_net - - def _get_q_value( - self, model_out: TensorType, actions: TensorType, q_model: TorchModelV2 - ) -> TensorType: - - if self._is_action_discrete: - rows = torch.arange(len(actions)).to(actions) - q_vals = q_model(model_out)[rows, actions].unsqueeze(-1) - else: - q_vals = q_model(torch.cat([model_out, actions], -1)) - - return q_vals - - def get_q_values(self, model_out: TensorType, actions: TensorType) -> TensorType: - """Return the Q estimates for the most recent forward pass. - - This implements Q(s, a). - - Args: - model_out: obs embeddings from the model layers. - Shape: [BATCH_SIZE, num_outputs]. - actions: Actions to return the Q-values for. - Shape: [BATCH_SIZE, action_dim]. - - Returns: - The q_values based on Q(S,A). - Shape: [BATCH_SIZE]. - """ - return self._get_q_value(model_out, actions, self.q_model) - - def get_twin_q_values( - self, model_out: TensorType, actions: TensorType - ) -> TensorType: - """Same as get_q_values but using the twin Q net. - - This implements the twin Q(s, a). - - Args: - model_out: obs embeddings from the model layers. - Shape: [BATCH_SIZE, num_outputs]. - actions: Actions to return the Q-values for. - Shape: [BATCH_SIZE, action_dim]. - - Returns: - The q_values based on Q_{twin}(S,A). - Shape: [BATCH_SIZE]. - """ - return self._get_q_value(model_out, actions, self.twin_q_model) - - def get_policy_output(self, model_out: TensorType) -> TensorType: - """Return the action output for the most recent forward pass. - - This outputs the support for pi(s). For continuous action spaces, this - is the action directly. For discrete, it is the mean / std dev. - - Args: - model_out: obs embeddings from the model layers. - Shape: [BATCH_SIZE, num_outputs]. - - Returns: - The output of pi(s). - Shape: [BATCH_SIZE, action_out_size]. 
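Editor's note: a hypothetical usage sketch of the `CRRModel` interface documented in the methods above (`get_policy_output`, `get_q_values`, `get_twin_q_values`). Spaces, batch sizes, and config values are made up, and the import assumes a local copy of the `rllib_crr` package being removed in this PR; the real construction path goes through `ModelCatalog.get_model_v2` in `CRRTorchPolicy.make_model` further down.

```python
import gymnasium as gym
import torch
from ray.rllib.models.catalog import MODEL_DEFAULTS

from rllib_crr.crr.crr_torch_model import CRRModel  # removed in this PR

obs_space = gym.spaces.Box(-1.0, 1.0, shape=(3,))
act_space = gym.spaces.Box(-1.0, 1.0, shape=(2,))

model_config = dict(
    MODEL_DEFAULTS,
    actor_hidden_activation="relu",
    actor_hiddens=[64, 64],
    critic_hidden_activation="relu",
    critic_hiddens=[64, 64],
    twin_q=True,
)
# num_outputs equals the flattened obs size (see CRRTorchPolicy.make_model below).
model = CRRModel(obs_space, act_space, 3, model_config, "model")

obs = torch.rand(8, 3)      # batch of 8 already-flattened observations
actions = torch.rand(8, 2)  # batch of 8 continuous actions

logits = model.get_policy_output(obs)           # [8, 4]: mean and log_std per action dim
q = model.get_q_values(obs, actions)            # [8, 1]: Q(s, a)
q_twin = model.get_twin_q_values(obs, actions)  # [8, 1]: Q_twin(s, a)
q_pessimistic = torch.minimum(q, q_twin)        # pessimistic estimate used by CRR
```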
- """ - return self.actor_model(model_out) - - def policy_variables( - self, as_dict: bool = False - ) -> Union[List[TensorType], Dict[str, TensorType]]: - """Return the list of variables for the policy net.""" - if as_dict: - return self.actor_model.state_dict() - return list(self.actor_model.parameters()) - - def q_variables( - self, as_dict=False - ) -> Union[List[TensorType], Dict[str, TensorType]]: - """Return the list of variables for Q / twin Q nets.""" - if as_dict: - return { - **self.q_model.state_dict(), - **(self.twin_q_model.state_dict() if self.twin_q_model else {}), - } - return list(self.q_model.parameters()) + ( - list(self.twin_q_model.parameters()) if self.twin_q_model else [] - ) diff --git a/rllib_contrib/crr/src/rllib_crr/crr/crr_torch_policy.py b/rllib_contrib/crr/src/rllib_crr/crr/crr_torch_policy.py deleted file mode 100644 index bbf3bb8af7559..0000000000000 --- a/rllib_contrib/crr/src/rllib_crr/crr/crr_torch_policy.py +++ /dev/null @@ -1,419 +0,0 @@ -from typing import Dict, List, Tuple, Type, Union, cast - -import gymnasium as gym -import numpy as np -from rllib_crr.crr.crr_torch_model import CRRModel - -from ray.rllib.algorithms import AlgorithmConfig -from ray.rllib.algorithms.ddpg.noop_model import TorchNoopModel -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_action_dist import ( - TorchCategorical, - TorchDistributionWrapper, - get_torch_categorical_class_with_temperature, -) -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import TargetNetworkMixin -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import huber_loss, l2_loss -from ray.rllib.utils.typing import TensorType, TrainerConfigDict - -torch, nn = try_import_torch() - - -class CRRTorchPolicy(TorchPolicyV2, TargetNetworkMixin): - def __init__( - self, - observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: TrainerConfigDict, - ): - - self.target_model = None # assign it in self.make_model - self._is_action_discrete = isinstance(action_space, gym.spaces.Discrete) - - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config["model"]["max_seq_len"], - ) - - # For discreet action space, use a custom TorchCategorical distribution - # that supports temperature. - if self._is_action_discrete: - assert self.dist_class == TorchCategorical - self.dist_class = get_torch_categorical_class_with_temperature( - config["categorical_distribution_temperature"] - ) - - """ - by here your model should include the following - (We assume state s is already encoded and there is no need to use RNNs/other - models to encode observations into states): - 1. a nn representing the actor pi(a|s) - 1.1* in case of continuous actions it should be normal / squashed normal - dist if the action space is bounded? - 1.2 in case of of discrete set of actions the output of the model should be - a discrete distribution over action classes - 2. a nn representing the critic Q(s, a) - 2.1* in case of continuous actions it should take in concat([s,a]) and output - a single scalar - 2.2 in case of discrete actions it should take in s and output a logit for - each action class as well as a scale for matching the reward scale. - 3. for critic it should have n_critic copies of the Q function nn - 4. 
for each critic it should have a target model copy - """ - - def action_distribution_fn( - self, - model: ModelV2, - *, - obs_batch: TensorType, - state_batches: TensorType, - **kwargs, - ) -> Tuple[TensorType, type, List[TensorType]]: - - model_out, _ = model(obs_batch) - dist_input = model.get_policy_output(model_out) - dist_class = self.dist_class - - return dist_input, dist_class, [] - - def make_model(self) -> ModelV2: - # copying ddpg build model to here to be explicit - model_config = self.config["model"] - model_config.update( - dict( - actor_hidden_activation=self.config["actor_hidden_activation"], - actor_hiddens=self.config["actor_hiddens"], - critic_hidden_activation=self.config["critic_hidden_activation"], - critic_hiddens=self.config["critic_hiddens"], - twin_q=self.config["twin_q"], - ) - ) - num_outputs = int(np.prod(self.observation_space.shape)) - - # TODO: why do we even have to go through this get_model_v2 function? - self.model = ModelCatalog.get_model_v2( - obs_space=self.observation_space, - action_space=self.action_space, - num_outputs=num_outputs, - model_config=model_config, - framework=self.config["framework"], - # use this model for interface (get_q, get_q_twin, .etc) - model_interface=CRRModel, - default_model=TorchNoopModel, - name="model", - ) - - # TODO: this is a bad python pattern to assign attributes that do not exist in - # the constructor - self.target_model = ModelCatalog.get_model_v2( - obs_space=self.observation_space, - action_space=self.action_space, - num_outputs=num_outputs, - model_config=model_config, - framework=self.config["framework"], - # use this model for interface (get_q, get_q_twin, .etc) - model_interface=CRRModel, - default_model=TorchNoopModel, - name="target_model", - ) - - return self.model - - def optimizer( - self, - ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]: - - # Set epsilons to match tf.keras.optimizers.Adam's epsilon default. - actor_optimizer = torch.optim.Adam( - params=self.model.policy_variables(), - lr=self.config["actor_lr"], - betas=(0.9, 0.999), - eps=1e-8, - ) - - critic_optimizer = torch.optim.Adam( - params=self.model.q_variables(), - lr=self.config["critic_lr"], - betas=(0.9, 0.999), - eps=1e-8, - ) - - # Return them in the same order as the respective loss terms are returned. - return actor_optimizer, critic_optimizer - - def loss( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - - # update the actor - # compute the weights assigned to every transition - # (s_t, a_t) and log(pi(a_t|s_t)) - self._compute_action_weights_and_logps(model, dist_class, train_batch) - - # compute actor loss - actor_loss = self._compute_actor_loss(model, dist_class, train_batch) - - # update the critic - # standard critic update with pessimistic Q-learning (e.g. 
DQN) - critic_loss = self._compute_critic_loss(model, dist_class, train_batch) - - self.log("loss_actor", actor_loss) - self.log("loss_critic", critic_loss) - - return actor_loss, critic_loss - - def log(self, key, value): - # internal log function - self.model.tower_stats[key] = value - - # def update_target(self): - # tau = self.config['tau'] - # - # model_params = self.model.parameters() - # target_params = self.target_models[self.mode].parameters() - # for src_p, trg_p in zip(model_params, target_params): - # trg_p.data = (1 - tau) * trg_p.data + tau * src_p.data - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - stats_dict = { - k: torch.stack(self.get_tower_stats(k)).mean().item() - for k in self.model.tower_stats - } - return stats_dict - - def _get_q_value( - self, model: ModelV2, model_out: TensorType, actions: TensorType - ) -> TensorType: - # helper function to compute the pessimistic q value - q1 = model.get_q_values(model_out, actions) - q2 = model.get_twin_q_values(model_out, actions) - return torch.minimum(q1, q2) - - def _compute_adv_and_logps( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> None: - # uses mean|max|expectation to compute estimate of advantages - # continuous/discrete action spaces: - # for max: - # A(s_t, a_t) = Q(s_t, a_t) - max_{a^j} Q(s_t, a^j) - # where a^j is m times sampled from the policy p(a | s_t) - # for mean: - # A(s_t, a_t) = Q(s_t, a_t) - avg( Q(s_t, a^j) ) - # where a^j is m times sampled from the policy p(a | s_t) - # discrete action space and adv_type=expectation: - # A(s_t, a_t) = Q(s_t, a_t) - sum_j[Q(s_t, a^j) * pi(a^j)] - advantage_type = self.config["advantage_type"] - n_action_sample = self.config["n_action_sample"] - batch_size = len(train_batch) - out_t, _ = model(train_batch) - - # construct pi(s_t) and Q(s_t, a_t) for computing advantage actions - pi_s_t = dist_class(model.get_policy_output(out_t), model) - q_t = self._get_q_value(model, out_t, train_batch[SampleBatch.ACTIONS]) - - # compute the logp of the actions in the dataset (for computing actor's loss) - action_logp = pi_s_t.dist.log_prob(train_batch[SampleBatch.ACTIONS]) - - # fix the shape if it's not canonical (i.e. shape[-1] != 1) - if len(action_logp.shape) <= 1: - action_logp.unsqueeze_(-1) - train_batch[SampleBatch.ACTION_LOGP] = action_logp - - if advantage_type == "expectation": - assert ( - self._is_action_discrete - ), "Action space should be discrete when advantage_type = expectation." - assert hasattr( - self.model, "q_model" - ), "CRR's ModelV2 should have q_model neural network in discrete \ - action spaces" - assert isinstance( - pi_s_t.dist, torch.distributions.Categorical - ), "The output of the policy should be a torch Categorical \ - distribution." 
- - q_vals = self.model.q_model(out_t) - if hasattr(self.model, "twin_q_model"): - q_twins = self.model.twin_q_model(out_t) - q_vals = torch.minimum(q_vals, q_twins) - - probs = pi_s_t.dist.probs - v_t = (q_vals * probs).sum(-1, keepdims=True) - else: - policy_actions = pi_s_t.dist.sample((n_action_sample,)) # samples - - if self._is_action_discrete: - flat_actions = policy_actions.reshape(-1) - else: - flat_actions = policy_actions.reshape(-1, *self.action_space.shape) - - reshaped_s_t = train_batch[SampleBatch.OBS].view( - 1, batch_size, *self.observation_space.shape - ) - reshaped_s_t = reshaped_s_t.expand( - n_action_sample, batch_size, *self.observation_space.shape - ) - flat_s_t = reshaped_s_t.reshape(-1, *self.observation_space.shape) - - input_v_t = SampleBatch( - **{SampleBatch.OBS: flat_s_t, SampleBatch.ACTIONS: flat_actions} - ) - out_v_t, _ = model(input_v_t) - - flat_q_st_pi = self._get_q_value(model, out_v_t, flat_actions) - reshaped_q_st_pi = flat_q_st_pi.reshape(-1, batch_size, 1) - - if advantage_type == "mean": - v_t = reshaped_q_st_pi.mean(dim=0) - elif advantage_type == "max": - v_t, _ = reshaped_q_st_pi.max(dim=0) - else: - raise ValueError(f"Invalid advantage type: {advantage_type}.") - - adv_t = q_t - v_t - train_batch["advantages"] = adv_t - - # logging - self.log("q_batch_avg", q_t.mean()) - self.log("q_batch_max", q_t.max()) - self.log("q_batch_min", q_t.min()) - self.log("v_batch_avg", v_t.mean()) - self.log("v_batch_max", v_t.max()) - self.log("v_batch_min", v_t.min()) - self.log("adv_batch_avg", adv_t.mean()) - self.log("adv_batch_max", adv_t.max()) - self.log("adv_batch_min", adv_t.min()) - self.log("reward_batch_avg", train_batch[SampleBatch.REWARDS].mean()) - - def _compute_action_weights_and_logps( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> None: - # uses bin|exp to compute action weights - # 1(A>=0) or exp(A/temp) - - weight_type = self.config["weight_type"] - self._compute_adv_and_logps(model, dist_class, train_batch) - - if weight_type == "bin": - weights = (train_batch["advantages"] > 0.0).float() - elif weight_type == "exp": - temperature = self.config["temperature"] - max_weight = self.config["max_weight"] - weights = ( - (train_batch["advantages"] / temperature).exp().clamp(0.0, max_weight) - ) - else: - raise ValueError(f"invalid weight type: {weight_type}.") - - train_batch["action_weights"] = weights - - # logging - self.log("weights_avg", weights.mean()) - self.log("weights_max", weights.max()) - self.log("weights_min", weights.min()) - - def _compute_actor_loss( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - loss = -( - train_batch["action_weights"] * train_batch[SampleBatch.ACTION_LOGP] - ).mean(0) - return loss - - def _compute_critic_loss( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ): - discount = self.config["gamma"] - - # Compute bellman targets to regress on - # target, use target model to compute the target - target_model = cast(CRRModel, self.target_models[model]) - target_out_next, _ = target_model( - {SampleBatch.OBS: train_batch[SampleBatch.NEXT_OBS]} - ) - - # compute target values with no gradient - with torch.no_grad(): - # get the action of the current policy evaluated at the next state - pi_s_next = dist_class( - target_model.get_policy_output(target_out_next), target_model - ) - target_a_next = 
pi_s_next.sample() - if not self._is_action_discrete: - target_a_next = target_a_next.clamp( - torch.from_numpy(self.action_space.low).to(target_a_next), - torch.from_numpy(self.action_space.high).to(target_a_next), - ) - - # q1_target = target_model.get_q_values(target_out_next, target_a_next) - # q2_target = target_model.get_twin_q_values(target_out_next, target_a_next) - # target_q_next = torch.minimum(q1_target, q2_target).squeeze(-1) - target_q_next = self._get_q_value( - target_model, target_out_next, target_a_next - ).squeeze(-1) - - target = ( - train_batch[SampleBatch.REWARDS] - + discount - * (1.0 - train_batch[SampleBatch.TERMINATEDS].float()) - * target_q_next - ) - - # compute the predicted output - model = cast(CRRModel, model) - model_out_t, _ = model({SampleBatch.OBS: train_batch[SampleBatch.OBS]}) - q1 = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]).squeeze( - -1 - ) - q2 = model.get_twin_q_values( - model_out_t, train_batch[SampleBatch.ACTIONS] - ).squeeze(-1) - - # compute the MSE loss for all q-functions - td_error_q1 = q1 - target - td_error_q2 = q2 - target - loss_fn = l2_loss if self.config["td_error_loss_fn"] == "mse" else huber_loss - loss = torch.mean(loss_fn(torch.cat((td_error_q1, td_error_q2), dim=0))) - - # logging - self.log("td_error_q1", (td_error_q1**2).mean()) - self.log("td_error_q2", (td_error_q2**2).mean()) - self.log("td_error", loss) - self.log("targets_avg", target.mean()) - self.log("targets_max", target.max()) - self.log("targets_min", target.min()) - - return loss - - -if __name__ == "__main__": - - obs_space = gym.spaces.Box(np.array((-1, -1)), np.array((1, 1))) - act_space = gym.spaces.Box(np.array((-1, -1)), np.array((1, 1))) - config = AlgorithmConfig().framework(framework="torch").to_dict() - print(config["framework"]) - CRRTorchPolicy(obs_space, act_space, config=config) diff --git a/rllib_contrib/crr/tests/test_crr.py b/rllib_contrib/crr/tests/test_crr.py deleted file mode 100644 index 8bdd2419c19c5..0000000000000 --- a/rllib_contrib/crr/tests/test_crr.py +++ /dev/null @@ -1,87 +0,0 @@ -import unittest - -from rllib_crr.crr import CRRConfig - -import ray -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.test_utils import check_compute_single_action, check_train_results - -torch, _ = try_import_torch() - - -class TestCRR(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_crr_compilation(self): - """Test whether a CRR algorithm can be built with all supported frameworks.""" - - config = ( - CRRConfig() - .environment(env="Pendulum-v1", clip_actions=True) - .framework("torch") - .offline_data( - input_="dataset", - input_config={ - "format": "json", - "paths": [ - "s3://anonymous@air-example-data/rllib/pendulum/large.json" - ], - }, - actions_in_input_normalized=True, - ) - .training( - twin_q=True, - train_batch_size=256, - weight_type="bin", - advantage_type="mean", - n_action_sample=4, - target_network_update_freq=10000, - tau=1.0, - ) - .evaluation( - evaluation_interval=2, - evaluation_num_workers=2, - evaluation_duration=10, - evaluation_duration_unit="episodes", - evaluation_parallel_to_training=True, - evaluation_config=CRRConfig.overrides(input_="sampler", explore=False), - ) - .rollouts(num_rollout_workers=0) - ) - - num_iterations = 4 - - for _ in ["torch"]: - for loss_fn in ["mse", "huber"]: - config.td_error_loss_fn = loss_fn - algorithm = config.build() - # check if 4 iterations raises any 
errors - for i in range(num_iterations): - results = algorithm.train() - check_train_results(results) - print(results) - if (i + 1) % 2 == 0: - # evaluation happens every 2 iterations - eval_results = results["evaluation"] - print( - f"iter={algorithm.iteration} " - f"R={eval_results['episode_reward_mean']}" - ) - - check_compute_single_action(algorithm) - - algorithm.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/crr/tuned_examples/__init__.py b/rllib_contrib/crr/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/crr/tuned_examples/cartpole-v1-crr.yaml b/rllib_contrib/crr/tuned_examples/cartpole-v1-crr.yaml deleted file mode 100644 index 10f08c1ff570e..0000000000000 --- a/rllib_contrib/crr/tuned_examples/cartpole-v1-crr.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole_crr: - env: CartPole-v1 - run: CRR - stop: - evaluation/sampler_results/episode_reward_mean: 200 - training_iteration: 100 - config: - input: dataset - input_config: - paths: s3://anonymous@air-example-data/rllib/cartpole/large.json - format: json - num_workers: 3 - framework: torch - gamma: 0.99 - train_batch_size: 2048 - critic_hidden_activation: tanh - critic_hiddens: [128, 128, 128] - critic_lr: 0.0003 - actor_hidden_activation: tanh - actor_hiddens: [128, 128, 128] - actor_lr: 0.0003 - actions_in_input_normalized: True - clip_actions: True - # Q function update setting - twin_q: True - target_network_update_freq: 1 - tau: 0.0005 - # evaluation - evaluation_config: - explore: False - input: sampler - evaluation_duration: 10 - evaluation_duration_unit: episodes - evaluation_interval: 1 - evaluation_num_workers: 1 - evaluation_parallel_to_training: True - # specific to CRR - temperature: 1.0 - weight_type: exp - advantage_type: mean - max_weight: 20.0 - n_action_sample: 4 diff --git a/rllib_contrib/crr/tuned_examples/cartpole-v1-crr_expectation.yaml b/rllib_contrib/crr/tuned_examples/cartpole-v1-crr_expectation.yaml deleted file mode 100644 index 56eeb0c1093b8..0000000000000 --- a/rllib_contrib/crr/tuned_examples/cartpole-v1-crr_expectation.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole_crr: - env: CartPole-v1 - run: CRR - stop: - evaluation/sampler_results/episode_reward_mean: 200 - training_iteration: 100 - config: - input: dataset - input_config: - paths: s3://anonymous@air-example-data/rllib/cartpole/large.json - format: json - framework: torch - num_workers: 3 - gamma: 0.99 - train_batch_size: 2048 - critic_hidden_activation: tanh - critic_hiddens: [128, 128, 128] - critic_lr: 0.0003 - actor_hidden_activation: tanh - actor_hiddens: [128, 128, 128] - actor_lr: 0.0003 - actions_in_input_normalized: True - clip_actions: True - # Q function update setting - twin_q: True - target_network_update_freq: 1 - tau: 0.0005 - # evaluation - evaluation_config: - explore: False - input: sampler - evaluation_duration: 10 - evaluation_duration_unit: episodes - evaluation_interval: 1 - evaluation_num_workers: 1 - evaluation_parallel_to_training: True - # specific to CRR - temperature: 1.0 - weight_type: exp - advantage_type: expectation - max_weight: 20.0 - n_action_sample: 4 diff --git a/rllib_contrib/crr/tuned_examples/pendulum-v1-crr.yaml b/rllib_contrib/crr/tuned_examples/pendulum-v1-crr.yaml deleted file mode 100644 index 
06625f4da2945..0000000000000 --- a/rllib_contrib/crr/tuned_examples/pendulum-v1-crr.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -pendulum_crr: - env: Pendulum-v1 - run: CRR - stop: - # We could make this -200, but given that we have 4 cpus for our tests, we will have to settle for -300. - evaluation/sampler_results/episode_reward_mean: -300 - timesteps_total: 2000000 - config: - input: dataset - input_config: - # Get this from: https://github.com/ray-project/ray/raw/releases/2.5.1/rllib/tests/data/pendulum/pendulum_replay_v1.1.0.zip - paths: rllib_contrib/crr/tuned_examples/pendulum_replay_v1.1.0.zip - format: json - framework: torch - num_workers: 3 - gamma: 0.99 - train_batch_size: 1024 - critic_hidden_activation: relu - critic_hiddens: [256, 256] - critic_lr: 0.0003 - actor_hidden_activation: relu - actor_hiddens: [256, 256] - actor_lr: 0.0003 - actions_in_input_normalized: False - clip_actions: True - # Q function update setting - twin_q: True - target_network_update_freq: 1 - tau: 0.0001 - # evaluation - evaluation_config: - explore: False - input: sampler - evaluation_duration: 10 - evaluation_duration_unit: episodes - evaluation_interval: 1 - evaluation_num_workers: 1 - evaluation_parallel_to_training: True - # replay buffer - # specific to CRR - temperature: 1.0 - weight_type: exp - advantage_type: max - max_weight: 20.0 - n_action_sample: 4 diff --git a/rllib_contrib/ddpg/BUILD b/rllib_contrib/ddpg/BUILD deleted file mode 100644 index e09ad1a32dce5..0000000000000 --- a/rllib_contrib/ddpg/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -# Examples - -py_test( - name = "example_ddpg_pendulum_v1", - main = "ddpg_pendulum_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/ddpg_pendulum_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_pendulum_ddpg_fake_gpus", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "no_tf_eager_tracing"], - size = "large", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/pendulum-ddpg-fake-gpus.yaml"], - args = ["--dir=ddpg/tuned_examples/"] -) - -# Compilation Tests - -py_test( - name = "test_ddpg", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_ddpg.py"] -) diff --git a/rllib_contrib/ddpg/examples/ddpg_pendulum_v1.py b/rllib_contrib/ddpg/examples/ddpg_pendulum_v1.py deleted file mode 100644 index 2c39a159818fb..0000000000000 --- a/rllib_contrib/ddpg/examples/ddpg_pendulum_v1.py +++ /dev/null @@ -1,80 +0,0 @@ -import argparse - -from rllib_ddpg.ddpg import DDPG, DDPGConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - DDPGConfig() - .rollouts(num_rollout_workers=0, rollout_fragment_length=1) - .framework("torch") - .environment("Pendulum-v1", clip_rewards=False) - .training( - actor_hiddens=[64, 64], - critic_hiddens=[64, 64], - n_step=1, - model={}, - gamma=0.99, - replay_buffer_config={ - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 10000, - "worker_side_prioritization": False, - }, - 
num_steps_sampled_before_learning_starts=500, - actor_lr=1e-3, - critic_lr=1e-3, - use_huber=True, - huber_threshold=1.0, - l2_reg=1e-6, - train_batch_size=64, - target_network_update_freq=0, - ) - .reporting(min_sample_timesteps_per_iteration=600) - .exploration( - exploration_config={ - "type": "OrnsteinUhlenbeckNoise", - "scale_timesteps": 10000, - "initial_scale": 1.0, - "final_scale": 0.02, - "ou_base_scale": 0.1, - "ou_theta": 0.15, - "ou_sigma": 0.2, - } - ) - ) - - num_iterations = 100 - stop_reward = -320 - - tuner = tune.Tuner( - DDPG, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 30000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/ddpg/pyproject.toml b/rllib_contrib/ddpg/pyproject.toml deleted file mode 100644 index 93b6b0d18b36a..0000000000000 --- a/rllib_contrib/ddpg/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-ddpg" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium==0.26.3", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0"] diff --git a/rllib_contrib/ddpg/readme.md b/rllib_contrib/ddpg/readme.md deleted file mode 100644 index 0738c1d0d0ad4..0000000000000 --- a/rllib_contrib/ddpg/readme.md +++ /dev/null @@ -1,17 +0,0 @@ -# DDPG (Deep Deterministic Policy Gradient) - -[DDPG](https://arxiv.org/abs/1509.02971) is an actor-critic, model-free algorithm based on the deterministic policy gradient that can operate over continuous action spaces. 
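For context, here is a minimal usage sketch condensed from the example script removed above (`examples/ddpg_pendulum_v1.py`). It assumes the `rllib-ddpg` package and its pinned dependencies (`ray[rllib]==2.5.0`, `gymnasium==0.26.3`) are installed, and it trims the example's tuned hyperparameters down to a few illustrative settings; it is not a drop-in replacement for the full script.

```python
import ray
from rllib_ddpg.ddpg import DDPGConfig

ray.init()

config = (
    DDPGConfig()
    .environment("Pendulum-v1")
    .framework("torch")
    .rollouts(num_rollout_workers=0)
    # A few of the settings used in the removed example; the full script also
    # configures the replay buffer, exploration noise, and Huber loss.
    .training(actor_lr=1e-3, critic_lr=1e-3, train_batch_size=64)
)

algo = config.build()
for _ in range(3):
    results = algo.train()
    print(results["episode_reward_mean"])
algo.stop()
ray.shutdown()
```

The removed example instead drives training through `tune.Tuner(DDPG, ...)` with a reward-based stopping criterion, which is the more typical way these contrib algorithms were exercised in CI.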
- - -## Installation - -``` -conda create -n rllib-ddpg python=3.10 -conda activate rllib-ddpg -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[DDPG Example](examples/ddpg_pendulum_v1.py) \ No newline at end of file diff --git a/rllib_contrib/ddpg/requirements.txt b/rllib_contrib/ddpg/requirements.txt deleted file mode 100644 index 7d979fee79740..0000000000000 --- a/rllib_contrib/ddpg/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -tensorflow==2.13.0 -tensorflow-probability==0.20.1 -torch==1.13.1 diff --git a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/__init__.py b/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/__init__.py deleted file mode 100644 index e2010e3e42d98..0000000000000 --- a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from rllib_ddpg.ddpg.ddpg import DDPG, DDPGConfig - -from ray.tune.registry import register_trainable - -__all__ = ["DDPGConfig", "DDPG"] - -register_trainable("rllib-contrib-ddpg", DDPG) diff --git a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg.py b/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg.py deleted file mode 100644 index 4c82041dbaca7..0000000000000 --- a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg.py +++ /dev/null @@ -1,313 +0,0 @@ -import logging -from typing import List, Optional, Type - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.simple_q.simple_q import SimpleQ, SimpleQConfig -from ray.rllib.policy.policy import Policy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE - -logger = logging.getLogger(__name__) - - -class DDPGConfig(SimpleQConfig): - """Defines a configuration class from which a DDPG Trainer can be built. - - Example: - >>> from ray.rllib.algorithms.ddpg.ddpg import DDPGConfig - >>> config = DDPGConfig().training(lr=0.01).resources(num_gpus=1) - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Trainer object from the config and run one training iteration. - >>> algo = config.build(env="Pendulum-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.ddpg.ddpg import DDPGConfig - >>> from ray import air - >>> from ray import tune - >>> config = DDPGConfig() - >>> # Print out some default values. - >>> print(config.lr) # doctest: +SKIP - 0.0004 - >>> # Update the config object. - >>> config = config.training(lr=tune.grid_search([0.001, 0.0001])) - >>> # Set the config object's env. - >>> config = config.environment(env="Pendulum-v1") - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "DDPG", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a DDPGConfig instance.""" - super().__init__(algo_class=algo_class or DDPG) - - # fmt: off - # __sphinx_doc_begin__ - # DDPG-specific settings. 
- self.twin_q = False - self.policy_delay = 1 - self.smooth_target_policy = False - self.target_noise = 0.2 - self.target_noise_clip = 0.5 - - self.use_state_preprocessor = False - self.actor_hiddens = [400, 300] - self.actor_hidden_activation = "relu" - self.critic_hiddens = [400, 300] - self.critic_hidden_activation = "relu" - self.n_step = 1 - - self.training_intensity = None - self.critic_lr = 1e-3 - self.actor_lr = 1e-3 - self.tau = 0.002 - self.use_huber = False - self.huber_threshold = 1.0 - self.l2_reg = 1e-6 - - # Override some of SimpleQ's default values with DDPG-specific values. - # .exploration() - self.exploration_config = { - # DDPG uses OrnsteinUhlenbeck (stateful) noise to be added to NN-output - # actions (after a possible pure random phase of n timesteps). - "type": "OrnsteinUhlenbeckNoise", - # For how many timesteps should we return completely random actions, - # before we start adding (scaled) noise? - "random_timesteps": 1000, - # The OU-base scaling factor to always apply to action-added noise. - "ou_base_scale": 0.1, - # The OU theta param. - "ou_theta": 0.15, - # The OU sigma param. - "ou_sigma": 0.2, - # The initial noise scaling factor. - "initial_scale": 1.0, - # The final noise scaling factor. - "final_scale": 0.02, - # Timesteps over which to anneal scale (from initial to final values). - "scale_timesteps": 10000, - } - - # Common DDPG buffer parameters. - self.replay_buffer_config = { - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 50000, - # Specify prioritized replay by supplying a buffer type that supports - # prioritization, for example: MultiAgentPrioritizedReplayBuffer. - "prioritized_replay": DEPRECATED_VALUE, - # Alpha parameter for prioritized replay buffer. - "prioritized_replay_alpha": 0.6, - # Beta parameter for sampling from prioritized replay buffer. - "prioritized_replay_beta": 0.4, - # Epsilon to add to the TD errors when updating priorities. - "prioritized_replay_eps": 1e-6, - # Whether to compute priorities on workers. - "worker_side_prioritization": False, - } - - # .training() - self.grad_clip = None - self.train_batch_size = 256 - self.target_network_update_freq = 0 - # Number of timesteps to collect from rollout workers before we start - # sampling from replay buffers for learning. Whether we count this in agent - # steps or environment steps depends on config.multi_agent(count_steps_by=..). - self.num_steps_sampled_before_learning_starts = 1500 - - # .rollouts() - self.rollout_fragment_length = "auto" - self.compress_observations = False - - # __sphinx_doc_end__ - # fmt: on - - # Deprecated. 
- self.worker_side_prioritization = DEPRECATED_VALUE - - @override(AlgorithmConfig) - def training( - self, - *, - twin_q: Optional[bool] = NotProvided, - policy_delay: Optional[int] = NotProvided, - smooth_target_policy: Optional[bool] = NotProvided, - target_noise: Optional[bool] = NotProvided, - target_noise_clip: Optional[float] = NotProvided, - use_state_preprocessor: Optional[bool] = NotProvided, - actor_hiddens: Optional[List[int]] = NotProvided, - actor_hidden_activation: Optional[str] = NotProvided, - critic_hiddens: Optional[List[int]] = NotProvided, - critic_hidden_activation: Optional[str] = NotProvided, - n_step: Optional[int] = NotProvided, - critic_lr: Optional[float] = NotProvided, - actor_lr: Optional[float] = NotProvided, - tau: Optional[float] = NotProvided, - use_huber: Optional[bool] = NotProvided, - huber_threshold: Optional[float] = NotProvided, - l2_reg: Optional[float] = NotProvided, - training_intensity: Optional[float] = NotProvided, - **kwargs, - ) -> "DDPGConfig": - """Sets the training related configuration. - - === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks === - TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html - In addition to settings below, you can use "exploration_noise_type" and - "exploration_gauss_act_noise" to get IID Gaussian exploration noise - instead of OrnsteinUhlenbeck exploration noise. - - Args: - twin_q: Use twin Q-net. - policy_delay: Delayed policy update. - smooth_target_policy: Target policy smoothing (this also replaces - OrnsteinUhlenbeck exploration noise with IID Gaussian exploration - noise, for now). - target_noise: Gaussian stddev of target action noise for smoothing. - target_noise_clip: Target noise limit (bound). - use_state_preprocessor: Apply a state preprocessor with spec given by the - "model" config option - (like other RL algorithms). This is mostly useful if you have a weird - observation shape, like an image. Disabled by default. - actor_hiddens: Postprocess the policy network model output with these - hidden layers. If use_state_preprocessor is False, then these will - be the *only* hidden layers in the network. - actor_hidden_activation: Hidden layers activation of the postprocessing - stage of the policy network - critic_hiddens: Postprocess the critic network model output with these - hidden layers; again, if use_state_preprocessor is True, then the - state will be preprocessed by the model specified with the "model" - config option first. - critic_hidden_activation: Hidden layers activation of the postprocessing - state of the critic. - n_step: N-step Q learning - critic_lr: Learning rate for the critic (Q-function) optimizer. - actor_lr: Learning rate for the actor (policy) optimizer. - tau: Update the target by \tau * policy + (1-\tau) * target_policy - use_huber: Conventionally, no need to clip gradients if using a huber loss - huber_threshold: Threshold of a huber loss - l2_reg: Weights for L2 regularization - training_intensity: The intensity with which to update the model - (vs collecting samples from - the env). If None, uses the "natural" value of: - `train_batch_size` / (`rollout_fragment_length` x `num_workers` x - `num_envs_per_worker`). - If provided, will make sure that the ratio between ts inserted into and - sampled from the buffer matches the given value. 
- Example: - training_intensity=1000.0 - train_batch_size=250 rollout_fragment_length=1 - num_workers=1 (or 0) num_envs_per_worker=1 - -> natural value = 250 / 1 = 250.0 - -> will make sure that replay+train op will be executed 4x as - often as rollout+insert op (4 * 250 = 1000). - See: rllib/algorithms/dqn/dqn.py::calculate_rr_weights for further - details. - - Returns: - This updated DDPGConfig object. - """ - super().training(**kwargs) - - if twin_q is not NotProvided: - self.twin_q = twin_q - if policy_delay is not NotProvided: - self.policy_delay = policy_delay - if smooth_target_policy is not NotProvided: - self.smooth_target_policy = smooth_target_policy - if target_noise is not NotProvided: - self.target_noise = target_noise - if target_noise_clip is not NotProvided: - self.target_noise_clip = target_noise_clip - if use_state_preprocessor is not NotProvided: - self.use_state_preprocessor = use_state_preprocessor - if actor_hiddens is not NotProvided: - self.actor_hiddens = actor_hiddens - if actor_hidden_activation is not NotProvided: - self.actor_hidden_activation = actor_hidden_activation - if critic_hiddens is not NotProvided: - self.critic_hiddens = critic_hiddens - if critic_hidden_activation is not NotProvided: - self.critic_hidden_activation = critic_hidden_activation - if n_step is not NotProvided: - self.n_step = n_step - if critic_lr is not NotProvided: - self.critic_lr = critic_lr - if actor_lr is not NotProvided: - self.actor_lr = actor_lr - if tau is not NotProvided: - self.tau = tau - if use_huber is not NotProvided: - self.use_huber = use_huber - if huber_threshold is not NotProvided: - self.huber_threshold = huber_threshold - if l2_reg is not NotProvided: - self.l2_reg = l2_reg - if training_intensity is not NotProvided: - self.training_intensity = training_intensity - - return self - - @override(SimpleQConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - # Check rollout_fragment_length to be compatible with n_step. - if ( - not self.in_evaluation - and self.rollout_fragment_length != "auto" - and self.rollout_fragment_length < self.n_step - ): - raise ValueError( - f"Your `rollout_fragment_length` ({self.rollout_fragment_length}) is " - f"smaller than `n_step` ({self.n_step})! " - f"Try setting config.rollouts(rollout_fragment_length={self.n_step})." - ) - - if self.grad_clip is not None and self.grad_clip <= 0.0: - raise ValueError("`grad_clip` value must be > 0.0!") - - if self.exploration_config["type"] == "ParameterNoise": - if self.batch_mode != "complete_episodes": - raise ValueError( - "ParameterNoise Exploration requires `batch_mode` to be " - "'complete_episodes'. Try seting " - "config.training(batch_mode='complete_episodes')." 
- ) - - def get_rollout_fragment_length(self, worker_index: int = 0) -> int: - if self.rollout_fragment_length == "auto": - return self.n_step - else: - return self.rollout_fragment_length - - -class DDPG(SimpleQ): - @classmethod - @override(SimpleQ) - # TODO make this return a AlgorithmConfig - def get_default_config(cls) -> AlgorithmConfig: - return DDPGConfig() - - @classmethod - @override(SimpleQ) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - from ray.rllib.algorithms.ddpg.ddpg_torch_policy import DDPGTorchPolicy - - return DDPGTorchPolicy - elif config["framework"] == "tf": - from ray.rllib.algorithms.ddpg.ddpg_tf_policy import DDPGTF1Policy - - return DDPGTF1Policy - else: - from ray.rllib.algorithms.ddpg.ddpg_tf_policy import DDPGTF2Policy - - return DDPGTF2Policy diff --git a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_tf_model.py b/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_tf_model.py deleted file mode 100644 index 0e0577079cf80..0000000000000 --- a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_tf_model.py +++ /dev/null @@ -1,211 +0,0 @@ -from typing import List, Optional - -import gymnasium as gym -import numpy as np - -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.typing import ModelConfigDict, TensorType - -tf1, tf, tfv = try_import_tf() - - -class DDPGTFModel(TFModelV2): - """Extension of standard TFModel to provide DDPG action- and q-outputs. - - Data flow: - obs -> forward() -> model_out - model_out -> get_policy_output() -> deterministic actions - model_out, actions -> get_q_values() -> Q(s, a) - model_out, actions -> get_twin_q_values() -> Q_twin(s, a) - - Note that this class by itself is not a valid model unless you - implement forward() in a subclass.""" - - def __init__( - self, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - num_outputs: int, - model_config: ModelConfigDict, - name: str, - # Extra DDPGActionModel args: - actor_hiddens: Optional[List[int]] = None, - actor_hidden_activation: str = "relu", - critic_hiddens: Optional[List[int]] = None, - critic_hidden_activation: str = "relu", - twin_q: bool = False, - add_layer_norm: bool = False, - ): - """Initialize variables of this model. - - Extra model kwargs: - actor_hiddens: Defines size of hidden layers for the DDPG - policy head. - These will be used to postprocess the model output for the - purposes of computing deterministic actions. - - Note that the core layers for forward() are not defined here, this - only defines the layers for the DDPG head. Those layers for forward() - should be defined in subclasses of DDPGActionModel. 
- """ - - if actor_hiddens is None: - actor_hiddens = [256, 256] - - if critic_hiddens is None: - critic_hiddens = [256, 256] - - super(DDPGTFModel, self).__init__( - obs_space, action_space, num_outputs, model_config, name - ) - - actor_hidden_activation = getattr(tf.nn, actor_hidden_activation, tf.nn.relu) - critic_hidden_activation = getattr(tf.nn, critic_hidden_activation, tf.nn.relu) - - self.model_out = tf.keras.layers.Input(shape=(num_outputs,), name="model_out") - self.bounded = np.logical_and( - action_space.bounded_above, action_space.bounded_below - ).any() - self.action_dim = action_space.shape[0] - - if actor_hiddens: - last_layer = self.model_out - for i, n in enumerate(actor_hiddens): - last_layer = tf.keras.layers.Dense( - n, - name="actor_hidden_{}".format(i), - activation=actor_hidden_activation, - )(last_layer) - if add_layer_norm: - last_layer = tf.keras.layers.LayerNormalization( - name="LayerNorm_{}".format(i) - )(last_layer) - actor_out = tf.keras.layers.Dense( - self.action_dim, activation=None, name="actor_out" - )(last_layer) - else: - actor_out = self.model_out - - # Use sigmoid to scale to [0,1], but also double magnitude of input to - # emulate behaviour of tanh activation used in DDPG and TD3 papers. - # After sigmoid squashing, re-scale to env action space bounds. - def lambda_(x): - action_range = (action_space.high - action_space.low)[None] - low_action = action_space.low[None] - sigmoid_out = tf.nn.sigmoid(2 * x) - squashed = action_range * sigmoid_out + low_action - return squashed - - # Only squash if we have bounded actions. - if self.bounded: - actor_out = tf.keras.layers.Lambda(lambda_)(actor_out) - - self.policy_model = tf.keras.Model(self.model_out, actor_out) - - # Build the Q-model(s). - self.actions_input = tf.keras.layers.Input( - shape=(self.action_dim,), name="actions" - ) - - def build_q_net(name, observations, actions): - # For continuous actions: Feed obs and actions (concatenated) - # through the NN. - q_net = tf.keras.Sequential( - [ - tf.keras.layers.Concatenate(axis=1), - ] - + [ - tf.keras.layers.Dense( - units=units, - activation=critic_hidden_activation, - name="{}_hidden_{}".format(name, i), - ) - for i, units in enumerate(critic_hiddens) - ] - + [ - tf.keras.layers.Dense( - units=1, activation=None, name="{}_out".format(name) - ) - ] - ) - - q_net = tf.keras.Model( - [observations, actions], q_net([observations, actions]) - ) - return q_net - - self.q_model = build_q_net("q", self.model_out, self.actions_input) - - if twin_q: - self.twin_q_model = build_q_net( - "twin_q", self.model_out, self.actions_input - ) - else: - self.twin_q_model = None - - def get_q_values(self, model_out: TensorType, actions: TensorType) -> TensorType: - """Return the Q estimates for the most recent forward pass. - - This implements Q(s, a). - - Args: - model_out: obs embeddings from the model layers, of shape - [BATCH_SIZE, num_outputs]. - actions: Actions to return the Q-values for. - Shape: [BATCH_SIZE, action_dim]. - - Returns: - tensor of shape [BATCH_SIZE]. - """ - if actions is not None: - return self.q_model([model_out, actions]) - else: - return self.q_model(model_out) - - def get_twin_q_values( - self, model_out: TensorType, actions: TensorType - ) -> TensorType: - """Same as get_q_values but using the twin Q net. - - This implements the twin Q(s, a). - - Args: - model_out: obs embeddings from the model layers, of shape - [BATCH_SIZE, num_outputs]. - actions: Actions to return the Q-values for. - Shape: [BATCH_SIZE, action_dim]. 
- - Returns: - tensor of shape [BATCH_SIZE]. - """ - if actions is not None: - return self.twin_q_model([model_out, actions]) - else: - return self.twin_q_model(model_out) - - def get_policy_output(self, model_out: TensorType) -> TensorType: - """Return the action output for the most recent forward pass. - - This outputs the support for pi(s). For continuous action spaces, this - is the action directly. - - Args: - model_out: obs embeddings from the model layers, of shape - [BATCH_SIZE, num_outputs]. - - Returns: - tensor of shape [BATCH_SIZE, action_out_size] - """ - return self.policy_model(model_out) - - def policy_variables(self) -> List[TensorType]: - """Return the list of variables for the policy net.""" - return list(self.policy_model.variables) - - def q_variables(self) -> List[TensorType]: - """Return the list of variables for Q / twin Q nets.""" - - return self.q_model.variables + ( - self.twin_q_model.variables if self.twin_q_model else [] - ) diff --git a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_tf_policy.py b/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_tf_policy.py deleted file mode 100644 index 8cd1ecee6d07d..0000000000000 --- a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_tf_policy.py +++ /dev/null @@ -1,415 +0,0 @@ -import logging -from functools import partial -from typing import Any, Dict, List, Optional, Tuple, Type, Union - -import gymnasium as gym -from rllib_ddpg.ddpg.utils import make_ddpg_models, validate_spaces - -from ray.rllib.algorithms.dqn.dqn_tf_policy import ( - PRIO_WEIGHTS, - postprocess_nstep_and_prio, -) -from ray.rllib.evaluation import Episode -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_action_dist import ( - Deterministic, - Dirichlet, - TFActionDistribution, -) -from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import TargetNetworkMixin -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import get_variable, try_import_tf -from ray.rllib.utils.spaces.simplex import Simplex -from ray.rllib.utils.tf_utils import huber_loss, make_tf_callable -from ray.rllib.utils.typing import ( - AlgorithmConfigDict, - LocalOptimizer, - ModelGradients, - TensorType, -) -from ray.util.debug import log_once - -tf1, tf, tfv = try_import_tf() - -logger = logging.getLogger(__name__) - - -class ComputeTDErrorMixin: - def __init__(self: Union[DynamicTFPolicyV2, EagerTFPolicyV2]): - @make_tf_callable(self.get_session(), dynamic_shape=True) - def compute_td_error( - obs_t, act_t, rew_t, obs_tp1, terminateds_mask, importance_weights - ): - input_dict = SampleBatch( - { - SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_t), - SampleBatch.ACTIONS: tf.convert_to_tensor(act_t), - SampleBatch.REWARDS: tf.convert_to_tensor(rew_t), - SampleBatch.NEXT_OBS: tf.convert_to_tensor(obs_tp1), - SampleBatch.TERMINATEDS: tf.convert_to_tensor(terminateds_mask), - PRIO_WEIGHTS: tf.convert_to_tensor(importance_weights), - } - ) - # Do forward pass on loss to update td errors attribute - # (one TD-error value per item in batch to update PR weights). - self.loss(self.model, None, input_dict) - # `self.td_error` is set in loss_fn. - return self.td_error - - self.compute_td_error = compute_td_error - - -# We need this builder function because we want to share the same -# custom logics between TF1 dynamic and TF2 eager policies. 
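The builder-function comment above refers to a class-factory pattern: `get_ddpg_tf_policy` stamps out one policy class per TF execution backend while keeping the shared logic in a single closure. Below is a minimal, self-contained sketch of that pattern only; the base classes are hypothetical stand-ins, not RLlib's `DynamicTFPolicyV2` / `EagerTFPolicyV2`.

```python
class DynamicBase:
    def run(self):
        return "dynamic-graph execution"


class EagerBase:
    def run(self):
        return "eager execution"


def make_policy_class(name, base):
    """Return a subclass of `base` that carries the shared logic."""

    class _Policy(base):
        # Shared logic lives once, regardless of the execution backend.
        def loss(self):
            return f"{name}: shared loss on top of {self.run()}"

    # Give the generated class a readable name, as the removed file does.
    _Policy.__name__ = name
    _Policy.__qualname__ = name
    return _Policy


PolicyV1 = make_policy_class("PolicyV1", DynamicBase)
PolicyV2 = make_policy_class("PolicyV2", EagerBase)

print(PolicyV1().loss())  # PolicyV1: shared loss on top of dynamic-graph execution
print(PolicyV2().loss())  # PolicyV2: shared loss on top of eager execution
```

The design choice avoids duplicating the DDPG loss, optimizer, and gradient code across the TF1 and TF2 policy hierarchies, at the cost of defining the class inside a function.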
-def get_ddpg_tf_policy( - name: str, base: Type[Union[DynamicTFPolicyV2, EagerTFPolicyV2]] -) -> Type: - """Construct a DDPGTFPolicy inheriting either dynamic or eager base policies. - - Args: - base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. - Returns: - A TF Policy to be used with DDPG. - """ - - class DDPGTFPolicy(TargetNetworkMixin, ComputeTDErrorMixin, base): - def __init__( - self, - observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, - *, - existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None, - existing_model: Optional[ModelV2] = None, - ): - # First thing first, enable eager execution if necessary. - base.enable_eager_execution_if_necessary() - - # Validate action space for DDPG - validate_spaces(self, observation_space, action_space) - - base.__init__( - self, - observation_space, - action_space, - config, - existing_inputs=existing_inputs, - existing_model=existing_model, - ) - - ComputeTDErrorMixin.__init__(self) - - self.maybe_initialize_optimizer_and_loss() - - TargetNetworkMixin.__init__(self) - - @override(base) - def make_model(self) -> ModelV2: - return make_ddpg_models(self) - - @override(base) - def optimizer( - self, - ) -> List["tf.keras.optimizers.Optimizer"]: - """Create separate optimizers for actor & critic losses.""" - if self.config["framework"] == "tf2": - self.global_step = get_variable(0, tf_name="global_step") - self._actor_optimizer = tf.keras.optimizers.Adam( - learning_rate=self.config["actor_lr"] - ) - self._critic_optimizer = tf.keras.optimizers.Adam( - learning_rate=self.config["critic_lr"] - ) - # Static graph mode. - else: - self.global_step = tf1.train.get_or_create_global_step() - self._actor_optimizer = tf1.train.AdamOptimizer( - learning_rate=self.config["actor_lr"] - ) - self._critic_optimizer = tf1.train.AdamOptimizer( - learning_rate=self.config["critic_lr"] - ) - return [self._actor_optimizer, self._critic_optimizer] - - @override(base) - def compute_gradients_fn( - self, optimizer: LocalOptimizer, loss: TensorType - ) -> ModelGradients: - if self.config["framework"] == "tf2": - tape = optimizer.tape - pol_weights = self.model.policy_variables() - actor_grads_and_vars = list( - zip(tape.gradient(self.actor_loss, pol_weights), pol_weights) - ) - q_weights = self.model.q_variables() - critic_grads_and_vars = list( - zip(tape.gradient(self.critic_loss, q_weights), q_weights) - ) - else: - actor_grads_and_vars = self._actor_optimizer.compute_gradients( - self.actor_loss, var_list=self.model.policy_variables() - ) - critic_grads_and_vars = self._critic_optimizer.compute_gradients( - self.critic_loss, var_list=self.model.q_variables() - ) - - # Clip if necessary. - if self.config["grad_clip"]: - clip_func = partial(tf.clip_by_norm, clip_norm=self.config["grad_clip"]) - else: - clip_func = tf.identity - - # Save grads and vars for later use in `build_apply_op`. - self._actor_grads_and_vars = [ - (clip_func(g), v) for (g, v) in actor_grads_and_vars if g is not None - ] - self._critic_grads_and_vars = [ - (clip_func(g), v) for (g, v) in critic_grads_and_vars if g is not None - ] - - grads_and_vars = self._actor_grads_and_vars + self._critic_grads_and_vars - - return grads_and_vars - - @override(base) - def apply_gradients_fn( - self, - optimizer: "tf.keras.optimizers.Optimizer", - grads: ModelGradients, - ) -> "tf.Operation": - # For policy gradient, update policy net one time v.s. - # update critic net `policy_delay` time(s). 
- should_apply_actor_opt = tf.equal( - tf.math.floormod(self.global_step, self.config["policy_delay"]), 0 - ) - - def make_apply_op(): - return self._actor_optimizer.apply_gradients(self._actor_grads_and_vars) - - actor_op = tf.cond( - should_apply_actor_opt, - true_fn=make_apply_op, - false_fn=lambda: tf.constant(0, dtype=tf.int64), # this is a no-op - ) - critic_op = self._critic_optimizer.apply_gradients( - self._critic_grads_and_vars - ) - # Increment global step & apply ops. - if self.config["framework"] == "tf2": - self.global_step.assign_add(1) - return tf.no_op() - else: - with tf1.control_dependencies([tf1.assign_add(self.global_step, 1)]): - return tf.group(actor_op, critic_op) - - @override(base) - def action_distribution_fn( - self, - model: ModelV2, - *, - obs_batch: TensorType, - state_batches: TensorType, - is_training: bool = False, - **kwargs, - ) -> Tuple[TensorType, type, List[TensorType]]: - model_out, _ = model(SampleBatch(obs=obs_batch, _is_training=is_training)) - dist_inputs = model.get_policy_output(model_out) - - if isinstance(self.action_space, Simplex): - distr_class = Dirichlet - else: - distr_class = Deterministic - return dist_inputs, distr_class, [] # []=state out - - @override(base) - def postprocess_trajectory( - self, - sample_batch: SampleBatch, - other_agent_batches: Optional[Dict[Any, SampleBatch]] = None, - episode: Optional[Episode] = None, - ) -> SampleBatch: - return postprocess_nstep_and_prio( - self, sample_batch, other_agent_batches, episode - ) - - @override(base) - def loss( - self, - model: Union[ModelV2, "tf.keras.Model"], - dist_class: Type[TFActionDistribution], - train_batch: SampleBatch, - ) -> TensorType: - twin_q = self.config["twin_q"] - gamma = self.config["gamma"] - n_step = self.config["n_step"] - use_huber = self.config["use_huber"] - huber_threshold = self.config["huber_threshold"] - l2_reg = self.config["l2_reg"] - - input_dict = SampleBatch( - obs=train_batch[SampleBatch.CUR_OBS], _is_training=True - ) - input_dict_next = SampleBatch( - obs=train_batch[SampleBatch.NEXT_OBS], _is_training=True - ) - - model_out_t, _ = model(input_dict, [], None) - model_out_tp1, _ = model(input_dict_next, [], None) - target_model_out_tp1, _ = self.target_model(input_dict_next, [], None) - - self._target_q_func_vars = self.target_model.variables() - - # Policy network evaluation. - policy_t = model.get_policy_output(model_out_t) - policy_tp1 = self.target_model.get_policy_output(target_model_out_tp1) - - # Action outputs. - if self.config["smooth_target_policy"]: - target_noise_clip = self.config["target_noise_clip"] - clipped_normal_sample = tf.clip_by_value( - tf.random.normal( - tf.shape(policy_tp1), stddev=self.config["target_noise"] - ), - -target_noise_clip, - target_noise_clip, - ) - policy_tp1_smoothed = tf.clip_by_value( - policy_tp1 + clipped_normal_sample, - self.action_space.low * tf.ones_like(policy_tp1), - self.action_space.high * tf.ones_like(policy_tp1), - ) - else: - # No smoothing, just use deterministic actions. - policy_tp1_smoothed = policy_tp1 - - # Q-net(s) evaluation. 
- # prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - # Q-values for given actions & observations in given current - q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]) - - # Q-values for current policy (no noise) in given current state - q_t_det_policy = model.get_q_values(model_out_t, policy_t) - - if twin_q: - twin_q_t = model.get_twin_q_values( - model_out_t, train_batch[SampleBatch.ACTIONS] - ) - - # Target q-net(s) evaluation. - q_tp1 = self.target_model.get_q_values( - target_model_out_tp1, policy_tp1_smoothed - ) - - if twin_q: - twin_q_tp1 = self.target_model.get_twin_q_values( - target_model_out_tp1, policy_tp1_smoothed - ) - - q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1) - if twin_q: - twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1) - q_tp1 = tf.minimum(q_tp1, twin_q_tp1) - - q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) - q_tp1_best_masked = ( - 1.0 - tf.cast(train_batch[SampleBatch.TERMINATEDS], tf.float32) - ) * q_tp1_best - - # Compute RHS of bellman equation. - q_t_selected_target = tf.stop_gradient( - tf.cast(train_batch[SampleBatch.REWARDS], tf.float32) - + gamma**n_step * q_tp1_best_masked - ) - - # Compute the error (potentially clipped). - if twin_q: - td_error = q_t_selected - q_t_selected_target - twin_td_error = twin_q_t_selected - q_t_selected_target - if use_huber: - errors = huber_loss(td_error, huber_threshold) + huber_loss( - twin_td_error, huber_threshold - ) - else: - errors = 0.5 * tf.math.square(td_error) + 0.5 * tf.math.square( - twin_td_error - ) - else: - td_error = q_t_selected - q_t_selected_target - if use_huber: - errors = huber_loss(td_error, huber_threshold) - else: - errors = 0.5 * tf.math.square(td_error) - - critic_loss = tf.reduce_mean( - tf.cast(train_batch[PRIO_WEIGHTS], tf.float32) * errors - ) - actor_loss = -tf.reduce_mean(q_t_det_policy) - - # Add l2-regularization if required. - if l2_reg is not None: - for var in self.model.policy_variables(): - if "bias" not in var.name: - actor_loss += l2_reg * tf.nn.l2_loss(var) - for var in self.model.q_variables(): - if "bias" not in var.name: - critic_loss += l2_reg * tf.nn.l2_loss(var) - - # Model self-supervised losses. - if self.config["use_state_preprocessor"]: - # Expand input_dict in case custom_loss' need them. - input_dict[SampleBatch.ACTIONS] = train_batch[SampleBatch.ACTIONS] - input_dict[SampleBatch.REWARDS] = train_batch[SampleBatch.REWARDS] - input_dict[SampleBatch.TERMINATEDS] = train_batch[ - SampleBatch.TERMINATEDS - ] - input_dict[SampleBatch.NEXT_OBS] = train_batch[SampleBatch.NEXT_OBS] - if log_once("ddpg_custom_loss"): - logger.warning( - "You are using a state-preprocessor with DDPG and " - "therefore, `custom_loss` will be called on your Model! " - "Please be aware that DDPG now uses the ModelV2 API, which " - "merges all previously separate sub-models (policy_model, " - "q_model, and twin_q_model) into one ModelV2, on which " - "`custom_loss` is called, passing it " - "[actor_loss, critic_loss] as 1st argument. " - "You may have to change your custom loss function to handle " - "this." - ) - [actor_loss, critic_loss] = model.custom_loss( - [actor_loss, critic_loss], input_dict - ) - - # Store values for stats function. - self.actor_loss = actor_loss - self.critic_loss = critic_loss - self.td_error = td_error - self.q_t = q_t - - # Return one loss value (even though we treat them separately in our - # 2 optimizers: actor and critic). 
- return self.critic_loss + self.actor_loss - - @override(base) - def extra_learn_fetches_fn(self) -> Dict[str, Any]: - return {"td_error": self.td_error} - - @override(base) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - stats = { - "mean_q": tf.reduce_mean(self.q_t), - "max_q": tf.reduce_max(self.q_t), - "min_q": tf.reduce_min(self.q_t), - } - return stats - - DDPGTFPolicy.__name__ = name - DDPGTFPolicy.__qualname__ = name - - return DDPGTFPolicy - - -DDPGTF1Policy = get_ddpg_tf_policy("DDPGTF1Policy", DynamicTFPolicyV2) -DDPGTF2Policy = get_ddpg_tf_policy("DDPGTF2Policy", EagerTFPolicyV2) diff --git a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_torch_model.py b/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_torch_model.py deleted file mode 100644 index d3fe93ab4b3bf..0000000000000 --- a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_torch_model.py +++ /dev/null @@ -1,236 +0,0 @@ -from typing import Dict, List, Optional, Union - -import gymnasium as gym -import numpy as np - -from ray.rllib.models.torch.misc import SlimFC -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.models.utils import get_activation_fn -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import ModelConfigDict, TensorType - -torch, nn = try_import_torch() - - -class DDPGTorchModel(TorchModelV2, nn.Module): - """Extension of standard TorchModelV2 for DDPG. - - Data flow: - obs -> forward() -> model_out - model_out -> get_policy_output() -> pi(s) - model_out, actions -> get_q_values() -> Q(s, a) - model_out, actions -> get_twin_q_values() -> Q_twin(s, a) - - Note that this class by itself is not a valid model unless you - implement forward() in a subclass.""" - - def __init__( - self, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - num_outputs: int, - model_config: ModelConfigDict, - name: str, - # Extra DDPGActionModel args: - actor_hiddens: Optional[List[int]] = None, - actor_hidden_activation: str = "relu", - critic_hiddens: Optional[List[int]] = None, - critic_hidden_activation: str = "relu", - twin_q: bool = False, - add_layer_norm: bool = False, - ): - """Initialize variables of this model. - - Extra model kwargs: - actor_hidden_activation: activation for actor network - actor_hiddens: hidden layers sizes for actor network - critic_hidden_activation: activation for critic network - critic_hiddens: hidden layers sizes for critic network - twin_q: build twin Q networks. - add_layer_norm: Enable layer norm (for param noise). - - Note that the core layers for forward() are not defined here, this - only defines the layers for the output heads. Those layers for - forward() should be defined in subclasses of DDPGTorchModel. - """ - if actor_hiddens is None: - actor_hiddens = [256, 256] - - if critic_hiddens is None: - critic_hiddens = [256, 256] - - nn.Module.__init__(self) - super(DDPGTorchModel, self).__init__( - obs_space, action_space, num_outputs, model_config, name - ) - - self.bounded = np.logical_and( - self.action_space.bounded_above, self.action_space.bounded_below - ).any() - self.action_dim = np.prod(self.action_space.shape) - - # Build the policy network. 
- self.policy_model = nn.Sequential() - ins = num_outputs - self.obs_ins = ins - activation = get_activation_fn(actor_hidden_activation, framework="torch") - for i, n in enumerate(actor_hiddens): - self.policy_model.add_module( - "action_{}".format(i), - SlimFC( - ins, - n, - initializer=torch.nn.init.xavier_uniform_, - activation_fn=activation, - ), - ) - # Add LayerNorm after each Dense. - if add_layer_norm: - self.policy_model.add_module( - "LayerNorm_A_{}".format(i), nn.LayerNorm(n) - ) - ins = n - - self.policy_model.add_module( - "action_out", - SlimFC( - ins, - self.action_dim, - initializer=torch.nn.init.xavier_uniform_, - activation_fn=None, - ), - ) - - # Use sigmoid to scale to [0,1], but also double magnitude of input to - # emulate behaviour of tanh activation used in DDPG and TD3 papers. - # After sigmoid squashing, re-scale to env action space bounds. - class _Lambda(nn.Module): - def __init__(self_): - super().__init__() - low_action = nn.Parameter( - torch.from_numpy(self.action_space.low).float() - ) - low_action.requires_grad = False - self_.register_parameter("low_action", low_action) - action_range = nn.Parameter( - torch.from_numpy( - self.action_space.high - self.action_space.low - ).float() - ) - action_range.requires_grad = False - self_.register_parameter("action_range", action_range) - - def forward(self_, x): - sigmoid_out = nn.Sigmoid()(2.0 * x) - squashed = self_.action_range * sigmoid_out + self_.low_action - return squashed - - # Only squash if we have bounded actions. - if self.bounded: - self.policy_model.add_module("action_out_squashed", _Lambda()) - - # Build the Q-net(s), including target Q-net(s). - def build_q_net(name_): - activation = get_activation_fn(critic_hidden_activation, framework="torch") - # For continuous actions: Feed obs and actions (concatenated) - # through the NN. For discrete actions, only obs. - q_net = nn.Sequential() - ins = self.obs_ins + self.action_dim - for i, n in enumerate(critic_hiddens): - q_net.add_module( - "{}_hidden_{}".format(name_, i), - SlimFC( - ins, - n, - initializer=torch.nn.init.xavier_uniform_, - activation_fn=activation, - ), - ) - ins = n - - q_net.add_module( - "{}_out".format(name_), - SlimFC( - ins, - 1, - initializer=torch.nn.init.xavier_uniform_, - activation_fn=None, - ), - ) - return q_net - - self.q_model = build_q_net("q") - if twin_q: - self.twin_q_model = build_q_net("twin_q") - else: - self.twin_q_model = None - - def get_q_values(self, model_out: TensorType, actions: TensorType) -> TensorType: - """Return the Q estimates for the most recent forward pass. - - This implements Q(s, a). - - Args: - model_out: obs embeddings from the model layers, of shape - [BATCH_SIZE, num_outputs]. - actions: Actions to return the Q-values for. - Shape: [BATCH_SIZE, action_dim]. - - Returns: - tensor of shape [BATCH_SIZE]. - """ - return self.q_model(torch.cat([model_out, actions], -1)) - - def get_twin_q_values( - self, model_out: TensorType, actions: TensorType - ) -> TensorType: - """Same as get_q_values but using the twin Q net. - - This implements the twin Q(s, a). - - Args: - model_out: obs embeddings from the model layers, of shape - [BATCH_SIZE, num_outputs]. - actions (Optional[Tensor]): Actions to return the Q-values for. - Shape: [BATCH_SIZE, action_dim]. - - Returns: - tensor of shape [BATCH_SIZE]. - """ - return self.twin_q_model(torch.cat([model_out, actions], -1)) - - def get_policy_output(self, model_out: TensorType) -> TensorType: - """Return the action output for the most recent forward pass. 
- - This outputs the support for pi(s). For continuous action spaces, this - is the action directly. For discrete, is is the mean / std dev. - - Args: - model_out: obs embeddings from the model layers, of shape - [BATCH_SIZE, num_outputs]. - - Returns: - tensor of shape [BATCH_SIZE, action_out_size] - """ - return self.policy_model(model_out) - - def policy_variables( - self, as_dict: bool = False - ) -> Union[List[TensorType], Dict[str, TensorType]]: - """Return the list of variables for the policy net.""" - if as_dict: - return self.policy_model.state_dict() - return list(self.policy_model.parameters()) - - def q_variables( - self, as_dict=False - ) -> Union[List[TensorType], Dict[str, TensorType]]: - """Return the list of variables for Q / twin Q nets.""" - if as_dict: - return { - **self.q_model.state_dict(), - **(self.twin_q_model.state_dict() if self.twin_q_model else {}), - } - return list(self.q_model.parameters()) + ( - list(self.twin_q_model.parameters()) if self.twin_q_model else [] - ) diff --git a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_torch_policy.py b/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_torch_policy.py deleted file mode 100644 index 260ab760ce1ec..0000000000000 --- a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/ddpg_torch_policy.py +++ /dev/null @@ -1,350 +0,0 @@ -import logging -from typing import Any, Dict, List, Optional, Tuple, Type - -import gymnasium as gym -from rllib_ddpg.ddpg.utils import make_ddpg_models, validate_spaces - -import ray -from ray.rllib.algorithms.dqn.dqn_tf_policy import ( - PRIO_WEIGHTS, - postprocess_nstep_and_prio, -) -from ray.rllib.evaluation import Episode -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_action_dist import ( - TorchDeterministic, - TorchDirichlet, - TorchDistributionWrapper, -) -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import TargetNetworkMixin -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.spaces.simplex import Simplex -from ray.rllib.utils.torch_utils import ( - apply_grad_clipping, - concat_multi_gpu_td_errors, - huber_loss, - l2_loss, -) -from ray.rllib.utils.typing import AlgorithmConfigDict, ModelGradients, TensorType - -torch, nn = try_import_torch() - -logger = logging.getLogger(__name__) - - -class ComputeTDErrorMixin: - def __init__(self: TorchPolicyV2): - def compute_td_error( - obs_t, act_t, rew_t, obs_tp1, terminateds_mask, importance_weights - ): - input_dict = self._lazy_tensor_dict( - SampleBatch( - { - SampleBatch.CUR_OBS: obs_t, - SampleBatch.ACTIONS: act_t, - SampleBatch.REWARDS: rew_t, - SampleBatch.NEXT_OBS: obs_tp1, - SampleBatch.TERMINATEDS: terminateds_mask, - PRIO_WEIGHTS: importance_weights, - } - ) - ) - # Do forward pass on loss to update td errors attribute - # (one TD-error value per item in batch to update PR weights). - self.loss(self.model, None, input_dict) - - # `self.model.td_error` is set within actor_critic_loss call. 
- return self.model.tower_stats["td_error"] - - self.compute_td_error = compute_td_error - - -class DDPGTorchPolicy(TargetNetworkMixin, ComputeTDErrorMixin, TorchPolicyV2): - def __init__( - self, - observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, - ): - config = dict(ray.rllib.algorithms.ddpg.ddpg.DDPGConfig().to_dict(), **config) - - # Create global step for counting the number of update operations. - self.global_step = 0 - - # Validate action space for DDPG - validate_spaces(self, observation_space, action_space) - - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config["model"]["max_seq_len"], - ) - - ComputeTDErrorMixin.__init__(self) - - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() - - TargetNetworkMixin.__init__(self) - - @override(TorchPolicyV2) - def make_model_and_action_dist( - self, - ) -> Tuple[ModelV2, Type[TorchDistributionWrapper]]: - model = make_ddpg_models(self) - if isinstance(self.action_space, Simplex): - distr_class = TorchDirichlet - else: - distr_class = TorchDeterministic - return model, distr_class - - @override(TorchPolicyV2) - def optimizer( - self, - ) -> List["torch.optim.Optimizer"]: - """Create separate optimizers for actor & critic losses.""" - - # Set epsilons to match tf.keras.optimizers.Adam's epsilon default. - self._actor_optimizer = torch.optim.Adam( - params=self.model.policy_variables(), lr=self.config["actor_lr"], eps=1e-7 - ) - - self._critic_optimizer = torch.optim.Adam( - params=self.model.q_variables(), lr=self.config["critic_lr"], eps=1e-7 - ) - - # Return them in the same order as the respective loss terms are returned. - return [self._actor_optimizer, self._critic_optimizer] - - @override(TorchPolicyV2) - def apply_gradients(self, gradients: ModelGradients) -> None: - # For policy gradient, update policy net one time v.s. - # update critic net `policy_delay` time(s). - if self.global_step % self.config["policy_delay"] == 0: - self._actor_optimizer.step() - - self._critic_optimizer.step() - - # Increment global step & apply ops. 
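As a side note on the `policy_delay` counter used in `apply_gradients` above: the critic optimizer steps on every call, while the actor optimizer only steps when the global step is a multiple of `policy_delay`. A tiny standalone sketch of that counting logic (plain Python, no RLlib; the function name is ours):

```python
def count_updates(n_calls, policy_delay=2):
    """Count actor vs. critic optimizer steps under delayed policy updates."""
    actor_steps = critic_steps = 0
    for global_step in range(n_calls):
        if global_step % policy_delay == 0:
            actor_steps += 1   # actor (policy) update
        critic_steps += 1      # critic (Q-function) update
    return actor_steps, critic_steps


print(count_updates(10))  # -> (5, 10): one actor step per two critic steps
```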
- self.global_step += 1 - - @override(TorchPolicyV2) - def action_distribution_fn( - self, - model: ModelV2, - *, - obs_batch: TensorType, - state_batches: TensorType, - is_training: bool = False, - **kwargs - ) -> Tuple[TensorType, type, List[TensorType]]: - model_out, _ = model( - SampleBatch(obs=obs_batch[SampleBatch.CUR_OBS], _is_training=is_training) - ) - dist_inputs = model.get_policy_output(model_out) - - if isinstance(self.action_space, Simplex): - distr_class = TorchDirichlet - else: - distr_class = TorchDeterministic - return dist_inputs, distr_class, [] # []=state out - - @override(TorchPolicyV2) - def postprocess_trajectory( - self, - sample_batch: SampleBatch, - other_agent_batches: Optional[Dict[Any, SampleBatch]] = None, - episode: Optional[Episode] = None, - ) -> SampleBatch: - return postprocess_nstep_and_prio( - self, sample_batch, other_agent_batches, episode - ) - - @override(TorchPolicyV2) - def loss( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> List[TensorType]: - target_model = self.target_models[model] - - twin_q = self.config["twin_q"] - gamma = self.config["gamma"] - n_step = self.config["n_step"] - use_huber = self.config["use_huber"] - huber_threshold = self.config["huber_threshold"] - l2_reg = self.config["l2_reg"] - - input_dict = SampleBatch( - obs=train_batch[SampleBatch.CUR_OBS], _is_training=True - ) - input_dict_next = SampleBatch( - obs=train_batch[SampleBatch.NEXT_OBS], _is_training=True - ) - - model_out_t, _ = model(input_dict, [], None) - model_out_tp1, _ = model(input_dict_next, [], None) - target_model_out_tp1, _ = target_model(input_dict_next, [], None) - - # Policy network evaluation. - # prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - policy_t = model.get_policy_output(model_out_t) - # policy_batchnorm_update_ops = list( - # set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) - - policy_tp1 = target_model.get_policy_output(target_model_out_tp1) - - # Action outputs. - if self.config["smooth_target_policy"]: - target_noise_clip = self.config["target_noise_clip"] - clipped_normal_sample = torch.clamp( - torch.normal( - mean=torch.zeros(policy_tp1.size()), std=self.config["target_noise"] - ).to(policy_tp1.device), - -target_noise_clip, - target_noise_clip, - ) - - policy_tp1_smoothed = torch.min( - torch.max( - policy_tp1 + clipped_normal_sample, - torch.tensor( - self.action_space.low, - dtype=torch.float32, - device=policy_tp1.device, - ), - ), - torch.tensor( - self.action_space.high, - dtype=torch.float32, - device=policy_tp1.device, - ), - ) - else: - # No smoothing, just use deterministic actions. - policy_tp1_smoothed = policy_tp1 - - # Q-net(s) evaluation. - # prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - # Q-values for given actions & observations in given current - q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]) - - # Q-values for current policy (no noise) in given current state - q_t_det_policy = model.get_q_values(model_out_t, policy_t) - - actor_loss = -torch.mean(q_t_det_policy) - - if twin_q: - twin_q_t = model.get_twin_q_values( - model_out_t, train_batch[SampleBatch.ACTIONS] - ) - # q_batchnorm_update_ops = list( - # set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) - - # Target q-net(s) evaluation. 
- q_tp1 = target_model.get_q_values(target_model_out_tp1, policy_tp1_smoothed) - - if twin_q: - twin_q_tp1 = target_model.get_twin_q_values( - target_model_out_tp1, policy_tp1_smoothed - ) - - q_t_selected = torch.squeeze(q_t, axis=len(q_t.shape) - 1) - if twin_q: - twin_q_t_selected = torch.squeeze(twin_q_t, axis=len(q_t.shape) - 1) - q_tp1 = torch.min(q_tp1, twin_q_tp1) - - q_tp1_best = torch.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) - q_tp1_best_masked = ( - 1.0 - train_batch[SampleBatch.TERMINATEDS].float() - ) * q_tp1_best - - # Compute RHS of bellman equation. - q_t_selected_target = ( - train_batch[SampleBatch.REWARDS] + gamma**n_step * q_tp1_best_masked - ).detach() - - # Compute the error (potentially clipped). - if twin_q: - td_error = q_t_selected - q_t_selected_target - twin_td_error = twin_q_t_selected - q_t_selected_target - if use_huber: - errors = huber_loss(td_error, huber_threshold) + huber_loss( - twin_td_error, huber_threshold - ) - else: - errors = 0.5 * ( - torch.pow(td_error, 2.0) + torch.pow(twin_td_error, 2.0) - ) - else: - td_error = q_t_selected - q_t_selected_target - if use_huber: - errors = huber_loss(td_error, huber_threshold) - else: - errors = 0.5 * torch.pow(td_error, 2.0) - - critic_loss = torch.mean(train_batch[PRIO_WEIGHTS] * errors) - - # Add l2-regularization if required. - if l2_reg is not None: - for name, var in model.policy_variables(as_dict=True).items(): - if "bias" not in name: - actor_loss += l2_reg * l2_loss(var) - for name, var in model.q_variables(as_dict=True).items(): - if "bias" not in name: - critic_loss += l2_reg * l2_loss(var) - - # Model self-supervised losses. - if self.config["use_state_preprocessor"]: - # Expand input_dict in case custom_loss' need them. - input_dict[SampleBatch.ACTIONS] = train_batch[SampleBatch.ACTIONS] - input_dict[SampleBatch.REWARDS] = train_batch[SampleBatch.REWARDS] - input_dict[SampleBatch.TERMINATEDS] = train_batch[SampleBatch.TERMINATEDS] - input_dict[SampleBatch.NEXT_OBS] = train_batch[SampleBatch.NEXT_OBS] - [actor_loss, critic_loss] = model.custom_loss( - [actor_loss, critic_loss], input_dict - ) - - # Store values for stats function in model (tower), such that for - # multi-GPU, we do not override them during the parallel loss phase. - model.tower_stats["q_t"] = q_t - model.tower_stats["actor_loss"] = actor_loss - model.tower_stats["critic_loss"] = critic_loss - # TD-error tensor in final stats - # will be concatenated and retrieved for each individual batch item. - model.tower_stats["td_error"] = td_error - - # Return two loss terms (corresponding to the two optimizers, we create). - return [actor_loss, critic_loss] - - @override(TorchPolicyV2) - def extra_grad_process( - self, optimizer: torch.optim.Optimizer, loss: TensorType - ) -> Dict[str, TensorType]: - # Clip grads if configured. 
- return apply_grad_clipping(self, optimizer, loss) - - @override(TorchPolicyV2) - def extra_compute_grad_fetches(self) -> Dict[str, Any]: - fetches = convert_to_numpy(concat_multi_gpu_td_errors(self)) - return dict({LEARNER_STATS_KEY: {}}, **fetches) - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - q_t = torch.stack(self.get_tower_stats("q_t")) - stats = { - "actor_loss": torch.mean(torch.stack(self.get_tower_stats("actor_loss"))), - "critic_loss": torch.mean(torch.stack(self.get_tower_stats("critic_loss"))), - "mean_q": torch.mean(q_t), - "max_q": torch.max(q_t), - "min_q": torch.min(q_t), - } - return convert_to_numpy(stats) diff --git a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/noop_model.py b/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/noop_model.py deleted file mode 100644 index 4dba83b9d4d49..0000000000000 --- a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/noop_model.py +++ /dev/null @@ -1,27 +0,0 @@ -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() - - -class NoopModel(TFModelV2): - """Trivial model that just returns the obs flattened. - - This is the model used if use_state_preprocessor=False.""" - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - return tf.cast(input_dict["obs_flat"], tf.float32), state - - -class TorchNoopModel(TorchModelV2): - """Trivial model that just returns the obs flattened. - - This is the model used if use_state_preprocessor=False.""" - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - return input_dict["obs_flat"].float(), state diff --git a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/utils.py b/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/utils.py deleted file mode 100644 index a2cdd6633a490..0000000000000 --- a/rllib_contrib/ddpg/src/rllib_ddpg/ddpg/utils.py +++ /dev/null @@ -1,85 +0,0 @@ -import gymnasium as gym -import numpy as np -from rllib_ddpg.ddpg.ddpg_tf_model import DDPGTFModel -from rllib_ddpg.ddpg.ddpg_torch_model import DDPGTorchModel -from rllib_ddpg.ddpg.noop_model import NoopModel, TorchNoopModel - -from ray.rllib import Policy -from ray.rllib.models import ModelV2 -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.error import UnsupportedSpaceException - - -def make_ddpg_models(policy: Policy) -> ModelV2: - if policy.config["use_state_preprocessor"]: - default_model = None # catalog decides - num_outputs = 256 # arbitrary - policy.config["model"]["no_final_linear"] = True - else: - default_model = ( - TorchNoopModel if policy.config["framework"] == "torch" else NoopModel - ) - num_outputs = int(np.prod(policy.observation_space.shape)) - - model = ModelCatalog.get_model_v2( - obs_space=policy.observation_space, - action_space=policy.action_space, - num_outputs=num_outputs, - model_config=policy.config["model"], - framework=policy.config["framework"], - model_interface=( - DDPGTorchModel if policy.config["framework"] == "torch" else DDPGTFModel - ), - default_model=default_model, - name="ddpg_model", - actor_hidden_activation=policy.config["actor_hidden_activation"], - actor_hiddens=policy.config["actor_hiddens"], - critic_hidden_activation=policy.config["critic_hidden_activation"], - critic_hiddens=policy.config["critic_hiddens"], - twin_q=policy.config["twin_q"], - add_layer_norm=( - 
policy.config["exploration_config"].get("type") == "ParameterNoise" - ), - ) - - policy.target_model = ModelCatalog.get_model_v2( - obs_space=policy.observation_space, - action_space=policy.action_space, - num_outputs=num_outputs, - model_config=policy.config["model"], - framework=policy.config["framework"], - model_interface=( - DDPGTorchModel if policy.config["framework"] == "torch" else DDPGTFModel - ), - default_model=default_model, - name="target_ddpg_model", - actor_hidden_activation=policy.config["actor_hidden_activation"], - actor_hiddens=policy.config["actor_hiddens"], - critic_hidden_activation=policy.config["critic_hidden_activation"], - critic_hiddens=policy.config["critic_hiddens"], - twin_q=policy.config["twin_q"], - add_layer_norm=( - policy.config["exploration_config"].get("type") == "ParameterNoise" - ), - ) - - return model - - -def validate_spaces( - policy: Policy, - observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, -) -> None: - if not isinstance(action_space, gym.spaces.Box): - raise UnsupportedSpaceException( - "Action space ({}) of {} is not supported for " - "DDPG.".format(action_space, policy) - ) - elif len(action_space.shape) > 1: - raise UnsupportedSpaceException( - "Action space ({}) of {} has multiple dimensions " - "{}. ".format(action_space, policy, action_space.shape) - + "Consider reshaping this into a single dimension, " - "using a Tuple action space, or the multi-agent API." - ) diff --git a/rllib_contrib/ddpg/tests/test_ddpg.py b/rllib_contrib/ddpg/tests/test_ddpg.py deleted file mode 100644 index a3f86703e8f53..0000000000000 --- a/rllib_contrib/ddpg/tests/test_ddpg.py +++ /dev/null @@ -1,634 +0,0 @@ -import copy -import re -import unittest - -import gymnasium as gym -import numpy as np -from gymnasium.spaces import Box -from rllib_ddpg.ddpg import DDPGConfig - -import ray -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.numpy import ( - convert_to_numpy, - fc, - huber_loss, - l2_loss, - relu, - sigmoid, -) -from ray.rllib.utils.replay_buffers.utils import patch_buffer_with_fake_sampling_method -from ray.rllib.utils.spaces.simplex import Simplex -from ray.rllib.utils.test_utils import ( - check, - check_compute_single_action, - check_train_results, - framework_iterator, -) -from ray.rllib.utils.torch_utils import convert_to_torch_tensor - -tf1, tf, tfv = try_import_tf() -torch, _ = try_import_torch() - - -class SimpleEnv(gym.Env): - def __init__(self, config): - self._skip_env_checking = True - if config.get("simplex_actions", False): - self.action_space = Simplex((2,)) - else: - self.action_space = Box(0.0, 1.0, (1,)) - self.observation_space = Box(0.0, 1.0, (1,)) - self.max_steps = config.get("max_steps", 100) - self.state = None - self.steps = None - - def reset(self, *, seed=None, options=None): - self.state = self.observation_space.sample() - self.steps = 0 - return self.state, {} - - def step(self, action): - self.steps += 1 - # Reward is 1.0 - (max(actions) - state). 
- [rew] = 1.0 - np.abs(np.max(action) - self.state) - terminated = False - truncated = self.steps >= self.max_steps - self.state = self.observation_space.sample() - return self.state, rew, terminated, truncated, {} - - -class TestDDPG(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_ddpg_compilation(self): - """Test whether DDPG can be built with both frameworks.""" - config = ( - DDPGConfig() - .training(num_steps_sampled_before_learning_starts=0) - .rollouts(num_rollout_workers=0, num_envs_per_worker=2) - .exploration(exploration_config={"random_timesteps": 100}) - ) - - num_iterations = 1 - - # Test against all frameworks. - for _ in framework_iterator(config, with_eager_tracing=True): - algo = config.build(env="Pendulum-v1") - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - # Ensure apply_gradient_fn is being called and updating global_step - pol = algo.get_policy() - if config.framework_str == "tf": - a = pol.get_session().run(pol.global_step) - else: - a = pol.global_step - check(convert_to_numpy(a), 500) - algo.stop() - - def test_ddpg_exploration_and_with_random_prerun(self): - """Tests DDPG's Exploration (w/ random actions for n timesteps).""" - - core_config = ( - DDPGConfig() - .environment("Pendulum-v1") - .rollouts(num_rollout_workers=0) - .training(num_steps_sampled_before_learning_starts=0) - ) - - obs = np.array([0.0, 0.1, -0.1]) - - # Test against all frameworks. - for _ in framework_iterator(core_config): - config = copy.deepcopy(core_config) - # Default OUNoise setup. - algo = config.build() - # Setting explore=False should always return the same action. - a_ = algo.compute_single_action(obs, explore=False) - check(convert_to_numpy(algo.get_policy().global_timestep), 1) - for i in range(50): - a = algo.compute_single_action(obs, explore=False) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 2) - check(a, a_) - # explore=None (default: explore) should return different actions. - actions = [] - for i in range(50): - actions.append(algo.compute_single_action(obs)) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 52) - check(np.std(actions), 0.0, false=True) - algo.stop() - - # Check randomness at beginning. - config.exploration( - exploration_config={ - # Act randomly at beginning ... - "random_timesteps": 50, - # Then act very closely to deterministic actions thereafter. - "ou_base_scale": 0.001, - "initial_scale": 0.001, - "final_scale": 0.001, - } - ) - algo = config.build() - # ts=0 (get a deterministic action as per explore=False). - deterministic_action = algo.compute_single_action(obs, explore=False) - check(convert_to_numpy(algo.get_policy().global_timestep), 1) - # ts=1-49 (in random window). - random_a = [] - for i in range(1, 50): - random_a.append(algo.compute_single_action(obs, explore=True)) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 1) - check(random_a[-1], deterministic_action, false=True) - self.assertTrue(np.std(random_a) > 0.5) - - # ts > 50 (a=deterministic_action + scale * N[0,1]) - for i in range(50): - a = algo.compute_single_action(obs, explore=True) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 51) - check(a, deterministic_action, rtol=0.1) - - # ts >> 50 (BUT: explore=False -> expect deterministic action). 
- for i in range(50): - a = algo.compute_single_action(obs, explore=False) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 101) - check(a, deterministic_action) - algo.stop() - - def test_ddpg_loss_function(self): - """Tests DDPG loss function results across all frameworks.""" - config = DDPGConfig().training(num_steps_sampled_before_learning_starts=0) - - # Run locally. - config.seed = 42 - config.num_rollout_workers = 0 - config.twin_q = True - config.use_huber = True - config.huber_threshold = 1.0 - config.gamma = 0.99 - # Make this small (seems to introduce errors). - config.l2_reg = 1e-10 - config.replay_buffer_config = { - "type": "MultiAgentReplayBuffer", - "capacity": 50000, - } - config.num_steps_sampled_before_learning_starts = 0 - # Use very simple nets. - config.actor_hiddens = [10] - config.critic_hiddens = [10] - # Make sure, timing differences do not affect Algorithm.train(). - config.min_time_s_per_iteration = 0 - config.min_sample_timesteps_per_iteration = 100 - - map_ = { - # Normal net. - "default_policy/actor_hidden_0/kernel": "policy_model.action_0." - "_model.0.weight", - "default_policy/actor_hidden_0/bias": "policy_model.action_0." - "_model.0.bias", - "default_policy/actor_out/kernel": "policy_model.action_out." - "_model.0.weight", - "default_policy/actor_out/bias": "policy_model.action_out._model.0.bias", - "default_policy/sequential/q_hidden_0/kernel": "q_model.q_hidden_0" - "._model.0.weight", - "default_policy/sequential/q_hidden_0/bias": "q_model.q_hidden_0." - "_model.0.bias", - "default_policy/sequential/q_out/kernel": "q_model.q_out._model." - "0.weight", - "default_policy/sequential/q_out/bias": "q_model.q_out._model.0.bias", - # -- twin. - "default_policy/sequential_1/twin_q_hidden_0/kernel": "twin_" - "q_model.twin_q_hidden_0._model.0.weight", - "default_policy/sequential_1/twin_q_hidden_0/bias": "twin_" - "q_model.twin_q_hidden_0._model.0.bias", - "default_policy/sequential_1/twin_q_out/kernel": "twin_" - "q_model.twin_q_out._model.0.weight", - "default_policy/sequential_1/twin_q_out/bias": "twin_" - "q_model.twin_q_out._model.0.bias", - # Target net. - "default_policy/actor_hidden_0_1/kernel": "policy_model.action_0." - "_model.0.weight", - "default_policy/actor_hidden_0_1/bias": "policy_model.action_0." - "_model.0.bias", - "default_policy/actor_out_1/kernel": "policy_model.action_out." - "_model.0.weight", - "default_policy/actor_out_1/bias": "policy_model.action_out._model" - ".0.bias", - "default_policy/sequential_2/q_hidden_0/kernel": "q_model." - "q_hidden_0._model.0.weight", - "default_policy/sequential_2/q_hidden_0/bias": "q_model." - "q_hidden_0._model.0.bias", - "default_policy/sequential_2/q_out/kernel": "q_model." - "q_out._model.0.weight", - "default_policy/sequential_2/q_out/bias": "q_model.q_out._model.0.bias", - # -- twin. - "default_policy/sequential_3/twin_q_hidden_0/kernel": "twin_" - "q_model.twin_q_hidden_0._model.0.weight", - "default_policy/sequential_3/twin_q_hidden_0/bias": "twin_" - "q_model.twin_q_hidden_0._model.0.bias", - "default_policy/sequential_3/twin_q_out/kernel": "twin_" - "q_model.twin_q_out._model.0.weight", - "default_policy/sequential_3/twin_q_out/bias": "twin_" - "q_model.twin_q_out._model.0.bias", - } - - env = SimpleEnv - batch_size = 100 - obs_size = (batch_size, 1) - actions = np.random.random(size=(batch_size, 1)) - - # Batch of size=n. - input_ = self._get_batch_helper(obs_size, actions, batch_size) - - # Simply compare loss values AND grads of all frameworks with each - # other. 
- prev_fw_loss = weights_dict = None - expect_c, expect_a, expect_t = None, None, None - # History of tf-updated NN-weights over n training steps. - tf_updated_weights = [] - # History of input batches used. - tf_inputs = [] - for fw, sess in framework_iterator( - config, frameworks=("tf", "torch"), session=True - ): - # Generate Algorithm and get its default Policy object. - algo = config.build(env=env) - policy = algo.get_policy() - p_sess = None - if sess: - p_sess = policy.get_session() - - # Set all weights (of all nets) to fixed values. - if weights_dict is None: - assert fw == "tf" # Start with the tf vars-dict. - weights_dict_list = ( - policy.model.variables() + policy.target_model.variables() - ) - with p_sess.graph.as_default(): - collector = ray.experimental.tf_utils.TensorFlowVariables( - [], p_sess, weights_dict_list - ) - weights_dict = collector.get_weights() - else: - assert fw == "torch" # Then transfer that to torch Model. - model_dict = self._translate_weights_to_torch(weights_dict, map_) - policy.model.load_state_dict(model_dict) - policy.target_model.load_state_dict(model_dict) - - if fw == "torch": - # Actually convert to torch tensors. - input_ = policy._lazy_tensor_dict(input_) - input_ = {k: input_[k] for k in input_.keys()} - - # Only run the expectation once, should be the same anyways - # for all frameworks. - if expect_c is None: - expect_c, expect_a, expect_t = self._ddpg_loss_helper( - input_, - weights_dict, - sorted(weights_dict.keys()), - fw, - gamma=config.gamma, - huber_threshold=config.huber_threshold, - l2_reg=config.l2_reg, - sess=sess, - ) - - # Get actual outs and compare to expectation AND previous - # framework. c=critic, a=actor, e=entropy, t=td-error. - if fw == "tf": - c, a, t, tf_c_grads, tf_a_grads = p_sess.run( - [ - policy.critic_loss, - policy.actor_loss, - policy.td_error, - policy._critic_optimizer.compute_gradients( - policy.critic_loss, policy.model.q_variables() - ), - policy._actor_optimizer.compute_gradients( - policy.actor_loss, policy.model.policy_variables() - ), - ], - feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False), - ) - # Check pure loss values. - check(c, expect_c) - check(a, expect_a) - check(t, expect_t) - - tf_c_grads = [g for g, v in tf_c_grads] - tf_a_grads = [g for g, v in tf_a_grads] - - elif fw == "torch": - policy.loss(policy.model, None, input_) - c, a, t = ( - policy.get_tower_stats("critic_loss")[0], - policy.get_tower_stats("actor_loss")[0], - policy.get_tower_stats("td_error")[0], - ) - # Check pure loss values. - check(c, expect_c) - check(a, expect_a) - check(t, expect_t) - - # Test actor gradients. - policy._actor_optimizer.zero_grad() - assert all(v.grad is None for v in policy.model.q_variables()) - assert all(v.grad is None for v in policy.model.policy_variables()) - a.backward() - # `actor_loss` depends on Q-net vars - # (but not twin-Q-net vars!). - assert not any(v.grad is None for v in policy.model.q_variables()[:4]) - assert all(v.grad is None for v in policy.model.q_variables()[4:]) - assert not all( - torch.mean(v.grad) == 0 for v in policy.model.policy_variables() - ) - assert not all( - torch.min(v.grad) == 0 for v in policy.model.policy_variables() - ) - # Compare with tf ones. - torch_a_grads = [v.grad for v in policy.model.policy_variables()] - for tf_g, torch_g in zip(tf_a_grads, torch_a_grads): - if tf_g.shape != torch_g.shape: - check(tf_g, np.transpose(torch_g.cpu())) - else: - check(tf_g, torch_g) - - # Test critic gradients. 
- policy._critic_optimizer.zero_grad() - assert all( - v.grad is None or torch.mean(v.grad) == 0.0 - for v in policy.model.q_variables() - ) - assert all( - v.grad is None or torch.min(v.grad) == 0.0 - for v in policy.model.q_variables() - ) - c.backward() - assert not all( - torch.mean(v.grad) == 0 for v in policy.model.q_variables() - ) - assert not all( - torch.min(v.grad) == 0 for v in policy.model.q_variables() - ) - # Compare with tf ones. - torch_c_grads = [v.grad for v in policy.model.q_variables()] - for tf_g, torch_g in zip(tf_c_grads, torch_c_grads): - if tf_g.shape != torch_g.shape: - check(tf_g, np.transpose(torch_g.cpu())) - else: - check(tf_g, torch_g) - # Compare (unchanged(!) actor grads) with tf ones. - torch_a_grads = [v.grad for v in policy.model.policy_variables()] - for tf_g, torch_g in zip(tf_a_grads, torch_a_grads): - if tf_g.shape != torch_g.shape: - check(tf_g, np.transpose(torch_g.cpu())) - else: - check(tf_g, torch_g) - - # Store this framework's losses in prev_fw_loss to compare with - # next framework's outputs. - if prev_fw_loss is not None: - check(c, prev_fw_loss[0]) - check(a, prev_fw_loss[1]) - check(t, prev_fw_loss[2]) - - prev_fw_loss = (c, a, t) - - # Update weights from our batch (n times). - for update_iteration in range(6): - print("train iteration {}".format(update_iteration)) - if fw == "tf": - in_ = self._get_batch_helper(obs_size, actions, batch_size) - tf_inputs.append(in_) - # Set a fake-batch to use - # (instead of sampling from replay buffer). - buf = algo.local_replay_buffer - patch_buffer_with_fake_sampling_method(buf, in_) - algo.train() - updated_weights = policy.get_weights() - # Net must have changed. - if tf_updated_weights: - check( - updated_weights["default_policy/actor_hidden_0/kernel"], - tf_updated_weights[-1][ - "default_policy/actor_hidden_0/kernel" - ], - false=True, - ) - tf_updated_weights.append(updated_weights) - - # Compare with updated tf-weights. Must all be the same. - else: - tf_weights = tf_updated_weights[update_iteration] - in_ = tf_inputs[update_iteration] - # Set a fake-batch to use - # (instead of sampling from replay buffer). - buf = algo.local_replay_buffer - patch_buffer_with_fake_sampling_method(buf, in_) - algo.train() - # Compare updated model and target weights. - for tf_key in tf_weights.keys(): - tf_var = tf_weights[tf_key] - # Model. - if re.search( - "actor_out_1|actor_hidden_0_1|sequential_[23]", tf_key - ): - torch_var = policy.target_model.state_dict()[map_[tf_key]] - # Target model. 
- else: - torch_var = policy.model.state_dict()[map_[tf_key]] - if tf_var.shape != torch_var.shape: - check(tf_var, np.transpose(torch_var.cpu()), atol=0.1) - else: - check(tf_var, torch_var, atol=0.1) - - algo.stop() - - def _get_batch_helper(self, obs_size, actions, batch_size): - return SampleBatch( - { - SampleBatch.CUR_OBS: np.random.random(size=obs_size), - SampleBatch.ACTIONS: actions, - SampleBatch.REWARDS: np.random.random(size=(batch_size,)), - SampleBatch.TERMINATEDS: np.random.choice( - [True, False], size=(batch_size,) - ), - SampleBatch.NEXT_OBS: np.random.random(size=obs_size), - "weights": np.ones(shape=(batch_size,)), - } - ) - - def _ddpg_loss_helper( - self, train_batch, weights, ks, fw, gamma, huber_threshold, l2_reg, sess - ): - """Emulates DDPG loss functions for tf and torch.""" - model_out_t = train_batch[SampleBatch.CUR_OBS] - target_model_out_tp1 = train_batch[SampleBatch.NEXT_OBS] - # get_policy_output - policy_t = sigmoid( - 2.0 - * fc( - relu(fc(model_out_t, weights[ks[1]], weights[ks[0]], framework=fw)), - weights[ks[5]], - weights[ks[4]], - framework=fw, - ) - ) - # Get policy output for t+1 (target model). - policy_tp1 = sigmoid( - 2.0 - * fc( - relu( - fc( - target_model_out_tp1, - weights[ks[3]], - weights[ks[2]], - framework=fw, - ) - ), - weights[ks[7]], - weights[ks[6]], - framework=fw, - ) - ) - # Assume no smooth target policy. - policy_tp1_smoothed = policy_tp1 - - # Q-values for the actually selected actions. - # get_q_values - q_t = fc( - relu( - fc( - np.concatenate([model_out_t, train_batch[SampleBatch.ACTIONS]], -1), - weights[ks[9]], - weights[ks[8]], - framework=fw, - ) - ), - weights[ks[11]], - weights[ks[10]], - framework=fw, - ) - twin_q_t = fc( - relu( - fc( - np.concatenate([model_out_t, train_batch[SampleBatch.ACTIONS]], -1), - weights[ks[13]], - weights[ks[12]], - framework=fw, - ) - ), - weights[ks[15]], - weights[ks[14]], - framework=fw, - ) - - # Q-values for current policy in given current state. - # get_q_values - q_t_det_policy = fc( - relu( - fc( - np.concatenate([model_out_t, policy_t], -1), - weights[ks[9]], - weights[ks[8]], - framework=fw, - ) - ), - weights[ks[11]], - weights[ks[10]], - framework=fw, - ) - - # Target q network evaluation. - # target_model.get_q_values - q_tp1 = fc( - relu( - fc( - np.concatenate([target_model_out_tp1, policy_tp1_smoothed], -1), - weights[ks[17]], - weights[ks[16]], - framework=fw, - ) - ), - weights[ks[19]], - weights[ks[18]], - framework=fw, - ) - twin_q_tp1 = fc( - relu( - fc( - np.concatenate([target_model_out_tp1, policy_tp1_smoothed], -1), - weights[ks[21]], - weights[ks[20]], - framework=fw, - ) - ), - weights[ks[23]], - weights[ks[22]], - framework=fw, - ) - - q_t_selected = np.squeeze(q_t, axis=-1) - twin_q_t_selected = np.squeeze(twin_q_t, axis=-1) - q_tp1 = np.minimum(q_tp1, twin_q_tp1) - q_tp1_best = np.squeeze(q_tp1, axis=-1) - - dones = train_batch[SampleBatch.TERMINATEDS] - rewards = train_batch[SampleBatch.REWARDS] - if fw == "torch": - dones = dones.float().numpy() - rewards = rewards.numpy() - - q_tp1_best_masked = (1.0 - dones) * q_tp1_best - q_t_selected_target = rewards + gamma * q_tp1_best_masked - - td_error = q_t_selected - q_t_selected_target - twin_td_error = twin_q_t_selected - q_t_selected_target - errors = huber_loss(td_error, huber_threshold) + huber_loss( - twin_td_error, huber_threshold - ) - - critic_loss = np.mean(errors) - actor_loss = -np.mean(q_t_det_policy) - # Add l2-regularization if required. 
- for name, var in weights.items(): - if re.match("default_policy/actor_(hidden_0|out)/kernel", name): - actor_loss += l2_reg * l2_loss(var) - elif re.match("default_policy/sequential(_1)?/\\w+/kernel", name): - critic_loss += l2_reg * l2_loss(var) - - return critic_loss, actor_loss, td_error - - def _translate_weights_to_torch(self, weights_dict, map_): - model_dict = { - map_[k]: convert_to_torch_tensor( - np.transpose(v) if re.search("kernel", k) else v - ) - for k, v in weights_dict.items() - if re.search("default_policy/(actor_(hidden_0|out)|sequential(_1)?)/", k) - } - model_dict[ - "policy_model.action_out_squashed.low_action" - ] = convert_to_torch_tensor(np.array([0.0])) - model_dict[ - "policy_model.action_out_squashed.action_range" - ] = convert_to_torch_tensor(np.array([1.0])) - return model_dict - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/ddpg/tuned_examples/__init__.py b/rllib_contrib/ddpg/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/ddpg/tuned_examples/halfcheetah-ddpg.yaml b/rllib_contrib/ddpg/tuned_examples/halfcheetah-ddpg.yaml deleted file mode 100644 index 51eca6a778131..0000000000000 --- a/rllib_contrib/ddpg/tuned_examples/halfcheetah-ddpg.yaml +++ /dev/null @@ -1,62 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# This configuration can expect to reach 2000 reward in 150k-200k timesteps -halfcheetah-ddpg: - env: HalfCheetah-v2 - run: DDPG - stop: - sampler_results/episode_reward_mean: 2000 - time_total_s: 5400 # 90 minutes - config: - # Works for both torch and tf. - framework: torch - # === Model === - actor_hiddens: [64, 64] - critic_hiddens: [64, 64] - n_step: 1 - model: {} - gamma: 0.99 - env_config: {} - - # === Exploration === - exploration_config: - initial_scale: 1.0 - final_scale: 0.02 - scale_timesteps: 10000 - ou_base_scale: 0.1 - ou_theta: 0.15 - ou_sigma: 0.2 - - min_sample_timesteps_per_iteration: 1000 - target_network_update_freq: 0 - tau: 0.001 - - # === Replay buffer === - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - capacity: 10000 - prioritized_replay_alpha: 0.6 - prioritized_replay_beta: 0.4 - prioritized_replay_eps: 0.000001 - worker_side_prioritization: false - - num_steps_sampled_before_learning_starts: 500 - clip_rewards: False - - # === Optimization === - actor_lr: 0.001 - critic_lr: 0.001 - use_huber: false - huber_threshold: 1.0 - l2_reg: 0.000001 - rollout_fragment_length: 1 - train_batch_size: 64 - - # === Parallelism === - num_workers: 0 - num_gpus_per_worker: 0 - - # === Evaluation === - evaluation_interval: 5 - evaluation_duration: 10 diff --git a/rllib_contrib/ddpg/tuned_examples/halfcheetah-pybullet-ddpg.yaml b/rllib_contrib/ddpg/tuned_examples/halfcheetah-pybullet-ddpg.yaml deleted file mode 100644 index af2c10a71eda2..0000000000000 --- a/rllib_contrib/ddpg/tuned_examples/halfcheetah-pybullet-ddpg.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# Note: HalfCheetahBulletEnv-v0 is not the same as MuJoCo's HalfCheetah-v0. 
-ddpg-halfcheetahbulletenv-v0: - env: HalfCheetahBulletEnv-v0 - run: DDPG - stop: - sampler_results/episode_reward_mean: -300.0 - timesteps_total: 200000 - config: - actor_hiddens: [256, 256] - critic_hiddens: [256, 256] - n_step: 3 - model: {} - gamma: 0.99 - env_config: {} - exploration_config: - initial_scale: 1.0 - final_scale: 0.02 - scale_timesteps: 10000 - ou_base_scale: 0.1 - ou_theta: 0.15 - ou_sigma: 0.2 - min_sample_timesteps_per_iteration: 1000 - target_network_update_freq: 0 - tau: 0.001 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - capacity: 15000 - prioritized_replay_alpha: 0.6 - prioritized_replay_beta: 0.4 - prioritized_replay_eps: 0.000001 - worker_side_prioritization: false - num_steps_sampled_before_learning_starts: 500 - clip_rewards: false - actor_lr: 0.001 - critic_lr: 0.001 - use_huber: true - huber_threshold: 1.0 - l2_reg: 0.000001 - rollout_fragment_length: 1 - train_batch_size: 48 - num_workers: 0 - num_gpus: 1 - num_gpus_per_worker: 0 diff --git a/rllib_contrib/ddpg/tuned_examples/hopper-pybullet-ddpg.yaml b/rllib_contrib/ddpg/tuned_examples/hopper-pybullet-ddpg.yaml deleted file mode 100644 index e28b9cad8e2b0..0000000000000 --- a/rllib_contrib/ddpg/tuned_examples/hopper-pybullet-ddpg.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# Note: HopperBulletEnv-v0 is not the same as MuJoCo's Hopper-v0. -ddpg-hopperbulletenv-v0: - env: HopperBulletEnv-v0 - run: DDPG - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 120.0 - timesteps_total: 50000 - stop: - time_total_s: 2000 - config: - actor_hiddens: [256, 256] - critic_hiddens: [256, 256] - n_step: 3 - model: {} - gamma: 0.99 - env_config: {} - exploration_config: - initial_scale: 1.0 - final_scale: 0.02 - scale_timesteps: 10000 - ou_base_scale: 0.1 - ou_theta: 0.15 - ou_sigma: 0.2 - min_sample_timesteps_per_iteration: 1000 - target_network_update_freq: 0 - tau: 0.001 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - capacity: 10000 - prioritized_replay_alpha: 0.6 - prioritized_replay_beta: 0.4 - prioritized_replay_eps: 0.000001 - worker_side_prioritization: false - num_steps_sampled_before_learning_starts: 500 - clip_rewards: False - actor_lr: 0.001 - critic_lr: 0.001 - use_huber: False - huber_threshold: 1.0 - l2_reg: 0.000001 - rollout_fragment_length: 1 - train_batch_size: 48 - num_workers: 0 - num_gpus_per_worker: 0 \ No newline at end of file diff --git a/rllib_contrib/ddpg/tuned_examples/memory-leak-test-ddpg.yaml b/rllib_contrib/ddpg/tuned_examples/memory-leak-test-ddpg.yaml deleted file mode 100644 index 1b70de3771c7f..0000000000000 --- a/rllib_contrib/ddpg/tuned_examples/memory-leak-test-ddpg.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -memory-leak-test-ddpg: - env: - ray.rllib.examples.env.random_env.RandomLargeObsSpaceEnvContActions - run: DDPG - config: - # Works for both torch and tf. - framework: torch - # Switch off np.random, which is known to have memory leaks. 
- env_config: - config: - static_samples: true - replay_buffer_config: - capacity: 500 # use small buffer to catch memory leaks diff --git a/rllib_contrib/ddpg/tuned_examples/mountaincarcontinuous-ddpg.yaml b/rllib_contrib/ddpg/tuned_examples/mountaincarcontinuous-ddpg.yaml deleted file mode 100644 index 9364fe7118151..0000000000000 --- a/rllib_contrib/ddpg/tuned_examples/mountaincarcontinuous-ddpg.yaml +++ /dev/null @@ -1,62 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# This configuration can expect to reach 90 reward in 10k-20k timesteps -mountaincarcontinuous-ddpg: - env: MountainCarContinuous-v0 - run: DDPG - stop: - sampler_results/episode_reward_mean: 90 - time_total_s: 600 # 10 minutes - config: - # Works for both torch and tf. - framework: torch - # === Model === - actor_hiddens: [32, 64] - critic_hiddens: [64, 64] - n_step: 3 - model: {} - gamma: 0.99 - env_config: {} - - # === Exploration === - exploration_config: - initial_scale: 1.0 - final_scale: 0.02 - scale_timesteps: 40000 - ou_base_scale: 0.75 - ou_theta: 0.15 - ou_sigma: 0.2 - - min_sample_timesteps_per_iteration: 1000 - - target_network_update_freq: 0 - tau: 0.01 - - # === Replay buffer === - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - capacity: 50000 - prioritized_replay_alpha: 0.6 - prioritized_replay_beta: 0.4 - prioritized_replay_eps: 0.000001 - worker_side_prioritization: false - num_steps_sampled_before_learning_starts: 1000 - clip_rewards: False - - # === Optimization === - actor_lr: 0.001 - critic_lr: 0.001 - use_huber: False - huber_threshold: 1.0 - l2_reg: 0.00001 - rollout_fragment_length: 1 - train_batch_size: 64 - - # === Parallelism === - num_workers: 0 - num_gpus_per_worker: 0 - - # === Evaluation === - evaluation_interval: 5 - evaluation_duration: 10 diff --git a/rllib_contrib/ddpg/tuned_examples/pendulum-ddpg-fake-gpus.yaml b/rllib_contrib/ddpg/tuned_examples/pendulum-ddpg-fake-gpus.yaml deleted file mode 100644 index 9b4ffa119d488..0000000000000 --- a/rllib_contrib/ddpg/tuned_examples/pendulum-ddpg-fake-gpus.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -pendulum-ddpg-fake-gpus: - env: Pendulum-v1 - run: DDPG - stop: - sampler_results/episode_reward_mean: -1000 - timesteps_total: 40000 - config: - # Works for both torch and tf. - seed: 42 - framework: torch - actor_hiddens: [64, 64] - critic_hiddens: [64, 64] - n_step: 1 - model: {} - gamma: 0.99 - exploration_config: - final_scale: 0.02 - min_sample_timesteps_per_iteration: 600 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - capacity: 10000 - worker_side_prioritization: false - num_steps_sampled_before_learning_starts: 500 - clip_rewards: false - use_huber: true - train_batch_size: 64 - num_workers: 0 - actor_lr: 0.0001 - critic_lr: 0.0001 - - # Fake 2 GPUs. - num_gpus: 2 - _fake_gpus: true diff --git a/rllib_contrib/ddpg/tuned_examples/pendulum-ddpg.yaml b/rllib_contrib/ddpg/tuned_examples/pendulum-ddpg.yaml deleted file mode 100644 index a07ea34641803..0000000000000 --- a/rllib_contrib/ddpg/tuned_examples/pendulum-ddpg.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# This configuration can expect to reach -160 reward in 10k-20k timesteps. -pendulum-ddpg: - env: Pendulum-v1 - run: DDPG - stop: - sampler_results/episode_reward_mean: -320 - timesteps_total: 30000 - config: - # Works for both torch and tf. 
- seed: 42 - framework: torch - # === Model === - actor_hiddens: [64, 64] - critic_hiddens: [64, 64] - n_step: 1 - model: {} - gamma: 0.99 - - # === Exploration === - exploration_config: - type: "OrnsteinUhlenbeckNoise" - scale_timesteps: 10000 - initial_scale: 1.0 - final_scale: 0.02 - ou_base_scale: 0.1 - ou_theta: 0.15 - ou_sigma: 0.2 - - min_sample_timesteps_per_iteration: 600 - target_network_update_freq: 0 - tau: 0.001 - - # === Replay buffer === - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - capacity: 10000 - worker_side_prioritization: false - num_steps_sampled_before_learning_starts: 500 - clip_rewards: False - - # === Optimization === - actor_lr: 0.001 - critic_lr: 0.001 - use_huber: True - huber_threshold: 1.0 - l2_reg: 0.000001 - rollout_fragment_length: 1 - train_batch_size: 64 - - # === Parallelism === - num_workers: 0 diff --git a/rllib_contrib/ddppo/BUILD b/rllib_contrib/ddppo/BUILD deleted file mode 100644 index 1cb5d65401428..0000000000000 --- a/rllib_contrib/ddppo/BUILD +++ /dev/null @@ -1,42 +0,0 @@ -# Examples - -py_test( - name = "example_ddppo_pendulum_v1", - main = "ddppo_pendulum_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/ddppo_pendulum_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_cartpole_ddppo", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], - size = "small", - srcs = ["run_regression_tests.py"], - data = glob(["tuned_examples/cartpole-ddppo.yaml"]), - args = ["--dir=ddppo/tuned_examples/"] -) - -py_test( - name = "learning_tests_pendulum_ddppo", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], - size = "enormous", - srcs = ["run_regression_tests.py"], - data = glob(["tuned_examples/pendulum-ddppo.yaml"]), - args = ["--dir=ddppo/tuned_examples/"] -) - - -# Compilation Tests - -py_test( - name = "test_ddppo", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_ddppo.py"] -) diff --git a/rllib_contrib/ddppo/README.md b/rllib_contrib/ddppo/README.md deleted file mode 100644 index 5b01f686dd8ff..0000000000000 --- a/rllib_contrib/ddppo/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# DDPPO (Decentralized Distributed Proximal Policy Optimization) - -[DDPPO](https://arxiv.org/abs/1911.00357) is a method for distributed reinforcement learning in resource-intensive simulated environments based on PPO. DD-PPO is distributed (uses multiple machines), decentralized (lacks a centralized server), and synchronous (no computation is ever stale), making it conceptually simple and easy to implement. 
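
For a concrete sense of how this maps onto the config API, here is a rough CPU-only sketch (assuming the `rllib-ddppo` package is installed as described below; the settings mirror the package's own CartPole test and defaults):

```
# Minimal DD-PPO sketch (CPU-only; assumes `rllib-ddppo` is installed).
import ray
from rllib_ddppo.ddppo import DDPPOConfig

ray.init()

config = (
    DDPPOConfig()
    .environment("CartPole-v1")
    # Sampling AND learning both happen on the rollout workers.
    .rollouts(num_rollout_workers=2, num_envs_per_worker=5)
    # Give workers GPUs only if you have them (default is 1 per worker).
    .resources(num_gpus_per_worker=0)
    .training(train_batch_size=500, sgd_minibatch_size=50, num_sgd_iter=10)
)

algo = config.build()
print(algo.train())
algo.stop()
```

Because optimization runs on the rollout workers rather than on the driver, `num_gpus` stays at 0 and any GPUs are assigned through `num_gpus_per_worker`.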
- - -## Installation - -``` -conda create -n rllib-ddppo python=3.10 -conda activate rllib-ddppo -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[DDPPO Example]() \ No newline at end of file diff --git a/rllib_contrib/ddppo/examples/ddppo_pendulum_v1.py b/rllib_contrib/ddppo/examples/ddppo_pendulum_v1.py deleted file mode 100644 index 0423e4d5fbb41..0000000000000 --- a/rllib_contrib/ddppo/examples/ddppo_pendulum_v1.py +++ /dev/null @@ -1,62 +0,0 @@ -import argparse - -from rllib_ddppo.ddppo import DDPPO, DDPPOConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - DDPPOConfig() - .rollouts( - num_rollout_workers=4, - num_envs_per_worker=10, - observation_filter="MeanStdFilter", - ) - .environment("Pendulum-v1") - .training( - train_batch_size=2500, - gamma=0.95, - sgd_minibatch_size=50, - num_sgd_iter=5, - clip_param=0.4, - vf_clip_param=10.0, - lambda_=0.1, - lr=0.00015, - ) - .resources(num_gpus_per_worker=0) - .reporting(min_sample_timesteps_per_iteration=1000, min_time_s_per_iteration=5) - ) - - stop_reward = -700 - - tuner = tune.Tuner( - DDPPO, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 1500000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/ddppo/pyproject.toml b/rllib_contrib/ddppo/pyproject.toml deleted file mode 100644 index b79c89da2adc9..0000000000000 --- a/rllib_contrib/ddppo/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-ddppo" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "torch==1.12.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "numpy<2"] diff --git a/rllib_contrib/ddppo/requirements.txt b/rllib_contrib/ddppo/requirements.txt deleted file mode 100644 index b07006a1b4ec6..0000000000000 --- a/rllib_contrib/ddppo/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/ddppo/src/rllib_ddppo/ddppo/__init__.py b/rllib_contrib/ddppo/src/rllib_ddppo/ddppo/__init__.py deleted file mode 100644 index 1cf190189472c..0000000000000 --- a/rllib_contrib/ddppo/src/rllib_ddppo/ddppo/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from rllib_ddppo.ddppo.ddppo import DDPPO, DDPPOConfig - -from ray.tune.registry import register_trainable - -__all__ = ["DDPPOConfig", "DDPPO"] - -register_trainable("rllib-contrib-ddppo", DDPPO) diff --git a/rllib_contrib/ddppo/src/rllib_ddppo/ddppo/ddppo.py b/rllib_contrib/ddppo/src/rllib_ddppo/ddppo/ddppo.py deleted file mode 100644 index a039995b49141..0000000000000 --- a/rllib_contrib/ddppo/src/rllib_ddppo/ddppo/ddppo.py 
+++ /dev/null @@ -1,369 +0,0 @@ -""" -Decentralized Distributed PPO (DD-PPO) -====================================== - -Unlike APPO or PPO, learning is no longer done centralized in the trainer -process. Instead, gradients are computed remotely on each rollout worker and -all-reduced to sync them at each mini-batch. This allows each worker's GPU -to be used both for sampling and for training. - -DD-PPO should be used if you have envs that require GPUs to function, or have -a very large model that cannot be effectively optimized with the GPUs available -on a single machine (DD-PPO allows scaling to arbitrary numbers of GPUs across -multiple nodes, unlike PPO/APPO which is limited to GPUs on a single node). - -Paper reference: https://arxiv.org/abs/1911.00357 -Note that unlike the paper, we currently do not implement straggler mitigation. -""" - -import logging -import time -from typing import Optional - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.ppo import PPO, PPOConfig -from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.evaluation.rollout_worker import RolloutWorker -from ray.rllib.utils.annotations import override -from ray.rllib.utils.metrics import ( - LEARN_ON_BATCH_TIMER, - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, - SAMPLE_TIMER, -) -from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder -from ray.rllib.utils.sgd import do_minibatch_sgd -from ray.rllib.utils.typing import ResultDict - -logger = logging.getLogger(__name__) - - -class DDPPOConfig(PPOConfig): - """Defines a configuration class from which a DDPPO Algorithm can be built. - - Note(jungong) : despite best efforts, DDPPO does not use fault tolerant and - elastic features of WorkerSet, because of the way Torch DDP is set up. - - Example: - >>> from ray.rllib.algorithms.ddppo import DDPPOConfig - >>> config = DDPPOConfig().training(lr=0.003, keep_local_weights_in_sync=True) - >>> config = config.resources(num_gpus=1) - >>> config = config.rollouts(num_rollout_workers=10) - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.ddppo import DDPPOConfig - >>> from ray import air - >>> from ray import tune - >>> config = DDPPOConfig() - >>> # Print out some default values. - >>> print(config.kl_coeff) # doctest: +SKIP - >>> # Update the config object. - >>> config.training( # doctest: +SKIP - ... lr=tune.grid_search([0.001, 0.0001]), num_sgd_iter=15) - >>> # Set the config object's env. - >>> config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "DDPPO", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a DDPPOConfig instance.""" - super().__init__(algo_class=algo_class or DDPPO) - - # fmt: off - # __sphinx_doc_begin__ - # DD-PPO specific settings: - self.keep_local_weights_in_sync = True - self.torch_distributed_backend = "gloo" - - # Override some of PPO/Algorithm's default values with DDPPO-specific values. 
- self.num_rollout_workers = 2 - # Vectorize the env (should enable by default since each worker has - # a GPU). - self.num_envs_per_worker = 5 - # During the SGD phase, workers iterate over minibatches of this size. - # The effective minibatch size will be: - # `sgd_minibatch_size * num_workers`. - self.sgd_minibatch_size = 50 - # Number of SGD epochs per optimization round. - self.num_sgd_iter = 10 - - # *** WARNING: configs below are DDPPO overrides over PPO; you - # shouldn't need to adjust them. *** - # DDPPO requires PyTorch distributed. - self.framework_str = "torch" - # Learning is no longer done on the driver process, so - # giving GPUs to the driver does not make sense! - self.num_gpus = 0 - # Each rollout worker gets a GPU. - self.num_gpus_per_worker = 1 - # Note: This is the train_batch_size per worker (updates happen on worker - # side). `rollout_fragment_length` (if "auto") is computed as: - # `train_batch_size` // `num_envs_per_worker`. - self.train_batch_size = 500 - # Kl divergence penalty should be fixed to 0 in DDPPO because in order - # for it to be used as a penalty, we would have to un-decentralize - # DDPPO - self.kl_coeff = 0.0 - self.kl_target = 0.0 - # TODO (Kourosh) RLModule and Learner API is not supported yet - self._enable_learner_api = False - self._enable_rl_module_api = False - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). - } - # __sphinx_doc_end__ - # fmt: on - - @override(PPOConfig) - def training( - self, - *, - keep_local_weights_in_sync: Optional[bool] = NotProvided, - torch_distributed_backend: Optional[str] = NotProvided, - **kwargs, - ) -> "DDPPOConfig": - """Sets the training related configuration. - - Args: - keep_local_weights_in_sync: Download weights between each training step. - This adds a bit of overhead but allows the user to access the weights - from the trainer. - torch_distributed_backend: The communication backend for PyTorch - distributed. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if keep_local_weights_in_sync is not NotProvided: - self.keep_local_weights_in_sync = keep_local_weights_in_sync - if torch_distributed_backend is not NotProvided: - self.torch_distributed_backend = torch_distributed_backend - - return self - - @override(PPOConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - # Must have `num_rollout_workers` >= 1. - if self.num_rollout_workers < 1: - raise ValueError( - "Due to its distributed, decentralized nature, " - "DD-PPO requires `num_workers` to be >= 1!" - ) - - # Only supported for PyTorch so far. - if self.framework_str != "torch": - raise ValueError("Distributed data parallel is only supported for PyTorch") - if self.torch_distributed_backend not in ("gloo", "mpi", "nccl"): - raise ValueError( - "Only gloo, mpi, or nccl is supported for " - "the backend of PyTorch distributed." - ) - # `num_gpus` must be 0/None, since all optimization happens on Workers. 
- if self.num_gpus: - raise ValueError( - "When using distributed data parallel, you should set " - "num_gpus=0 since all optimization " - "is happening on workers. Enable GPUs for workers by setting " - "num_gpus_per_worker=1." - ) - # `batch_mode` must be "truncate_episodes". - if not self.in_evaluation and self.batch_mode != "truncate_episodes": - raise ValueError( - "Distributed data parallel requires truncate_episodes batch mode." - ) - - # DDPPO doesn't support KL penalties like PPO-1. - # In order to support KL penalties, DDPPO would need to become - # undecentralized, which defeats the purpose of the algorithm. - # Users can still tune the entropy coefficient to control the - # policy entropy (similar to controlling the KL penalty). - if self.kl_coeff != 0.0 or self.kl_target != 0.0: - raise ValueError( - "Invalid zero-values for `kl_coeff` and/or `kl_target`! " - "DDPPO doesn't support KL penalties like PPO-1!" - ) - - @override(AlgorithmConfig) - def get_rollout_fragment_length(self, worker_index: int = 0) -> int: - if self.rollout_fragment_length == "auto": - # Example: - # 2 workers (ignored as learning happens on workers), - # 2 envs per worker, 100 train batch size: - # -> 100 / 2 -> 50 - # 4 workers (ignored), 3 envs per worker, 1500 train batch size: - # -> 1500 / 3 -> 500 - rollout_fragment_length = self.train_batch_size // ( - self.num_envs_per_worker - ) - return rollout_fragment_length - else: - return self.rollout_fragment_length - - -class DDPPO(PPO): - @classmethod - @override(PPO) - def get_default_config(cls) -> AlgorithmConfig: - return DDPPOConfig() - - @override(PPO) - def setup(self, config: AlgorithmConfig): - super().setup(config) - - # Initialize torch process group for - self._curr_learner_info = {} - - worker_ids = self.workers.healthy_worker_ids() - assert worker_ids, "No healthy rollout workers." - - # Find IP and Port of the first remote worker. This is our Rank 0 worker. - ip = self.workers.foreach_worker( - func=lambda w: w.get_node_ip(), - remote_worker_ids=[worker_ids[0]], - local_worker=False, - )[0] - port = self.workers.foreach_worker( - func=lambda w: w.find_free_port(), - remote_worker_ids=[worker_ids[0]], - local_worker=False, - )[0] - address = "tcp://{ip}:{port}".format(ip=ip, port=port) - logger.info("Creating torch process group with leader {}".format(address)) - - # Get setup tasks in order to throw errors on failure. - world_size = self.workers.num_remote_workers() - backend = self.config.torch_distributed_backend - - def get_setup_fn(world_rank): - return lambda w: w.setup_torch_data_parallel( - url=address, - world_rank=world_rank, - world_size=world_size, - backend=backend, - ) - - funcs = [get_setup_fn(i) for i in range(world_size)] - # Set up torch distributed on all workers. The assumption here is that - # all workers should be healthy at this point. - self.workers.foreach_worker(func=funcs, local_worker=False, healthy_only=False) - - logger.info("Torch process group init completed") - - @override(PPO) - def training_step(self) -> ResultDict: - self.workers.foreach_worker_async( - func=self._sample_and_train_torch_distributed, - healthy_only=False, - ) - sample_and_update_results = self.workers.fetch_ready_async_reqs( - timeout_seconds=0.03 - ) - - # For all results collected: - # - Update our counters and timers. - # - Update the worker's global_vars. - # - Build info dict using a LearnerInfoBuilder object. 
- learner_info_builder = LearnerInfoBuilder(num_devices=1) - sampled_workers = set() - for worker_id, result in sample_and_update_results: - sampled_workers.add(worker_id) - - self._counters[NUM_AGENT_STEPS_SAMPLED] += result["agent_steps"] - self._counters[NUM_AGENT_STEPS_TRAINED] += result["agent_steps"] - self._counters[NUM_ENV_STEPS_SAMPLED] += result["env_steps"] - self._counters[NUM_ENV_STEPS_TRAINED] += result["env_steps"] - self._timers[LEARN_ON_BATCH_TIMER].push(result["learn_on_batch_time"]) - self._timers[SAMPLE_TIMER].push(result["sample_time"]) - - # Add partial learner info to builder object. - learner_info_builder.add_learn_on_batch_results_multi_agent(result["info"]) - - # Broadcast the local set of global vars to this worker. - global_vars = {"timestep": self._counters[NUM_AGENT_STEPS_SAMPLED]} - self.workers.foreach_worker( - func=lambda w: w.set_global_vars(global_vars), - local_worker=False, - remote_worker_ids=list(sampled_workers), - timeout_seconds=0, # Don't wait for workers to finish. - ) - - # Sync down the weights from 1st remote worker (only if we have received - # some results from it). - # As with the sync up, this is not really needed unless the user is - # reading the local weights. - worker_ids = self.workers.healthy_worker_ids() - assert worker_ids, "No healthy rollout workers?" - if self.config.keep_local_weights_in_sync and worker_ids[0] in sampled_workers: - weights = self.workers.foreach_worker( - func=lambda w: w.get_weights(), - local_worker=False, - remote_worker_ids=[worker_ids[0]], - ) - self.workers.local_worker().set_weights(weights[0]) - # Return merged laarner into results. - new_learner_info = learner_info_builder.finalize() - if new_learner_info: - self._curr_learner_info = new_learner_info - return self._curr_learner_info - - @staticmethod - def _sample_and_train_torch_distributed(worker: RolloutWorker): - # This function is applied remotely on each rollout worker. - config: AlgorithmConfig = worker.config - - # Generate a sample. - start = time.perf_counter() - batch = worker.sample() - sample_time = time.perf_counter() - start - expected_batch_size = ( - config.get_rollout_fragment_length() * config.num_envs_per_worker - ) - assert batch.count == expected_batch_size, ( - "Batch size possibly out of sync between workers, expected:", - expected_batch_size, - "got:", - batch.count, - ) - - # Perform n minibatch SGD update(s) on the worker itself. 
- start = time.perf_counter() - info = do_minibatch_sgd( - batch, - worker.policy_map, - worker, - config.num_sgd_iter, - config.sgd_minibatch_size, - [Postprocessing.ADVANTAGES], - ) - learn_on_batch_time = time.perf_counter() - start - return { - "info": info, - "env_steps": batch.env_steps(), - "agent_steps": batch.agent_steps(), - "sample_time": sample_time, - "learn_on_batch_time": learn_on_batch_time, - } diff --git a/rllib_contrib/ddppo/tests/test_ddppo.py b/rllib_contrib/ddppo/tests/test_ddppo.py deleted file mode 100644 index d84d26a0f1123..0000000000000 --- a/rllib_contrib/ddppo/tests/test_ddppo.py +++ /dev/null @@ -1,81 +0,0 @@ -import unittest - -import pytest -from rllib_ddppo.ddppo import DDPPOConfig - -import ray -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY -from ray.rllib.utils.test_utils import ( - check, - check_compute_single_action, - check_train_results, - framework_iterator, -) - - -class TestDDPPO(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_ddppo_compilation(self): - """Test whether DDPPO can be built with both frameworks.""" - config = DDPPOConfig().resources(num_gpus_per_worker=0) - - num_iterations = 2 - - for _ in framework_iterator(config, frameworks="torch"): - algo = config.build(env="CartPole-v1") - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - # Make sure, weights on all workers are the same. - weights = algo.workers.foreach_worker(lambda w: w.get_weights()) - for w in weights[1:]: - check(w, weights[1]) - - check_compute_single_action(algo) - algo.stop() - - def test_ddppo_schedule(self): - """Test whether lr_schedule will anneal lr to 0""" - config = DDPPOConfig() - config.resources(num_gpus_per_worker=0) - config.training(lr_schedule=[[0, config.lr], [1000, 0.0]]) - - num_iterations = 10 - - for _ in framework_iterator(config, "torch"): - algo = config.build(env="CartPole-v1") - lr = -100.0 - for _ in range(num_iterations): - result = algo.train() - if result["info"][LEARNER_INFO]: - lr = result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][ - LEARNER_STATS_KEY - ]["cur_lr"] - algo.stop() - assert lr == 0.0, "lr should anneal to 0.0" - - def test_validate_config(self): - """Test if DDPPO will raise errors after invalid configs are passed.""" - config = DDPPOConfig().training(kl_coeff=1.0) - msg = "DDPPO doesn't support KL penalties like PPO-1" - with pytest.raises(ValueError, match=msg): - config.build(env="CartPole-v1") - config.kl_coeff = 0.0 - config.kl_target = 1.0 - with pytest.raises(ValueError, match=msg): - config.build(env="CartPole-v1") - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/ddppo/tuned_examples/__init__.py b/rllib_contrib/ddppo/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/ddppo/tuned_examples/atari-ddppo.yaml b/rllib_contrib/ddppo/tuned_examples/atari-ddppo.yaml deleted file mode 100644 index 055c02d210fc6..0000000000000 --- a/rllib_contrib/ddppo/tuned_examples/atari-ddppo.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# Basically the same as atari-ppo, but adapted for DDPPO. 
Note that DDPPO -# isn't actually any more efficient on Atari, since the network size is -# relatively small and the env doesn't require a GPU. -atari-ddppo: - env: ALE/Breakout-v5 - run: DDPPO - config: - # DDPPO only supports PyTorch so far. - framework: torch - env_config: - frameskip: 1 # no frameskip - # Worker config: 10 workers, each of which requires a GPU. - num_workers: 10 - num_gpus_per_worker: 1 - # Each worker will sample 100 * 5 envs per worker steps = 500 steps - # per optimization round. This is 5000 steps summed across workers. - train_batch_size: 500 - num_envs_per_worker: 5 - # Each worker will take a minibatch of 50. There are 10 workers total, - # so the effective minibatch size will be 500. - sgd_minibatch_size: 50 - num_sgd_iter: 10 - # Params from standard PPO Atari config: - lambda: 0.95 - kl_coeff: 0.5 - clip_rewards: True - clip_param: 0.1 - vf_clip_param: 10.0 - entropy_coeff: 0.01 - batch_mode: truncate_episodes - observation_filter: NoFilter - model: - vf_share_layers: true diff --git a/rllib_contrib/ddppo/tuned_examples/cartpole-ddppo.yaml b/rllib_contrib/ddppo/tuned_examples/cartpole-ddppo.yaml deleted file mode 100644 index fb5238b4a9503..0000000000000 --- a/rllib_contrib/ddppo/tuned_examples/cartpole-ddppo.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-ddppo: - env: CartPole-v1 - run: DDPPO - stop: - sampler_results/episode_reward_mean: 150 - timesteps_total: 100000 - config: - framework: torch - num_gpus_per_worker: 0 diff --git a/rllib_contrib/ddppo/tuned_examples/pendulum-ddppo.yaml b/rllib_contrib/ddppo/tuned_examples/pendulum-ddppo.yaml deleted file mode 100644 index 8ac14e18ecf11..0000000000000 --- a/rllib_contrib/ddppo/tuned_examples/pendulum-ddppo.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -pendulum-ddppo: - env: Pendulum-v1 - run: DDPPO - stop: - sampler_results/episode_reward_mean: -300 - timesteps_total: 1500000 - config: - framework: torch - train_batch_size: 2500 # per worker - num_gpus_per_worker: 0 - num_workers: 4 - num_envs_per_worker: 10 - observation_filter: MeanStdFilter - gamma: 0.95 - sgd_minibatch_size: 50 - num_sgd_iter: 5 - clip_param: 0.3 - vf_clip_param: 10.0 - lambda: 0.1 - lr: 0.00015 diff --git a/rllib_contrib/dt/BUILD b/rllib_contrib/dt/BUILD deleted file mode 100644 index 1ada65a7882f2..0000000000000 --- a/rllib_contrib/dt/BUILD +++ /dev/null @@ -1,75 +0,0 @@ -# Examples - -py_test( - name = "example_dt_cartpole_v1", - main = "dt_cartpole_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/dt_cartpole_v1.py"], - args = ["--run-as-test"] -) - -py_test( - name = "policy_inference_after_training_with_dt_torch", - main = "policy_inference_after_training_with_dt.py", - tags = ["team:rllib", "examples", "ray_data"], - size = "medium", - srcs = ["examples/policy_inference_after_training_with_dt.py"], -) - -# Learning Tests - -# py_test( -# name = "learning_tests_pendulum_dt", -# main = "run_regression_tests.py", -# tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], -# size = "large", -# srcs = ["run_regression_tests.py"], -# data = [ -# "tuned_examples/pendulum-v1-dt.yaml", -# # Include the offline json data file as well. 
-# "tuned_examples/pendulum_expert_sac_50eps.zip", -# ], -# args = ["--dir=dt/tuned_examples/"] -# ) - -py_test( - name = "learning_tests_cartpole_dt", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], - size = "medium", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/cartpole-v1-dt.yaml"], - args = ["--dir=dt/tuned_examples/"] -) - - -# Compilation Tests - -py_test( - name = "test_dt", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_dt.py"] -) - -py_test( - name = "test_dt_model", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_dt_model.py"] -) - -py_test( - name = "test_dt_policy", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_dt_policy.py"] -) - -py_test( - name = "test_segmentation_buffer", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_segmentation_buffer.py"] -) diff --git a/rllib_contrib/dt/README.md b/rllib_contrib/dt/README.md deleted file mode 100644 index 4e2c4a549ec7f..0000000000000 --- a/rllib_contrib/dt/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Decision Transformer - -[Decision Transformer](https://arxiv.org/abs/2106.01345) is an offline-rl algorithm that trains a transformer to generate -optimal actions based on desired returns, past states, and actions. - - -## Installation - -``` -conda create -n rllib-dt python=3.10 -conda activate rllib-dt -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[DT Example]() \ No newline at end of file diff --git a/rllib_contrib/dt/examples/dt_cartpole_v1.py b/rllib_contrib/dt/examples/dt_cartpole_v1.py deleted file mode 100644 index a82b40106a326..0000000000000 --- a/rllib_contrib/dt/examples/dt_cartpole_v1.py +++ /dev/null @@ -1,84 +0,0 @@ -import argparse - -from rllib_dt.dt import DT, DTConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - config = ( - DTConfig() - .environment(env="CartPole-v1", clip_actions=True) - .framework("torch") - .offline_data( - input_="dataset", - input_config={ - "format": "json", - "paths": ["s3://anonymous@air-example-data/rllib/cartpole/large.json"], - }, - actions_in_input_normalized=True, - ) - .training( - train_batch_size=512, - lr=0.01, - optimizer={ - "weight_decay": 0.1, - "betas": [0.9, 0.999], - }, - replay_buffer_config={"capacity": 20}, - # model - model={"max_seq_len": 3}, - num_layers=1, - num_heads=1, - embed_dim=64, - horizon=500, - ) - .evaluation( - evaluation_interval=1, - evaluation_num_workers=1, - evaluation_duration=10, - target_return=200, - evaluation_duration_unit="episodes", - evaluation_parallel_to_training=True, - evaluation_config=DTConfig.overrides(input_="sampler", explore=False), - ) - # Episode horizon: Must match environment's time limit, if any. 
- .rollouts(num_rollout_workers=3) - .reporting(min_train_timesteps_per_iteration=5000) - ) - - stop_reward = 200 - - tuner = tune.Tuner( - DT, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "evaluation/sampler_results/episode_reward_mean": stop_reward, - "training_iteration": 100, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved( - results, - stop_reward, - metric="evaluation/sampler_results/episode_reward_mean", - ) diff --git a/rllib_contrib/dt/examples/policy_inference_after_training_with_dt.py b/rllib_contrib/dt/examples/policy_inference_after_training_with_dt.py deleted file mode 100644 index 9de4bc3e0641a..0000000000000 --- a/rllib_contrib/dt/examples/policy_inference_after_training_with_dt.py +++ /dev/null @@ -1,175 +0,0 @@ -""" -Example showing how you can use your trained Decision Transformer (DT) policy for -inference (computing actions) in an environment. -""" -import argparse -import os -from pathlib import Path - -import gymnasium as gym -from rllib_dt.dt import DTConfig - -import ray -from ray import air, tune -from ray.rllib.algorithms.algorithm import Algorithm -from ray.tune.utils.log import Verbosity - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--num-cpus", type=int, default=0) - parser.add_argument( - "--input-files", - nargs="+", - default=["s3://anonymous@air-example-data/rllib/cartpole/large.json"], - help="List of paths to offline json files/zips for training.", - ) - parser.add_argument( - "--num-episodes-during-inference", - type=int, - default=10, - help="Number of episodes to do inference over after training.", - ) - - args = parser.parse_args() - - ray.init(num_cpus=args.num_cpus or None) - - # Bazel makes it hard to find files specified in `args` (and `data`). - # Look for them here. - input_files = [] - for input_file in args.input_files: - if not os.path.exists(input_file) and not input_file.startswith("s3"): - # This script runs in the ray/rllib/examples/inference_and_serving dir. - rllib_dir = Path(__file__).parent.parent.parent - input_dir = rllib_dir.absolute().joinpath(input_file) - input_files.append(str(input_dir)) - else: - input_files.append(input_file) - - # Get max_ep_len - env = gym.make("CartPole-v1") - max_ep_len = env.spec.max_episode_steps - env.close() - - # Training config - config = ( - DTConfig() - .environment( - env="CartPole-v1", - clip_actions=False, - normalize_actions=False, - ) - .framework("torch") - .offline_data( - input_="dataset", - input_config={ - "format": "json", - "paths": input_files, - }, - actions_in_input_normalized=True, - ) - .training( - horizon=max_ep_len, # This needs to be specified for DT to work. - lr=0.01, - optimizer={ - "weight_decay": 0.1, - "betas": [0.9, 0.999], - }, - train_batch_size=512, - replay_buffer_config={ - "capacity": 20, - }, - model={ - "max_seq_len": 3, - }, - num_layers=1, - num_heads=1, - embed_dim=64, - ) - # Need to do evaluation rollouts for stopping condition. - .evaluation( - target_return=200.0, - evaluation_interval=1, - evaluation_num_workers=1, - evaluation_duration=10, - evaluation_duration_unit="episodes", - evaluation_parallel_to_training=False, - evaluation_config=DTConfig.overrides(input_="sampler", explore=False), - ) - .rollouts( - num_rollout_workers=0, - ) - .reporting( - min_train_timesteps_per_iteration=5000, - ) - .resources( - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")), - ) - ) - config = config.to_dict() - - # Configure when to stop training - # Note that for an offline RL algorithm, we don't do training rollouts, - # instead we have to rely on evaluation rollouts. - stop = { - "evaluation/episode_reward_mean": 200.0, - "training_iteration": 100, - } - - print("Training policy until desired reward/iterations. ...") - tuner = tune.Tuner( - "DT", - param_space=config, - run_config=air.RunConfig( - stop=stop, - verbose=Verbosity.V3_TRIAL_DETAILS, - checkpoint_config=air.CheckpointConfig( - checkpoint_frequency=1, - checkpoint_at_end=True, - ), - ), - ) - results = tuner.fit() - - print("Training completed. Restoring new Algorithm for action inference.") - # Get the last checkpoint from the above training run. - checkpoint = results.get_best_result().checkpoint - # Create new Algorithm and restore its state from the last checkpoint. - algo = Algorithm.from_checkpoint(checkpoint) - - # Create the env to do inference in. - env = gym.make("CartPole-v1") - - obs, info = env.reset() - input_dict = algo.get_initial_input_dict(obs) - - num_episodes = 0 - total_rewards = 0.0 - - while num_episodes < args.num_episodes_during_inference: - # Compute an action (`a`). - a, _, extra = algo.compute_single_action(input_dict=input_dict) - # Send the computed action `a` to the env. - obs, reward, terminated, truncated, _ = env.step(a) - # Add to total rewards. - total_rewards += reward - # Is the episode `done`? -> Reset. - if terminated or truncated: - print(f"Episode {num_episodes+1} - return: {total_rewards}") - obs, info = env.reset() - input_dict = algo.get_initial_input_dict(obs) - num_episodes += 1 - total_rewards = 0.0 - # Episode is still ongoing -> Continue. - else: - input_dict = algo.get_next_input_dict( - input_dict, - a, - reward, - obs, - extra, - ) - - env.close() - ray.shutdown() diff --git a/rllib_contrib/dt/pyproject.toml b/rllib_contrib/dt/pyproject.toml deleted file mode 100644 index be8ca6555564f..0000000000000 --- a/rllib_contrib/dt/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-dt" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "torch==1.12.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "numpy<2"] diff --git a/rllib_contrib/dt/requirements.txt b/rllib_contrib/dt/requirements.txt deleted file mode 100644 index b07006a1b4ec6..0000000000000 --- a/rllib_contrib/dt/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/dt/src/rllib_dt/dt/__init__.py b/rllib_contrib/dt/src/rllib_dt/dt/__init__.py deleted file mode 100644 index 6fec0f062596a..0000000000000 --- a/rllib_contrib/dt/src/rllib_dt/dt/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from rllib_dt.dt.dt import DT, DTConfig -from rllib_dt.dt.dt_torch_model import DTTorchModel -from rllib_dt.dt.dt_torch_policy import DTTorchPolicy - -from ray.tune.registry import register_trainable - -__all__ = ["DT", "DTConfig", "DTTorchModel", "DTTorchPolicy"] - -register_trainable("rllib-contrib-dt", DT) diff --git a/rllib_contrib/dt/src/rllib_dt/dt/dt.py b/rllib_contrib/dt/src/rllib_dt/dt/dt.py deleted file mode 100644 index 
172b645fc00b9..0000000000000 --- a/rllib_contrib/dt/src/rllib_dt/dt/dt.py +++ /dev/null @@ -1,440 +0,0 @@ -import logging -import math -from typing import Any, Dict, List, Optional, Tuple, Type, Union - -from rllib_dt.dt.segmentation_buffer import MultiAgentSegmentationBuffer - -from ray.rllib import SampleBatch -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.execution import synchronous_parallel_sample -from ray.rllib.execution.train_ops import multi_gpu_train_one_step, train_one_step -from ray.rllib.policy import Policy -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils import deep_update -from ray.rllib.utils.annotations import PublicAPI, override -from ray.rllib.utils.metrics import ( - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - SAMPLE_TIMER, -) -from ray.rllib.utils.typing import PolicyID, ResultDict, TensorStructType, TensorType - -logger = logging.getLogger(__name__) - - -class DTConfig(AlgorithmConfig): - def __init__(self, algo_class=None): - super().__init__(algo_class=algo_class or DT) - - # fmt: off - # __sphinx_doc_begin__ - # DT-specific settings. - # Required settings during training and evaluation: - # Initial return to go used as target during rollout. - self.target_return = None - # Rollout horizon/maximum episode length. - self.horizon = None - - # Model settings: - self.model = { - # Transformer (GPT) context length. - "max_seq_len": 5, - } - - # Transformer (GPT) settings: - self.embed_dim = 128 - self.num_layers = 2 - self.num_heads = 1 - self.embed_pdrop = 0.1 - self.resid_pdrop = 0.1 - self.attn_pdrop = 0.1 - - # Optimization settings: - self.lr = 1e-4 - self.lr_schedule = None - self.optimizer = { - # Weight decay for Adam optimizer. - "weight_decay": 1e-4, - # Betas for Adam optimizer. - "betas": (0.9, 0.95), - } - self.grad_clip = None - # Coefficients on the loss for each of the heads. - # By default, only use the actions outputs for training. - self.loss_coef_actions = 1 - self.loss_coef_obs = 0 - self.loss_coef_returns_to_go = 0 - - self.replay_buffer_config = { - # How many trajectories/episodes does the segmentation buffer hold. - # Increase for more data shuffling but increased memory usage. - "capacity": 20, - # Do not change the type of replay buffer. - "type": MultiAgentSegmentationBuffer, - } - # __sphinx_doc_end__ - # fmt: on - - # Overwriting the trainer config default - # Number of training_step calls between evaluation rollouts. - self.min_train_timesteps_per_iteration = 5000 - - # Don't change - self.offline_sampling = True - self.postprocess_inputs = True - self.discount = None - - def training( - self, - *, - replay_buffer_config: Optional[Dict[str, Any]] = NotProvided, - embed_dim: Optional[int] = NotProvided, - num_layers: Optional[int] = NotProvided, - num_heads: Optional[int] = NotProvided, - embed_pdrop: Optional[float] = NotProvided, - resid_pdrop: Optional[float] = NotProvided, - attn_pdrop: Optional[float] = NotProvided, - grad_clip: Optional[float] = NotProvided, - loss_coef_actions: Optional[float] = NotProvided, - loss_coef_obs: Optional[float] = NotProvided, - loss_coef_returns_to_go: Optional[float] = NotProvided, - lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, - horizon: Optional[int] = NotProvided, - **kwargs, - ) -> "DTConfig": - """ - === DT configs - - Args: - replay_buffer_config: Replay buffer config. 
- Examples: - { - "_enable_replay_buffer_api": True, - "type": "MultiAgentReplayBuffer", - "capacity": 50000, - "replay_sequence_length": 1, - } - - OR - - { - "_enable_replay_buffer_api": True, - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 50000, - "prioritized_replay_alpha": 0.6, - "prioritized_replay_beta": 0.4, - "prioritized_replay_eps": 1e-6, - "replay_sequence_length": 1, - } - - Where - - prioritized_replay_alpha: Alpha parameter controls the degree of - prioritization in the buffer. In other words, when a buffer sample has - a higher temporal-difference error, with how much more probability - should it drawn to use to update the parametrized Q-network. 0.0 - corresponds to uniform probability. Setting much above 1.0 may quickly - result as the sampling distribution could become heavily “pointy” with - low entropy. - prioritized_replay_beta: Beta parameter controls the degree of - importance sampling which suppresses the influence of gradient updates - from samples that have higher probability of being sampled via alpha - parameter and the temporal-difference error. - prioritized_replay_eps: Epsilon parameter sets the baseline probability - for sampling so that when the temporal-difference error of a sample is - zero, there is still a chance of drawing the sample. - embed_dim: Dimension of the embeddings in the GPT model. - num_layers: Number of attention layers in the GPT model. - num_heads: Number of attention heads in the GPT model. Must divide - embed_dim evenly. - embed_pdrop: Dropout probability of the embedding layer of the GPT model. - resid_pdrop: Dropout probability of the residual layer of the GPT model. - attn_pdrop: Dropout probability of the attention layer of the GPT model. - grad_clip: If specified, clip the global norm of gradients by this amount. - lr_schedule: Learning rate schedule. In the format of - [[timestep, lr-value], [timestep, lr-value], ...] - Intermediary timesteps will be assigned to interpolated learning rate - values. A schedule should normally start from timestep 0. - loss_coef_actions: Coefficients on the loss for the actions output. - Default to 1. - loss_coef_obs: Coefficients on the loss for the obs output. Default to 0. - Set to a value greater than 0 to regress on the obs output. - loss_coef_returns_to_go: Coefficients on the loss for the returns_to_go - output. Default to 0. Set to a value greater than 0 to regress on the - returns_to_go output. - horizon: The episode horizon used. This value can be derived from your - environment via `[your_env]._max_episode_steps`. - **kwargs: Forward compatibility kwargs - - Returns: - This updated DTConfig object. - """ - super().training(**kwargs) - if replay_buffer_config is not NotProvided: - # Override entire `replay_buffer_config` if `type` key changes. - # Update, if `type` key remains the same or is not specified. 
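A hedged usage sketch of this setter follows; the values are copied from the CartPole example earlier in this diff rather than being tuned recommendations, and the `rllib_dt` package removed here is assumed to be installed. Note that, despite the generic buffer examples above, DT's `validate()` only accepts the default `MultiAgentSegmentationBuffer` type, so in practice only `capacity` is overridden.

```python
from rllib_dt.dt import DTConfig

config = DTConfig().training(
    lr=0.01,
    optimizer={"weight_decay": 0.1, "betas": [0.9, 0.999]},
    embed_dim=64,
    num_layers=1,
    num_heads=1,
    horizon=500,                      # CartPole-v1's episode time limit
    model={"max_seq_len": 3},         # GPT context length
    replay_buffer_config={"capacity": 20},
)
```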
- new_replay_buffer_config = deep_update( - {"replay_buffer_config": self.replay_buffer_config}, - {"replay_buffer_config": replay_buffer_config}, - False, - ["replay_buffer_config"], - ["replay_buffer_config"], - ) - self.replay_buffer_config = new_replay_buffer_config["replay_buffer_config"] - if embed_dim is not NotProvided: - self.embed_dim = embed_dim - if num_layers is not NotProvided: - self.num_layers = num_layers - if num_heads is not NotProvided: - self.num_heads = num_heads - if embed_pdrop is not NotProvided: - self.embed_pdrop = embed_pdrop - if resid_pdrop is not NotProvided: - self.resid_pdrop = resid_pdrop - if attn_pdrop is not NotProvided: - self.attn_pdrop = attn_pdrop - if grad_clip is not NotProvided: - self.grad_clip = grad_clip - if lr_schedule is not NotProvided: - self.lr_schedule = lr_schedule - if loss_coef_actions is not NotProvided: - self.loss_coef_actions = loss_coef_actions - if loss_coef_obs is not NotProvided: - self.loss_coef_obs = loss_coef_obs - if loss_coef_returns_to_go is not NotProvided: - self.loss_coef_returns_to_go = loss_coef_returns_to_go - if horizon is not NotProvided: - self.horizon = horizon - - return self - - def evaluation( - self, - *, - target_return: Optional[float] = NotProvided, - **kwargs, - ) -> "DTConfig": - """ - === DT configs - - Args: - target_return: The target return-to-go for inference/evaluation. - **kwargs: Forward compatibility kwargs - - Returns: - This updated DTConfig object. - """ - super().evaluation(**kwargs) - if target_return is not NotProvided: - self.target_return = target_return - - return self - - @override(AlgorithmConfig) - def rollouts(self, *args, **kwargs): - if "horizon" in kwargs: - raise ValueError( - "`horizon` setting no longer supported via " - "`config.rollouts(horizon=..)`! This is a DT-only setting now and " - "must be specified via `config.training(horizon=..)`." - ) - return super().rollouts(*args, **kwargs) - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - # target_return must be specified - assert ( - self.target_return is not None - ), "Must specify a target return (total sum of rewards)." - - # horizon must be specified and >= 2 - assert self.horizon is not None, "Must specify rollout horizon." - assert self.horizon >= 2, "rollout horizon must be at least 2." - - # replay_buffer's type must be MultiAgentSegmentationBuffer - assert ( - self.replay_buffer_config is not None - ), "Must specify replay_buffer_config." - replay_buffer_type = self.replay_buffer_config.get("type") - assert ( - replay_buffer_type == MultiAgentSegmentationBuffer - ), "replay_buffer's type must be MultiAgentSegmentationBuffer." - - # max_seq_len must be specified in model - model_max_seq_len = self.model.get("max_seq_len") - assert model_max_seq_len is not None, "Must specify model's max_seq_len." - - # User shouldn't need to specify replay_buffer's max_seq_len. - # Autofill for replay buffer API. If they did specify, make sure it - # matches with model's max_seq_len - buffer_max_seq_len = self.replay_buffer_config.get("max_seq_len") - if buffer_max_seq_len is None: - self.replay_buffer_config["max_seq_len"] = model_max_seq_len - else: - assert ( - buffer_max_seq_len == model_max_seq_len - ), "replay_buffer's max_seq_len must equal model's max_seq_len." - - # Same thing for buffer's max_ep_len, which should be autofilled from - # rollout's horizon, or check that it matches if user specified. 
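The autofill described in the comment above (and completed just below for `max_ep_len`) can be summarized with plain dictionaries; the numbers are illustrative only.

```python
replay_buffer_config = {"capacity": 20}
model = {"max_seq_len": 4}
horizon = 200

# The segmentation buffer inherits max_seq_len from the model config and
# max_ep_len from the DT horizon unless the user set them explicitly.
if replay_buffer_config.get("max_seq_len") is None:
    replay_buffer_config["max_seq_len"] = model["max_seq_len"]
if replay_buffer_config.get("max_ep_len") is None:
    replay_buffer_config["max_ep_len"] = horizon

assert replay_buffer_config == {"capacity": 20, "max_seq_len": 4, "max_ep_len": 200}
```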
- buffer_max_ep_len = self.replay_buffer_config.get("max_ep_len") - if buffer_max_ep_len is None: - self.replay_buffer_config["max_ep_len"] = self.horizon - else: - assert ( - buffer_max_ep_len == self.horizon - ), "replay_buffer's max_ep_len must equal rollout horizon." - - -class DT(Algorithm): - """Implements Decision Transformer: https://arxiv.org/abs/2106.01345.""" - - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return DTConfig() - - @classmethod - @override(Algorithm) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - from rllib_dt.dt.dt_torch_policy import DTTorchPolicy - - return DTTorchPolicy - else: - raise ValueError("Non-torch frameworks are not supported yet!") - - @override(Algorithm) - def training_step(self) -> ResultDict: - with self._timers[SAMPLE_TIMER]: - # TODO: Add ability to do obs_filter for offline sampling. - train_batch = synchronous_parallel_sample(worker_set=self.workers) - - train_batch = train_batch.as_multi_agent() - self._counters[NUM_AGENT_STEPS_SAMPLED] += train_batch.agent_steps() - self._counters[NUM_ENV_STEPS_SAMPLED] += train_batch.env_steps() - - # Because each sample is a segment of max_seq_len transitions, doing - # the division makes it so the total number of transitions per train - # step is consistent. - num_steps = train_batch.env_steps() - batch_size = int(math.ceil(num_steps / self.config.model["max_seq_len"])) - - # Add the batch of episodes to the segmentation buffer. - self.local_replay_buffer.add(train_batch) - # Sample a batch of segments. - train_batch = self.local_replay_buffer.sample(batch_size) - - # Postprocess batch before we learn on it. - post_fn = self.config.get("before_learn_on_batch") or (lambda b, *a: b) - train_batch = post_fn(train_batch, self.workers, self.config) - - # Learn on training batch. - # Use simple optimizer (only for multi-agent or tf-eager; all other - # cases should use the multi-GPU optimizer, even if only using 1 GPU) - if self.config.get("simple_optimizer", False): - train_results = train_one_step(self, train_batch) - else: - train_results = multi_gpu_train_one_step(self, train_batch) - - # Update learning rate scheduler. - global_vars = { - # Note: this counts the number of segments trained, not timesteps. - # i.e. NUM_AGENT_STEPS_TRAINED: B, NUM_AGENT_STEPS_SAMPLED: B*T - "timestep": self._counters[NUM_AGENT_STEPS_TRAINED], - } - self.workers.local_worker().set_global_vars(global_vars) - - return train_results - - @PublicAPI - @override(Algorithm) - def compute_single_action( - self, - *args, - input_dict: Optional[SampleBatch] = None, - full_fetch: bool = True, - **kwargs, - ) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]: - """Computes an action for the specified policy on the local worker. - - Note that you can also access the policy object through - self.get_policy(policy_id) and call compute_single_action() on it - directly. - - Args: - input_dict: A SampleBatch taken from get_initial_input_dict or - get_next_input_dict. - full_fetch: Whether to return extra action fetch results. - This is always True for DT. - kwargs: forward compatibility args. - - Returns: - A tuple containing: ( - the computed action, - list of RNN states (empty for DT), - extra action output (pass to get_next_input_dict), - ) - """ - assert input_dict is not None, ( - "DT must take in input_dict for inference. " - "See get_initial_input_dict() and get_next_input_dict()." 
- ) - assert ( - full_fetch - ), "DT needs full_fetch=True. Pass extra into get_next_input_dict()." - - return super().compute_single_action( - *args, input_dict=input_dict.copy(), full_fetch=full_fetch, **kwargs - ) - - @PublicAPI - def get_initial_input_dict( - self, - observation: TensorStructType, - policy_id: PolicyID = DEFAULT_POLICY_ID, - ) -> SampleBatch: - """Get the initial input_dict to be passed into compute_single_action. - - Args: - observation: first (unbatched) observation from env.reset() - policy_id: Policy to query (only applies to multi-agent). - Default: "default_policy". - - Returns: - The input_dict for inference. - """ - policy = self.get_policy(policy_id) - return policy.get_initial_input_dict(observation) - - @PublicAPI - def get_next_input_dict( - self, - input_dict: SampleBatch, - action: TensorStructType, - reward: TensorStructType, - next_obs: TensorStructType, - extra: Dict[str, TensorType], - policy_id: PolicyID = DEFAULT_POLICY_ID, - ) -> SampleBatch: - """Returns a new input_dict after stepping through the environment once. - - Args: - input_dict: the input dict passed into compute_single_action. - action: the (unbatched) action taken this step. - reward: the (unbatched) reward from env.step - next_obs: the (unbatached) next observation from env.step - extra: the extra action out from compute_single_action. - For DT this case contains current returns to go *before* the current - reward is subtracted from target_return. - policy_id: Policy to query (only applies to multi-agent). - Default: "default_policy". - - Returns: - A new input_dict to be passed into compute_single_action. - """ - policy = self.get_policy(policy_id) - return policy.get_next_input_dict(input_dict, action, reward, next_obs, extra) diff --git a/rllib_contrib/dt/src/rllib_dt/dt/dt_torch_model.py b/rllib_contrib/dt/src/rllib_dt/dt/dt_torch_model.py deleted file mode 100644 index 55b1c58ee8e1f..0000000000000 --- a/rllib_contrib/dt/src/rllib_dt/dt/dt_torch_model.py +++ /dev/null @@ -1,237 +0,0 @@ -from typing import Dict, List - -import gymnasium as gym -import numpy as np -from gymnasium.spaces import Box, Discrete - -from ray.rllib import SampleBatch -from ray.rllib.models import ModelV2 -from ray.rllib.models.torch.mingpt import GPT, GPTConfig -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.policy.view_requirement import ViewRequirement -from ray.rllib.utils import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import ModelConfigDict, TensorType - -torch, nn = try_import_torch() - - -class DTTorchModel(TorchModelV2, nn.Module): - def __init__( - self, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - num_outputs: int, - model_config: ModelConfigDict, - name: str, - ): - TorchModelV2.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - nn.Module.__init__(self) - - self.obs_dim = num_outputs - - if isinstance(action_space, Discrete): - self.action_dim = action_space.n - elif isinstance(action_space, Box): - self.action_dim = np.prod(action_space.shape) - else: - raise NotImplementedError - - # Common model parameters - self.embed_dim = self.model_config["embed_dim"] - self.max_seq_len = self.model_config["max_seq_len"] - self.max_ep_len = self.model_config["max_ep_len"] - self.block_size = self.model_config["max_seq_len"] * 3 - - # Build all the nn modules - self.transformer = self.build_transformer() - self.position_encoder = self.build_position_encoder() - 
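Looking back at `DT.training_step` above, the segment batch size is chosen so that the number of transitions trained per iteration stays roughly constant. A quick illustration with the CartPole example's settings; the numbers are illustrative and assume roughly `train_batch_size` env steps come back per sampling round.

```python
import math

env_steps = 512     # train_batch_size in the CartPole example
max_seq_len = 3     # model context length

batch_size = int(math.ceil(env_steps / max_seq_len))   # 171 segments sampled
transitions_trained = batch_size * max_seq_len          # 513, close to env_steps
```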
self.action_encoder = self.build_action_encoder() - self.obs_encoder = self.build_obs_encoder() - self.return_encoder = self.build_return_encoder() - self.action_head = self.build_action_head() - self.obs_head = self.build_obs_head() - self.return_head = self.build_return_head() - - # Update view requirement - # NOTE: See DTTorchPolicy.action_distribution_fn for an explanation of - # why the ViewRequirements are like this - self.view_requirements = { - SampleBatch.OBS: ViewRequirement( - space=obs_space, shift=f"-{self.max_seq_len-1}:0" - ), - SampleBatch.ACTIONS: ViewRequirement( - space=action_space, shift=f"-{self.max_seq_len-1}:-1" - ), - SampleBatch.REWARDS: ViewRequirement(shift=-1), - SampleBatch.T: ViewRequirement(shift=f"-{self.max_seq_len-2}:0"), - SampleBatch.RETURNS_TO_GO: ViewRequirement( - shift=f"-{self.max_seq_len-1}:-1" - ), - } - - def build_transformer(self): - # build the model - gpt_config = GPTConfig( - block_size=self.block_size, - n_layer=self.model_config["num_layers"], - n_head=self.model_config["num_heads"], - n_embed=self.embed_dim, - embed_pdrop=self.model_config["embed_pdrop"], - resid_pdrop=self.model_config["resid_pdrop"], - attn_pdrop=self.model_config["attn_pdrop"], - ) - gpt = GPT(gpt_config) - return gpt - - def build_position_encoder(self): - return nn.Embedding(self.max_ep_len, self.embed_dim) - - def build_action_encoder(self): - if isinstance(self.action_space, Discrete): - return nn.Embedding(self.action_dim, self.embed_dim) - elif isinstance(self.action_space, Box): - return nn.Linear(self.action_dim, self.embed_dim) - else: - raise NotImplementedError - - def build_obs_encoder(self): - return nn.Linear(self.obs_dim, self.embed_dim) - - def build_return_encoder(self): - return nn.Linear(1, self.embed_dim) - - def build_action_head(self): - return nn.Linear(self.embed_dim, self.action_dim) - - def build_obs_head(self): - if not self.model_config["use_obs_output"]: - return None - return nn.Linear(self.embed_dim, self.obs_dim) - - def build_return_head(self): - if not self.model_config["use_return_output"]: - return None - return nn.Linear(self.embed_dim, 1) - - @override(ModelV2) - def forward( - self, - input_dict: Dict[str, TensorType], - state: List[TensorType], - seq_lens: TensorType, - ) -> (TensorType, List[TensorType]): - # True No-op forward method. - # TODO: Support image observation inputs - return input_dict["obs"], state - - def get_prediction( - self, - model_out: TensorType, - input_dict: SampleBatch, - return_attentions: bool = False, - ) -> Dict[str, TensorType]: - """Computes the output of a forward pass of the decision transformer. - - Args: - model_out: output observation tensor from the base model, [B, T, obs_dim]. - input_dict: a SampleBatch containing - RETURNS_TO_GO: [B, T (or T + 1), 1] of returns to go values. - ACTIONS: [B, T, action_dim] of actions. - T: [B, T] of timesteps. - ATTENTION_MASKS: [B, T] of attention masks. - return_attentions: Whether to return the attention tensors from the - transformer or not. - - Returns: - A dictionary with keys and values: - ACTIONS: [B, T, action_dim] of predicted actions. - if return_attentions: - "attentions": List of attentions tensors from the transformer. - if model_config["use_obs_output"]. - OBS: [B, T, obs_dim] of predicted observations. - if model_config["use_return_output"]. - RETURNS_to_GO: [B, T, 1] of predicted returns to go. 
- """ - B, T, *_ = model_out.shape - - obs_embeds = self.obs_encoder(model_out) - actions_embeds = self.action_encoder(input_dict[SampleBatch.ACTIONS]) - # Note: rtg might have an extra element at the end for targets - # During training rtg will have T + 1 for its time dimension to get the - # rtg regression target. During evaluation/inference rtg will have T for - # its time dimension as we don't need to call get_targets. - returns_embeds = self.return_encoder( - input_dict[SampleBatch.RETURNS_TO_GO][:, :T, :] - ) - timestep_embeds = self.position_encoder(input_dict[SampleBatch.T]) - - obs_embeds = obs_embeds + timestep_embeds - actions_embeds = actions_embeds + timestep_embeds - returns_embeds = returns_embeds + timestep_embeds - - # This makes the sequence look like (R_1, s_1, a_1, R_2, s_2, a_2, ...) - stacked_inputs = torch.stack( - (returns_embeds, obs_embeds, actions_embeds), dim=2 - ).reshape(B, 3 * T, self.embed_dim) - - attention_masks = input_dict[SampleBatch.ATTENTION_MASKS] - stacked_attention_masks = torch.stack( - (attention_masks, attention_masks, attention_masks), dim=2 - ).reshape(B, 3 * T) - - # forward the transformer model - output_embeds = self.transformer( - stacked_inputs, - attention_masks=stacked_attention_masks, - return_attentions=return_attentions, - ) - - outputs = {} - if return_attentions: - output_embeds, attentions = output_embeds - outputs["attentions"] = attentions - - # compute output heads - outputs[SampleBatch.ACTIONS] = self.action_head(output_embeds[:, 1::3, :]) - if self.model_config["use_obs_output"]: - outputs[SampleBatch.OBS] = self.obs_head(output_embeds[:, 0::3, :]) - if self.model_config["use_return_output"]: - outputs[SampleBatch.RETURNS_TO_GO] = self.return_head( - output_embeds[:, 2::3, :] - ) - - return outputs - - def get_targets( - self, model_out: TensorType, input_dict: SampleBatch - ) -> Dict[str, TensorType]: - """Compute the target predictions for a given input_dict. - - Args: - model_out: output observation tensor from the base model, [B, T, obs_dim]. - input_dict: a SampleBatch containing - RETURNS_TO_GO: [B, T + 1, 1] of returns to go values. - ACTIONS: [B, T, action_dim] of actions. - T: [B, T] of timesteps. - ATTENTION_MASKS: [B, T] of attention masks. - - Returns: - A dictionary with keys and values: - ACTIONS: [B, T, action_dim] of target actions. - if model_config["use_obs_output"] - OBS: [B, T, obs_dim] of target observations. - if model_config["use_return_output"] - RETURNS_to_GO: [B, T, 1] of target returns to go. 
- """ - targets = {SampleBatch.ACTIONS: input_dict[SampleBatch.ACTIONS].detach()} - if self.model_config["use_obs_output"]: - targets[SampleBatch.OBS] = model_out.detach() - if self.model_config["use_return_output"]: - targets[SampleBatch.RETURNS_TO_GO] = input_dict[SampleBatch.RETURNS_TO_GO][ - :, 1:, : - ].detach() - - return targets diff --git a/rllib_contrib/dt/src/rllib_dt/dt/dt_torch_policy.py b/rllib_contrib/dt/src/rllib_dt/dt/dt_torch_policy.py deleted file mode 100644 index e3f3323ff89f9..0000000000000 --- a/rllib_contrib/dt/src/rllib_dt/dt/dt_torch_policy.py +++ /dev/null @@ -1,576 +0,0 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union - -import gymnasium as gym -import numpy as np -import tree -from gymnasium.spaces import Box, Discrete -from rllib_dt.dt.dt_torch_model import DTTorchModel - -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.mingpt import configure_gpt_optimizer -from ray.rllib.models.torch.torch_action_dist import ( - TorchCategorical, - TorchDeterministic, - TorchDistributionWrapper, -) -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import LearningRateSchedule -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI, override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.threading import with_lock -from ray.rllib.utils.torch_utils import apply_grad_clipping -from ray.rllib.utils.typing import ( - TensorShape, - TensorStructType, - TensorType, - TrainerConfigDict, -) - -if TYPE_CHECKING: - from ray.rllib.evaluation import Episode # noqa - -torch, nn = try_import_torch() -F = nn.functional - - -class DTTorchPolicy(LearningRateSchedule, TorchPolicyV2): - def __init__( - self, - observation_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: TrainerConfigDict, - ): - LearningRateSchedule.__init__( - self, - config["lr"], - config["lr_schedule"], - ) - - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config["model"]["max_seq_len"], - ) - - @override(TorchPolicyV2) - def make_model_and_action_dist( - self, - ) -> Tuple[ModelV2, Type[TorchDistributionWrapper]]: - # Model - model_config = self.config["model"] - # TODO: make these better with better AlgorithmConfig options. 
- model_config.update( - embed_dim=self.config["embed_dim"], - max_ep_len=self.config["horizon"], - num_layers=self.config["num_layers"], - num_heads=self.config["num_heads"], - embed_pdrop=self.config["embed_pdrop"], - resid_pdrop=self.config["resid_pdrop"], - attn_pdrop=self.config["attn_pdrop"], - use_obs_output=self.config.get("loss_coef_obs", 0) > 0, - use_return_output=self.config.get("loss_coef_returns_to_go", 0) > 0, - ) - - num_outputs = int(np.prod(self.observation_space.shape)) - - model = ModelCatalog.get_model_v2( - obs_space=self.observation_space, - action_space=self.action_space, - num_outputs=num_outputs, - model_config=model_config, - framework=self.config["framework"], - model_interface=None, - default_model=DTTorchModel, - name="model", - ) - - # Action Distribution - if isinstance(self.action_space, Discrete): - action_dist = TorchCategorical - elif isinstance(self.action_space, Box): - action_dist = TorchDeterministic - else: - raise NotImplementedError - - return model, action_dist - - @override(TorchPolicyV2) - def optimizer( - self, - ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]: - optimizer = configure_gpt_optimizer( - model=self.model, - learning_rate=self.config["lr"], - weight_decay=self.config["optimizer"]["weight_decay"], - betas=self.config["optimizer"]["betas"], - ) - - return optimizer - - @override(TorchPolicyV2) - def postprocess_trajectory( - self, - sample_batch: SampleBatch, - other_agent_batches: Optional[Dict[Any, SampleBatch]] = None, - episode: Optional["Episode"] = None, - ) -> SampleBatch: - """Called by offline data reader after loading in one episode. - - Adds a `terminateds` flag at the end of trajectory so that SegmentationBuffer - can split using this flag to avoid duplicate trajectories. - """ - ep_len = sample_batch.env_steps() - sample_batch[SampleBatch.TERMINATEDS] = np.array( - [False] * (ep_len - 1) + [True] - ) - return sample_batch - - @PublicAPI - def get_initial_input_dict(self, observation: TensorStructType) -> SampleBatch: - """Get the initial input_dict to be passed into compute_single_action. - - Args: - observation: first (unbatched) observation from env.reset() - - Returns: - The input_dict for inference: { - OBS: [max_seq_len, obs_dim] array, - ACTIONS: [max_seq_len - 1, act_dim] array, - RETURNS_TO_GO: [max_seq_len - 1] array, - REWARDS: scalar, - TIMESTEPS: [max_seq_len - 1] array, - } - Note the sequence lengths are different, and is specified as per - view_requirements. Explanations in action_distribution_fn method. - """ - observation = convert_to_numpy(observation) - obs_shape = observation.shape - obs_dtype = observation.dtype - - act_shape = self.action_space.shape - act_dtype = self.action_space.dtype - - # Here we will pad all the required inputs to its proper sequence length - # as their ViewRequirement. 
- - observations = np.concatenate( - [ - np.zeros((self.max_seq_len - 1, *obs_shape), dtype=obs_dtype), - observation[None], - ], - axis=0, - ) - - actions = np.zeros((self.max_seq_len - 1, *act_shape), dtype=act_dtype) - - rtg = np.zeros(self.max_seq_len - 1, dtype=np.float32) - - rewards = np.zeros((), dtype=np.float32) - - # -1 for masking in action_distribution_fn - timesteps = np.full(self.max_seq_len - 1, fill_value=-1, dtype=np.int32) - - input_dict = SampleBatch( - { - SampleBatch.OBS: observations, - SampleBatch.ACTIONS: actions, - SampleBatch.RETURNS_TO_GO: rtg, - SampleBatch.REWARDS: rewards, - SampleBatch.T: timesteps, - } - ) - return input_dict - - @PublicAPI - def get_next_input_dict( - self, - input_dict: SampleBatch, - action: TensorStructType, - reward: TensorStructType, - next_obs: TensorStructType, - extra: Dict[str, TensorType], - ) -> SampleBatch: - """Returns a new input_dict after stepping through the environment once. - - Args: - input_dict: the input dict passed into compute_single_action. - action: the (unbatched) action taken this step. - reward: the (unbatched) reward from env.step - next_obs: the (unbatached) next observation from env.step - extra: the extra action out from compute_single_action. - In this case contains current returns to go *before* the current - reward is subtracted from target_return. - - Returns: - A new input_dict to be passed into compute_single_action. - The input_dict for inference: { - OBS: [max_seq_len, obs_dim] array, - ACTIONS: [max_seq_len - 1, act_dim] array, - RETURNS_TO_GO: [max_seq_len - 1] array, - REWARDS: scalar, - TIMESTEPS: [max_seq_len - 1] array, - } - Note the sequence lengths are different, and is specified as per - view_requirements. Explanations in action_distribution_fn method. - """ - # creates a copy of input_dict with only numpy arrays - input_dict = tree.map_structure(convert_to_numpy, input_dict) - # convert everything else to numpy as well - action, reward, next_obs, extra = convert_to_numpy( - (action, reward, next_obs, extra) - ) - - # check dimensions - assert input_dict[SampleBatch.OBS].shape == ( - self.max_seq_len, - *self.observation_space.shape, - ) - assert input_dict[SampleBatch.ACTIONS].shape == ( - self.max_seq_len - 1, - *self.action_space.shape, - ) - assert input_dict[SampleBatch.RETURNS_TO_GO].shape == (self.max_seq_len - 1,) - assert input_dict[SampleBatch.T].shape == (self.max_seq_len - 1,) - - # Shift observations - input_dict[SampleBatch.OBS] = np.concatenate( - [ - input_dict[SampleBatch.OBS][1:], - next_obs[None], - ], - axis=0, - ) - - # Shift actions - input_dict[SampleBatch.ACTIONS] = np.concatenate( - [ - input_dict[SampleBatch.ACTIONS][1:], - action[None], - ], - axis=0, - ) - - # Reward is not a sequence, it's only used to calculate next rtg. - input_dict[SampleBatch.REWARDS] = np.asarray(reward) - - # See methods action_distribution_fn and extra_action_out for an explanation - # of why this is done. 
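The sliding-window update performed above is easiest to see with concrete numbers; the sketch uses `max_seq_len=3` and the same values as the deleted `test_dt.py` inference test further below.

```python
import numpy as np

# Observation window before the step: two zero-padded slots plus the reset obs.
obs_window = np.array([[0.0, 0.0, 0.0],
                       [0.0, 0.0, 0.0],
                       [0.0, 1.0, 2.0]], dtype=np.float32)
next_obs = np.array([3.0, 4.0, 5.0], dtype=np.float32)

# Drop the oldest entry, append the newest observation on the right.
obs_window = np.concatenate([obs_window[1:], next_obs[None]], axis=0)
# -> [[0., 0., 0.], [0., 1., 2.], [3., 4., 5.]]

# Timesteps shift the same way and the last entry is incremented by one.
timesteps = np.array([-1, -1], dtype=np.int32)
timesteps = np.concatenate([timesteps[1:], timesteps[-1:] + 1], axis=0)
# -> [-1, 0]
```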
- input_dict[SampleBatch.RETURNS_TO_GO] = np.concatenate( - [ - input_dict[SampleBatch.RETURNS_TO_GO][1:], - np.asarray(extra[SampleBatch.RETURNS_TO_GO])[None], - ], - axis=0, - ) - - # Shift and increment timesteps - input_dict[SampleBatch.T] = np.concatenate( - [ - input_dict[SampleBatch.T][1:], - input_dict[SampleBatch.T][-1:] + 1, - ], - axis=0, - ) - - return input_dict - - @DeveloperAPI - def get_initial_rtg_tensor( - self, - shape: TensorShape, - dtype: Optional[Type] = torch.float32, - device: Optional["torch.device"] = None, - ): - """Returns a initial/target returns-to-go tensor of the given shape. - - Args: - shape: Shape of the rtg tensor. - dtype: Type of the data in the tensor. Defaults to torch.float32. - device: The device this tensor should be on. Defaults to self.device. - """ - if device is None: - device = self.device - if dtype is None: - device = torch.float32 - - assert self.config["target_return"] is not None, "Must specify target_return." - initial_rtg = torch.full( - shape, - fill_value=self.config["target_return"], - dtype=dtype, - device=device, - ) - return initial_rtg - - @override(TorchPolicyV2) - @DeveloperAPI - def compute_actions( - self, - *args, - **kwargs, - ) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]: - raise ValueError("Please use compute_actions_from_input_dict instead.") - - @override(TorchPolicyV2) - def compute_actions_from_input_dict( - self, - input_dict: Union[SampleBatch, Dict[str, TensorStructType]], - explore: bool = None, - timestep: Optional[int] = None, - **kwargs, - ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: - """ - Args: - input_dict: input_dict (that contains a batch dimension for each value). - Keys and shapes: { - OBS: [batch_size, max_seq_len, obs_dim], - ACTIONS: [batch_size, max_seq_len - 1, act_dim], - RETURNS_TO_GO: [batch_size, max_seq_len - 1], - REWARDS: [batch_size], - TIMESTEPS: [batch_size, max_seq_len - 1], - } - explore: unused. - timestep: unused. - Returns: - A tuple consisting of a) actions, b) state_out, c) extra_fetches. - """ - with torch.no_grad(): - # Pass lazy (torch) tensor dict to Model as `input_dict`. - input_dict = input_dict.copy() - input_dict = self._lazy_tensor_dict(input_dict) - input_dict.set_training(True) - - actions, state_out, extra_fetches = self._compute_action_helper(input_dict) - return actions, state_out, extra_fetches - - # TODO: figure out what this with_lock does and why it's only on the helper method. - @with_lock - @override(TorchPolicyV2) - def _compute_action_helper(self, input_dict): - # Switch to eval mode. - self.model.eval() - - batch_size = input_dict[SampleBatch.OBS].shape[0] - - # NOTE: This is probably the most confusing part of the code, made to work with - # env_runner and SimpleListCollector during evaluation, and thus should - # be changed for the new Policy and Connector API. - # So I'll explain how it works. - - # Add current timestep (+1 because -1 is first observation) - # NOTE: ViewRequirement of timestep is -(max_seq_len-2):0. - # The wierd limits is because RLlib treats initial obs as time -1, - # and then 0 is (act, rew, next_obs), etc. - # So we only collect max_seq_len-1 from the rollout and create the current - # step here by adding 1. - # Decision transformer treats initial observation as timestep 0, giving us - # 0 is (obs, act, rew). 
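A small sketch of the padding and masking convention this helper relies on (the mask itself is computed a few lines below): rollout collection front-pads timesteps with -1, and every position at a negative timestep is masked out of the attention.

```python
import numpy as np

# max_seq_len = 3 example: one real step collected so far, two padded slots.
timesteps = np.array([-1, -1, 0], dtype=np.int32)
attention_mask = (timesteps >= 0).astype(np.float32)   # -> [0., 0., 1.]
# The -1 entries are then clipped to 0 before being embedded, because the
# position embedding cannot take negative indices.
```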
- timesteps = input_dict[SampleBatch.T] - new_timestep = timesteps[:, -1:] + 1 - input_dict[SampleBatch.T] = torch.cat([timesteps, new_timestep], dim=1) - - # mask out any padded value at start of rollout - # NOTE: the other reason for doing this is that evaluation rollout front - # pads timesteps with -1, so using this we can find out when we need to mask - # out the front section of the batch. - input_dict[SampleBatch.ATTENTION_MASKS] = torch.where( - input_dict[SampleBatch.T] >= 0, 1.0, 0.0 - ) - - # Remove out-of-bound -1 timesteps after attention mask is calculated - uncliped_timesteps = input_dict[SampleBatch.T] - input_dict[SampleBatch.T] = torch.where( - uncliped_timesteps < 0, - torch.zeros_like(uncliped_timesteps), - uncliped_timesteps, - ) - - # Computes returns-to-go. - # NOTE: There are two rtg calculations: updated_rtg and initial_rtg. - # updated_rtg takes the previous rtg value (the ViewRequirement is - # -(max_seq_len-1):-1), and subtracts the last reward from it. - rtg = input_dict[SampleBatch.RETURNS_TO_GO] - last_rtg = rtg[:, -1] - last_reward = input_dict[SampleBatch.REWARDS] - updated_rtg = last_rtg - last_reward - # initial_rtg simply is filled with target_return. - # These two are both only for the current timestep. - initial_rtg = self.get_initial_rtg_tensor( - (batch_size, 1), dtype=rtg.dtype, device=rtg.device - ) - - # Then based on whether we are currently at the first timestep or not - # we use the initial_rtg or updated_rtg. - new_rtg = torch.where(new_timestep == 0, initial_rtg, updated_rtg[:, None]) - # Append the new_rtg to the batch. - input_dict[SampleBatch.RETURNS_TO_GO] = torch.cat([rtg, new_rtg], dim=1)[ - ..., None - ] - - # Pad current action (is not actually attended to and used during inference) - past_actions = input_dict[SampleBatch.ACTIONS] - action_pad = torch.zeros( - (batch_size, 1, *past_actions.shape[2:]), - dtype=past_actions.dtype, - device=past_actions.device, - ) - input_dict[SampleBatch.ACTIONS] = torch.cat([past_actions, action_pad], dim=1) - - # Run inference on model - model_out, _ = self.model(input_dict) # noop, just returns obs. - preds = self.model.get_prediction(model_out, input_dict) - dist_inputs = preds[SampleBatch.ACTIONS][:, -1] - - # Get the actions from the action_dist. - action_dist = self.dist_class(dist_inputs, self.model) - actions = action_dist.deterministic_sample() - - # This is used by env_runner and is actually how it adds custom keys to - # SimpleListCollector and allows ViewRequirements to work. - # This is also used in user inference in get_next_input_dict, which takes - # this output as one of the input. - extra_fetches = { - # new_rtg still has the leftover extra 3rd dimension for inference - SampleBatch.RETURNS_TO_GO: new_rtg.squeeze(-1), - SampleBatch.ACTION_DIST_INPUTS: dist_inputs, - } - - # Update our global timestep by the batch size. - self.global_timestep += len(input_dict[SampleBatch.CUR_OBS]) - - return convert_to_numpy((actions, [], extra_fetches)) - - @override(TorchPolicyV2) - def loss( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - """Loss function. - - Args: - model: The ModelV2 to run foward pass on. - dist_class: The distribution of this policy. - train_batch: Training SampleBatch. 
- Keys and shapes: { - OBS: [batch_size, max_seq_len, obs_dim], - ACTIONS: [batch_size, max_seq_len, act_dim], - RETURNS_TO_GO: [batch_size, max_seq_len + 1, 1], - TIMESTEPS: [batch_size, max_seq_len], - ATTENTION_MASKS: [batch_size, max_seq_len], - } - Returns: - Loss scalar tensor. - """ - train_batch = self._lazy_tensor_dict(train_batch) - - # policy forward and get prediction - model_out, _ = self.model(train_batch) # noop, just returns obs. - preds = self.model.get_prediction(model_out, train_batch) - - # get the regression targets - targets = self.model.get_targets(model_out, train_batch) - - # get the attention masks for masked-loss - masks = train_batch[SampleBatch.ATTENTION_MASKS] - - # compute loss - loss = self._masked_loss(preds, targets, masks) - - self.log("cur_lr", torch.tensor(self.cur_lr)) - - return loss - - def _masked_loss(self, preds, targets, masks): - losses = [] - for key in targets: - assert ( - key in preds - ), "for target {key} there is no prediction from the output of the model" - loss_coef = self.config.get(f"loss_coef_{key}", 1.0) - if self._is_discrete(key): - loss = loss_coef * self._masked_cross_entropy_loss( - preds[key], targets[key], masks - ) - else: - loss = loss_coef * self._masked_mse_loss( - preds[key], targets[key], masks - ) - - losses.append(loss) - self.log(f"{key}_loss", loss) - - return sum(losses) - - def _is_discrete(self, key): - return key == SampleBatch.ACTIONS and isinstance(self.action_space, Discrete) - - def _masked_cross_entropy_loss( - self, - preds: TensorType, - targets: TensorType, - masks: TensorType, - ) -> TensorType: - """Computes cross-entropy loss between preds and targets, subject to a mask. - - Args: - preds: logits of shape [B1, ..., Bn, M] - targets: index targets for preds of shape [B1, ..., Bn] - masks: 0 means don't compute loss, 1 means compute loss - shape [B1, ..., Bn] - - Returns: - Scalar cross entropy loss. - """ - losses = F.cross_entropy( - preds.reshape(-1, preds.shape[-1]), - targets.reshape(-1).long(), - reduction="none", - ) - losses = losses * masks.reshape(-1) - return losses.mean() - - def _masked_mse_loss( - self, - preds: TensorType, - targets: TensorType, - masks: TensorType, - ) -> TensorType: - """Computes MSE loss between preds and targets, subject to a mask. - - Args: - preds: logits of shape [B1, ..., Bn, M] - targets: index targets for preds of shape [B1, ..., Bn] - masks: 0 means don't compute loss, 1 means compute loss - shape [B1, ..., Bn] - - Returns: - Scalar cross entropy loss. 
- """ - losses = F.mse_loss(preds, targets, reduction="none") - losses = losses * masks.reshape( - *masks.shape, *([1] * (len(preds.shape) - len(masks.shape))) - ) - return losses.mean() - - @override(TorchPolicyV2) - def extra_grad_process(self, local_optimizer, loss): - return apply_grad_clipping(self, local_optimizer, loss) - - def log(self, key, value): - # internal log function - self.model.tower_stats[key] = value - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - stats_dict = { - k: torch.stack(self.get_tower_stats(k)).mean().item() - for k in self.model.tower_stats - } - return stats_dict diff --git a/rllib_contrib/dt/src/rllib_dt/dt/segmentation_buffer.py b/rllib_contrib/dt/src/rllib_dt/dt/segmentation_buffer.py deleted file mode 100644 index 658f54c57ab5e..0000000000000 --- a/rllib_contrib/dt/src/rllib_dt/dt/segmentation_buffer.py +++ /dev/null @@ -1,211 +0,0 @@ -import logging -import random -from collections import defaultdict -from typing import List - -import numpy as np - -from ray.rllib.evaluation.postprocessing import discount_cumsum -from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch, concat_samples -from ray.rllib.utils.typing import SampleBatchType - -logger = logging.getLogger(__name__) - - -def front_pad_with_zero(arr: np.ndarray, max_seq_len: int): - """Pad arr on the front/left with 0 up to max_seq_len.""" - length = arr.shape[0] - pad_length = max_seq_len - length - if pad_length > 0: - return np.concatenate( - [np.zeros((pad_length, *arr.shape[1:]), dtype=arr.dtype), arr], axis=0 - ) - else: - return arr - - -class SegmentationBuffer: - """A minimal replay buffer used by Decision Transformer (DT) - to process episodes into max_seq_len length segments and do shuffling. - """ - - def __init__( - self, - capacity: int = 20, - max_seq_len: int = 20, - max_ep_len: int = 1000, - ): - """ - Args: - capacity: Maximum number of episodes the buffer can store. - max_seq_len: Length of segments that are sampled. - max_ep_len: Maximum length of episodes added. - """ - self.capacity = capacity - self.max_seq_len = max_seq_len - self.max_ep_len = max_ep_len - - self._buffer: List[SampleBatch] = [] - - def add(self, batch: SampleBatch): - """Add a SampleBatch of episodes. Replace if full. - - Args: - batch: SampleBatch of full episodes. - """ - episodes = batch.split_by_episode(key=SampleBatch.DONES) - for episode in episodes: - self._add_single_episode(episode) - - def _add_single_episode(self, episode: SampleBatch): - ep_len = episode.env_steps() - - if ep_len > self.max_ep_len: - raise ValueError( - f"The maximum rollout length is {self.max_ep_len} but we tried to add a" - f"rollout of {episode.env_steps()} steps to the SegmentationBuffer." - ) - - # compute returns to go - rewards = episode[SampleBatch.REWARDS].reshape(-1) - rtg = discount_cumsum(rewards, 1.0) - # rtg needs to be one longer than the rest for return targets during training. - rtg = np.concatenate([rtg, np.zeros((1,), dtype=np.float32)], axis=0) - episode[SampleBatch.RETURNS_TO_GO] = rtg[:, None] - - # Add timesteps and masks - episode[SampleBatch.T] = np.arange(ep_len, dtype=np.int32) - episode[SampleBatch.ATTENTION_MASKS] = np.ones(ep_len, dtype=np.float32) - - # Add to the buffer. - if len(self._buffer) < self.capacity: - self._buffer.append(episode) - else: - # TODO: add config for sampling and eviction policies. 
- replace_ind = random.randint(0, self.capacity - 1) - self._buffer[replace_ind] = episode - - def sample(self, batch_size: int) -> SampleBatch: - """Sample segments from the buffer. - - Args: - batch_size: number of segments to sample. - - Returns: - SampleBatch of segments with keys and shape { - OBS: [batch_size, max_seq_len, obs_dim], - ACTIONS: [batch_size, max_seq_len, act_dim], - RETURNS_TO_GO: [batch_size, max_seq_len + 1, 1], - T: [batch_size, max_seq_len], - ATTENTION_MASKS: [batch_size, max_seq_len], - } - """ - samples = [self._sample_single() for _ in range(batch_size)] - return concat_samples(samples) - - def _sample_single(self) -> SampleBatch: - # TODO: sample proportional to episode length - # Sample a random episode from the buffer and then sample a random - # segment from that episode. - buffer_ind = random.randint(0, len(self._buffer) - 1) - - episode = self._buffer[buffer_ind] - ep_len = episode[SampleBatch.OBS].shape[0] - - # ei (end index) is exclusive - ei = random.randint(1, ep_len) - # si (start index) is inclusive - si = max(ei - self.max_seq_len, 0) - - # Slice segments from obs, actions, timesteps, and rtgs - obs = episode[SampleBatch.OBS][si:ei] - actions = episode[SampleBatch.ACTIONS][si:ei] - timesteps = episode[SampleBatch.T][si:ei] - masks = episode[SampleBatch.ATTENTION_MASKS][si:ei] - # Note that returns-to-go needs an extra elem as the rtg target for the last - # action token passed into the transformer. - returns_to_go = episode[SampleBatch.RETURNS_TO_GO][si : ei + 1] - - # Front-pad if we're at the beginning of the episode and we need more tokens - # to pass into the transformer. Or if the episode length is shorter - # than max_seq_len. - obs = front_pad_with_zero(obs, self.max_seq_len) - actions = front_pad_with_zero(actions, self.max_seq_len) - returns_to_go = front_pad_with_zero(returns_to_go, self.max_seq_len + 1) - timesteps = front_pad_with_zero(timesteps, self.max_seq_len) - masks = front_pad_with_zero(masks, self.max_seq_len) - - assert obs.shape[0] == self.max_seq_len - assert actions.shape[0] == self.max_seq_len - assert timesteps.shape[0] == self.max_seq_len - assert masks.shape[0] == self.max_seq_len - assert returns_to_go.shape[0] == self.max_seq_len + 1 - - return SampleBatch( - { - SampleBatch.OBS: obs[None], - SampleBatch.ACTIONS: actions[None], - SampleBatch.RETURNS_TO_GO: returns_to_go[None], - SampleBatch.T: timesteps[None], - SampleBatch.ATTENTION_MASKS: masks[None], - } - ) - - -class MultiAgentSegmentationBuffer: - """A minimal replay buffer used by Decision Transformer (DT) - to process episodes into max_seq_len length segments and do shuffling. - Stores MultiAgentSample. - """ - - def __init__( - self, - capacity: int = 20, - max_seq_len: int = 20, - max_ep_len: int = 1000, - ): - """ - Args: - capacity: Maximum number of episodes the buffer can store. - max_seq_len: Length of segments that are sampled. - max_ep_len: Maximum length of episodes added. - """ - - def new_buffer(): - return SegmentationBuffer(capacity, max_seq_len, max_ep_len) - - self.buffers = defaultdict(new_buffer) - - def add(self, batch: SampleBatchType): - """Add a MultiAgentBatch of episodes. Replace if full. - - Args: - batch: MultiAgentBatch of full episodes. - """ - # Make a copy so the replay buffer doesn't pin plasma memory. - batch = batch.copy() - # Handle everything as if multi-agent. 
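When a sampled segment starts at the beginning of an episode, or the episode is shorter than `max_seq_len`, the slices above come back short and are left-padded. A quick sketch of the resulting shapes, reusing the `front_pad_with_zero` helper defined earlier in this file (assumes the `rllib_dt` package removed in this PR is installed):

```python
import numpy as np
from rllib_dt.dt.segmentation_buffer import front_pad_with_zero

obs_segment = np.ones((2, 4), dtype=np.float32)       # only 2 steps available
padded = front_pad_with_zero(obs_segment, max_seq_len=5)
assert padded.shape == (5, 4)
assert (padded[:3] == 0).all() and (padded[3:] == 1).all()
# The attention mask is padded the same way, so the zeroed rows are ignored by
# the transformer and by the masked losses in DTTorchPolicy.
```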
- batch = batch.as_multi_agent() - - for policy_id, sample_batch in batch.policy_batches.items(): - self.buffers[policy_id].add(sample_batch) - - def sample(self, batch_size: int) -> MultiAgentBatch: - """Sample segments from the buffer. - - Args: - batch_size: number of segments to sample. - - Returns: - MultiAgentBatch of segments with keys and shape { - OBS: [batch_size, max_seq_len, obs_dim], - ACTIONS: [batch_size, max_seq_len, act_dim], - RETURNS_TO_GO: [batch_size, max_seq_len + 1, 1], - T: [batch_size, max_seq_len], - ATTENTION_MASKS: [batch_size, max_seq_len], - } - """ - samples = {} - for policy_id, buffer in self.buffers.items(): - samples[policy_id] = buffer.sample(batch_size) - return MultiAgentBatch(samples, batch_size) diff --git a/rllib_contrib/dt/tests/test_dt.py b/rllib_contrib/dt/tests/test_dt.py deleted file mode 100644 index 46a68e437708e..0000000000000 --- a/rllib_contrib/dt/tests/test_dt.py +++ /dev/null @@ -1,269 +0,0 @@ -import unittest -from typing import Dict - -import gymnasium as gym -import numpy as np -from rllib_dt.dt.dt import DTConfig - -import ray -from ray.rllib import SampleBatch -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import check_train_results - -tf1, tf, tfv = try_import_tf() -torch, _ = try_import_torch() - - -def _assert_input_dict_equals(d1: Dict[str, np.ndarray], d2: Dict[str, np.ndarray]): - for key in d1.keys(): - assert key in d2.keys() - - for key in d2.keys(): - assert key in d1.keys() - - for key in d1.keys(): - assert isinstance(d1[key], np.ndarray) - assert isinstance(d2[key], np.ndarray) - assert d1[key].shape == d2[key].shape - assert np.allclose(d1[key], d2[key]) - - -class TestDT(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_dt_compilation(self): - """Test whether a DT algorithm can be built with all supported frameworks.""" - - config = ( - DTConfig() - .environment( - env="Pendulum-v1", - clip_actions=True, - normalize_actions=True, - ) - .framework("torch") - .offline_data( - input_="dataset", - input_config={ - "format": "json", - "paths": [ - "s3://anonymous@air-example-data/rllib/pendulum/large.json" - ], - }, - actions_in_input_normalized=True, - ) - .training( - train_batch_size=200, - replay_buffer_config={ - "capacity": 8, - }, - model={ - "max_seq_len": 4, - }, - num_layers=1, - num_heads=1, - embed_dim=64, - horizon=200, - ) - .evaluation( - target_return=-120, - evaluation_interval=2, - evaluation_num_workers=0, - evaluation_duration=10, - evaluation_duration_unit="episodes", - evaluation_parallel_to_training=False, - evaluation_config=DTConfig.overrides(input_="sampler", explore=False), - ) - .rollouts( - num_rollout_workers=0, - ) - .reporting( - min_train_timesteps_per_iteration=10, - ) - .experimental( - _disable_preprocessor_api=True, - ) - ) - - num_iterations = 4 - - for _ in ["torch"]: - algo = config.build() - # check if 4 iterations raises any errors - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - if (i + 1) % 2 == 0: - # evaluation happens every 2 iterations - eval_results = results["evaluation"] - print( - f"iter={algo.iteration} " - f"R={eval_results['episode_reward_mean']}" - ) - - # do example inference rollout - env = gym.make("Pendulum-v1") - - obs, _ = env.reset() - input_dict = algo.get_initial_input_dict(obs) - - for _ in range(200): - action, _, extra = 
algo.compute_single_action(input_dict=input_dict) - obs, reward, terminated, truncated, _ = env.step(action) - if terminated or truncated: - break - else: - input_dict = algo.get_next_input_dict( - input_dict, - action, - reward, - obs, - extra, - ) - - env.close() - algo.stop() - - def test_inference_methods(self): - """Test inference methods.""" - - config = ( - DTConfig() - .environment( - env="Pendulum-v1", - clip_actions=True, - normalize_actions=True, - ) - .framework("torch") - .training( - train_batch_size=200, - replay_buffer_config={ - "capacity": 8, - }, - model={ - "max_seq_len": 3, - }, - num_layers=1, - num_heads=1, - embed_dim=64, - horizon=200, - ) - .evaluation( - target_return=-120, - ) - .rollouts( - num_rollout_workers=0, - ) - .experimental(_disable_preprocessor_api=True) - ) - algo = config.build() - - # Do a controlled fake rollout for 2 steps and check input_dict - # first input_dict - obs = np.array([0.0, 1.0, 2.0]) - - input_dict = algo.get_initial_input_dict(obs) - target = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 1.0, 2.0], - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: np.array([[0.0], [0.0]], dtype=np.float32), - SampleBatch.RETURNS_TO_GO: np.array([0.0, 0.0], dtype=np.float32), - SampleBatch.REWARDS: np.zeros((), dtype=np.float32), - SampleBatch.T: np.array([-1, -1], dtype=np.int32), - } - ) - _assert_input_dict_equals(input_dict, target) - - # forward pass with first input_dict - action, _, extra = algo.compute_single_action(input_dict=input_dict) - assert action.shape == (1,) - assert SampleBatch.RETURNS_TO_GO in extra - assert np.isclose(extra[SampleBatch.RETURNS_TO_GO], -120.0) - - # second input_dict - action = np.array([0.5]) - obs = np.array([3.0, 4.0, 5.0]) - reward = -10.0 - - input_dict = algo.get_next_input_dict( - input_dict, - action, - reward, - obs, - extra, - ) - target = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [0.0, 0.0, 0.0], - [0.0, 1.0, 2.0], - [3.0, 4.0, 5.0], - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: np.array([[0.0], [0.5]], dtype=np.float32), - SampleBatch.RETURNS_TO_GO: np.array([0.0, -120.0], dtype=np.float32), - SampleBatch.REWARDS: np.asarray(-10.0), - SampleBatch.T: np.array([-1, 0], dtype=np.int32), - } - ) - _assert_input_dict_equals(input_dict, target) - - # forward pass with second input_dict - action, _, extra = algo.compute_single_action(input_dict=input_dict) - assert action.shape == (1,) - assert SampleBatch.RETURNS_TO_GO in extra - assert np.isclose(extra[SampleBatch.RETURNS_TO_GO], -110.0) - - # third input_dict - action = np.array([-0.2]) - obs = np.array([6.0, 7.0, 8.0]) - reward = -20.0 - - input_dict = algo.get_next_input_dict( - input_dict, - action, - reward, - obs, - extra, - ) - target = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [0.0, 1.0, 2.0], - [3.0, 4.0, 5.0], - [6.0, 7.0, 8.0], - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: np.array([[0.5], [-0.2]], dtype=np.float32), - SampleBatch.RETURNS_TO_GO: np.array([-120, -110.0], dtype=np.float32), - SampleBatch.REWARDS: np.asarray(-20.0), - SampleBatch.T: np.array([0, 1], dtype=np.int32), - } - ) - _assert_input_dict_equals(input_dict, target) - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/dt/tests/test_dt_model.py b/rllib_contrib/dt/tests/test_dt_model.py deleted file mode 100644 index 4ebecff5091f7..0000000000000 --- a/rllib_contrib/dt/tests/test_dt_model.py +++ /dev/null @@ -1,302 
+0,0 @@ -import unittest - -import gymnasium as gym -import numpy as np -from rllib_dt.dt.dt_torch_model import DTTorchModel - -import ray -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.torch_utils import convert_to_torch_tensor - -tf1, tf, tfv = try_import_tf() -torch, _ = try_import_torch() - - -def _assert_outputs_equal(outputs): - for i in range(1, len(outputs)): - for key in outputs[0].keys(): - assert np.allclose( - outputs[0][key], outputs[i][key] - ), "outputs are different but they shouldn't be." - - -def _assert_outputs_not_equal(outputs): - for i in range(1, len(outputs)): - for key in outputs[0].keys(): - assert not np.allclose( - outputs[0][key], outputs[i][key] - ), "some outputs are the same but they shouldn't be." - - -def _generate_input_dict(B, T, obs_space, action_space): - """Generate input_dict that has completely fake values.""" - # generate deterministic inputs - # obs - obs = np.arange(B * T * obs_space.shape[0], dtype=np.float32).reshape( - (B, T, obs_space.shape[0]) - ) - # actions - if isinstance(action_space, gym.spaces.Box): - act = np.arange(B * T * action_space.shape[0], dtype=np.float32).reshape( - (B, T, action_space.shape[0]) - ) - else: - act = np.mod(np.arange(B * T, dtype=np.int32).reshape((B, T)), action_space.n) - # returns to go - rtg = np.arange(B * (T + 1), dtype=np.float32).reshape((B, T + 1, 1)) - # timesteps - timesteps = np.stack([np.arange(T, dtype=np.int32) for _ in range(B)], axis=0) - # attention mask - mask = np.ones((B, T), dtype=np.float32) - - input_dict = SampleBatch( - { - SampleBatch.OBS: obs, - SampleBatch.ACTIONS: act, - SampleBatch.RETURNS_TO_GO: rtg, - SampleBatch.T: timesteps, - SampleBatch.ATTENTION_MASKS: mask, - } - ) - input_dict = convert_to_torch_tensor(input_dict) - return input_dict - - -class TestDTModel(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_torch_model_init(self): - """Test models are initialized properly""" - model_config = { - "embed_dim": 32, - "num_layers": 2, - "max_seq_len": 4, - "max_ep_len": 10, - "num_heads": 2, - "embed_pdrop": 0.1, - "resid_pdrop": 0.1, - "attn_pdrop": 0.1, - "use_obs_output": False, - "use_return_output": False, - } - - num_outputs = 2 - observation_space = gym.spaces.Box(-1.0, 1.0, shape=(num_outputs,)) - - action_dim = 5 - action_spaces = [ - gym.spaces.Box(-1.0, 1.0, shape=(action_dim,)), - gym.spaces.Discrete(action_dim), - ] - - B, T = 3, 4 - - for action_space in action_spaces: - # Generate input dict. 
- input_dict = _generate_input_dict(B, T, observation_space, action_space) - - # Do random initialization a few times and make sure outputs are different - outputs = [] - for _ in range(10): - model = DTTorchModel( - observation_space, - action_space, - num_outputs, - model_config, - "model", - ) - # so dropout is not in effect - model.eval() - model_out, _ = model(input_dict) - output = model.get_prediction(model_out, input_dict) - outputs.append(convert_to_numpy(output)) - _assert_outputs_not_equal(outputs) - - # Initialize once and make sure dropout is working - model = DTTorchModel( - observation_space, - action_space, - num_outputs, - model_config, - "model", - ) - - # Dropout should make outputs different in training mode - model.train() - outputs = [] - for _ in range(10): - model_out, _ = model(input_dict) - output = model.get_prediction(model_out, input_dict) - outputs.append(convert_to_numpy(output)) - _assert_outputs_not_equal(outputs) - - # Dropout should make outputs the same in eval mode - model.eval() - outputs = [] - for _ in range(10): - model_out, _ = model(input_dict) - output = model.get_prediction(model_out, input_dict) - outputs.append(convert_to_numpy(output)) - _assert_outputs_equal(outputs) - - def test_torch_model_prediction_target(self): - """Test the get_prediction and get_targets function.""" - model_config = { - "embed_dim": 16, - "num_layers": 3, - "max_seq_len": 3, - "max_ep_len": 9, - "num_heads": 1, - "embed_pdrop": 0.2, - "resid_pdrop": 0.2, - "attn_pdrop": 0.2, - "use_obs_output": True, - "use_return_output": True, - } - - num_outputs = 5 - observation_space = gym.spaces.Box(-1.0, 1.0, shape=(num_outputs,)) - - action_dim = 2 - action_spaces = [ - gym.spaces.Box(-1.0, 1.0, shape=(action_dim,)), - gym.spaces.Discrete(action_dim), - ] - - B, T = 2, 3 - - for action_space in action_spaces: - # Generate input dict. - input_dict = _generate_input_dict(B, T, observation_space, action_space) - - # Make model and forward pass. 
- model = DTTorchModel( - observation_space, - action_space, - num_outputs, - model_config, - "model", - ) - model_out, _ = model(input_dict) - preds = model.get_prediction(model_out, input_dict) - target = model.get_targets(model_out, input_dict) - - preds = convert_to_numpy(preds) - target = convert_to_numpy(target) - - # Test the content and shape of output and target - if isinstance(action_space, gym.spaces.Box): - # test preds shape - self.assertEqual(preds[SampleBatch.ACTIONS].shape, (B, T, action_dim)) - # test target shape and content - self.assertEqual(target[SampleBatch.ACTIONS].shape, (B, T, action_dim)) - assert np.allclose( - target[SampleBatch.ACTIONS], - input_dict[SampleBatch.ACTIONS], - ) - else: - # test preds shape - self.assertEqual(preds[SampleBatch.ACTIONS].shape, (B, T, action_dim)) - # test target shape and content - self.assertEqual(target[SampleBatch.ACTIONS].shape, (B, T)) - assert np.allclose( - target[SampleBatch.ACTIONS], - input_dict[SampleBatch.ACTIONS], - ) - - # test preds shape - self.assertEqual(preds[SampleBatch.OBS].shape, (B, T, num_outputs)) - # test target shape and content - self.assertEqual(target[SampleBatch.OBS].shape, (B, T, num_outputs)) - assert np.allclose( - target[SampleBatch.OBS], - input_dict[SampleBatch.OBS], - ) - - # test preds shape - self.assertEqual(preds[SampleBatch.RETURNS_TO_GO].shape, (B, T, 1)) - # test target shape and content - self.assertEqual(target[SampleBatch.RETURNS_TO_GO].shape, (B, T, 1)) - assert np.allclose( - target[SampleBatch.RETURNS_TO_GO], - input_dict[SampleBatch.RETURNS_TO_GO][:, 1:, :], - ) - - def test_causal_masking(self): - """Test that the transformer model' causal masking works.""" - model_config = { - "embed_dim": 16, - "num_layers": 2, - "max_seq_len": 4, - "max_ep_len": 10, - "num_heads": 2, - "embed_pdrop": 0, - "resid_pdrop": 0, - "attn_pdrop": 0, - "use_obs_output": True, - "use_return_output": True, - } - - observation_space = gym.spaces.Box(-1.0, 1.0, shape=(4,)) - action_space = gym.spaces.Box(-1.0, 1.0, shape=(2,)) - B = 2 - T = model_config["max_seq_len"] - - # Generate input dict. - input_dict = _generate_input_dict(B, T, observation_space, action_space) - - # make model and forward with attention - model = DTTorchModel( - observation_space, - action_space, - 4, - model_config, - "model", - ) - model_out, _ = model(input_dict) - preds = model.get_prediction(model_out, input_dict, return_attentions=True) - preds = convert_to_numpy(preds) - - # test properties of attentions - attentions = preds["attentions"] - self.assertEqual( - len(attentions), - model_config["num_layers"], - "there should as many attention tensors as layers.", - ) - - # used to select the causal padded element of each attention tensor - select_mask = np.triu(np.ones((3 * T, 3 * T), dtype=np.bool_), k=1) - select_mask = np.tile(select_mask, (B, model_config["num_heads"], 1, 1)) - - for attention in attentions: - # check shape - self.assertEqual( - attention.shape, (B, model_config["num_heads"], T * 3, T * 3) - ) - # check the upper triangular masking - assert np.allclose( - attention[select_mask], 0.0 - ), "masked elements should be zero." - # check that the non-masked elements have non 0 scores - # Note: it is very unlikely that randomly initialized weights will make - # one of the scores be 0, as these scores are probabilities. - assert not np.any( - np.isclose(attention[np.logical_not(select_mask)], 0.0) - ), "non masked elements should be nonzero." 
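For readers skimming this removed test: the `T * 3` attention dimensions above reflect DT's interleaving of return, observation, and action tokens (three per timestep), and the causal mask the test checks is the strict upper triangle over that flattened token axis. A minimal standalone sketch of that mask shape, using plain NumPy and none of the removed model code:

```python
import numpy as np

T = 2           # timesteps in the context window
tokens = 3 * T  # one (return, obs, action) token triple per timestep

# Strict upper triangle marks positions a causal token must NOT attend to.
causal_pad = np.triu(np.ones((tokens, tokens), dtype=bool), k=1)
print(causal_pad.astype(int))
# [[0 1 1 1 1 1]
#  [0 0 1 1 1 1]
#  [0 0 0 1 1 1]
#  [0 0 0 0 1 1]
#  [0 0 0 0 0 1]
#  [0 0 0 0 0 0]]
# The assertions above check that attention weights are ~0 exactly at these
# True positions and nonzero elsewhere.
```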
- - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/dt/tests/test_dt_policy.py b/rllib_contrib/dt/tests/test_dt_policy.py deleted file mode 100644 index 922be2ba21a4c..0000000000000 --- a/rllib_contrib/dt/tests/test_dt_policy.py +++ /dev/null @@ -1,513 +0,0 @@ -import unittest -from typing import Dict - -import gymnasium as gym -import numpy as np -from rllib_dt.dt.dt_torch_policy import DTTorchPolicy - -import ray -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.framework import try_import_tf, try_import_torch - -tf1, tf, tfv = try_import_tf() -torch, nn = try_import_torch() - - -def _default_config(): - """Base config to use.""" - return { - "model": { - "max_seq_len": 4, - }, - "embed_dim": 32, - "num_layers": 2, - "horizon": 10, - "num_heads": 2, - "embed_pdrop": 0.1, - "resid_pdrop": 0.1, - "attn_pdrop": 0.1, - "framework": "torch", - "lr": 1e-3, - "lr_schedule": None, - "optimizer": { - "weight_decay": 1e-4, - "betas": [0.9, 0.99], - }, - "target_return": 200.0, - "loss_coef_actions": 1.0, - "loss_coef_obs": 0, - "loss_coef_returns_to_go": 0, - "num_gpus": 0, - "_fake_gpus": None, - "_enable_new_api_stack": False, - } - - -def _assert_input_dict_equals(d1: Dict[str, np.ndarray], d2: Dict[str, np.ndarray]): - for key in d1.keys(): - assert key in d2.keys() - - for key in d2.keys(): - assert key in d1.keys() - - for key in d1.keys(): - assert isinstance(d1[key], np.ndarray), "input_dict should only be numpy array." - assert isinstance(d2[key], np.ndarray), "input_dict should only be numpy array." - assert d1[key].shape == d2[key].shape, "input_dict are of different shape." - assert np.allclose(d1[key], d2[key]), "input_dict values are not equal." - - -class TestDTPolicy(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_torch_postprocess_trajectory(self): - """Test postprocess_trajectory""" - config = _default_config() - - observation_space = gym.spaces.Box(-1.0, 1.0, shape=(4,)) - action_space = gym.spaces.Box(-1.0, 1.0, shape=(3,)) - - # Create policy - policy = DTTorchPolicy(observation_space, action_space, config) - - # Generate input_dict with some data - sample_batch = SampleBatch( - { - SampleBatch.REWARDS: np.array([1.0, 2.0, 1.0, 1.0]), - SampleBatch.EPS_ID: np.array([0, 0, 0, 0]), - } - ) - - # Do postprocess trajectory to calculate rtg. - sample_batch = policy.postprocess_trajectory(sample_batch) - - # Assert that terminateds and truncateds are correctly set. - assert ( - SampleBatch.TERMINATEDS in sample_batch - ), "`terminateds` isn't part of the batch." - assert ( - SampleBatch.TRUNCATEDS not in sample_batch - ), "`truncateds` shouldn't be part of the batch (in this particular test case)." - assert np.allclose( - sample_batch[SampleBatch.TERMINATEDS], - np.array([False, False, False, True]), - ), "`terminateds` isn't set correctly." - - def test_torch_input_dict(self): - """Test inference input_dict methods - - This is a minimal version the test in test_dt.py. - The shapes of the input_dict might be confusing but it makes sense in - context of what the function is supposed to do. - Check action_distribution_fn for an explanation. 
- """ - config = _default_config() - - observation_space = gym.spaces.Box(-1.0, 1.0, shape=(3,)) - action_spaces = [ - gym.spaces.Box(-1.0, 1.0, shape=(1,)), - gym.spaces.Discrete(4), - ] - - for action_space in action_spaces: - # Create policy - policy = DTTorchPolicy(observation_space, action_space, config) - - # initial obs and input_dict - obs = np.array([0.0, 1.0, 2.0]) - input_dict = policy.get_initial_input_dict(obs) - - # Check input_dict matches what it should be - target_input_dict = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 1.0, 2.0], - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: ( - np.array([[0.0], [0.0], [0.0]], dtype=np.float32) - if isinstance(action_space, gym.spaces.Box) - else np.array([0, 0, 0], dtype=np.int32) - ), - SampleBatch.RETURNS_TO_GO: np.array( - [0.0, 0.0, 0.0], dtype=np.float32 - ), - SampleBatch.REWARDS: np.zeros((), dtype=np.float32), - SampleBatch.T: np.array([-1, -1, -1], dtype=np.int32), - } - ) - _assert_input_dict_equals(input_dict, target_input_dict) - - # Get next input_dict - input_dict = policy.get_next_input_dict( - input_dict, - action=( - np.asarray([1.0], dtype=np.float32) - if isinstance(action_space, gym.spaces.Box) - else np.asarray(1, dtype=np.int32) - ), - reward=1.0, - next_obs=np.array([3.0, 4.0, 5.0]), - extra={ - SampleBatch.RETURNS_TO_GO: config["target_return"], - }, - ) - - # Check input_dict matches what it should be - target_input_dict = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 1.0, 2.0], - [3.0, 4.0, 5.0], - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: ( - np.array([[0.0], [0.0], [1.0]], dtype=np.float32) - if isinstance(action_space, gym.spaces.Box) - else np.array([0, 0, 1], dtype=np.int32) - ), - SampleBatch.RETURNS_TO_GO: np.array( - [0.0, 0.0, config["target_return"]], dtype=np.float32 - ), - SampleBatch.REWARDS: np.asarray(1.0, dtype=np.float32), - SampleBatch.T: np.array([-1, -1, 0], dtype=np.int32), - } - ) - _assert_input_dict_equals(input_dict, target_input_dict) - - def test_torch_action(self): - """Test policy's action_distribution_fn and extra_action_out methods by - calling compute_actions_from_input_dict which works those two methods - in conjunction. - """ - config = _default_config() - - observation_space = gym.spaces.Box(-1.0, 1.0, shape=(3,)) - action_spaces = [ - gym.spaces.Box(-1.0, 1.0, shape=(1,)), - gym.spaces.Discrete(4), - ] - - for action_space in action_spaces: - # Create policy - policy = DTTorchPolicy(observation_space, action_space, config) - - # input_dict for initial observation - input_dict = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [ - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 1.0, 2.0], - ] - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: ( - np.array([[[0.0], [0.0], [0.0]]], dtype=np.float32) - if isinstance(action_space, gym.spaces.Box) - else np.array([[0, 0, 0]], dtype=np.int32) - ), - SampleBatch.RETURNS_TO_GO: np.array( - [[0.0, 0.0, 0.0]], dtype=np.float32 - ), - SampleBatch.REWARDS: np.array([0.0], dtype=np.float32), - SampleBatch.T: np.array([[-1, -1, -1]], dtype=np.int32), - } - ) - - # Run compute_actions_from_input_dict - actions, _, extras = policy.compute_actions_from_input_dict( - input_dict, - explore=False, - timestep=None, - ) - - # Check actions - assert actions.shape == ( - 1, - *action_space.shape, - ), "actions has incorrect shape." 
- - # Check extras - assert ( - SampleBatch.RETURNS_TO_GO in extras - ), "extras should contain returns_to_go." - assert extras[SampleBatch.RETURNS_TO_GO].shape == ( - 1, - ), "extras['returns_to_go'] has incorrect shape." - assert np.isclose( - extras[SampleBatch.RETURNS_TO_GO], - np.asarray([config["target_return"]], dtype=np.float32), - ), "extras['returns_to_go'] should contain target_return." - - # input_dict for non-initial observation - input_dict = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [ - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 1.0, 2.0], - [3.0, 4.0, 5.0], - ] - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: ( - np.array([[[0.0], [0.0], [1.0]]], dtype=np.float32) - if isinstance(action_space, gym.spaces.Box) - else np.array([[0, 0, 1]], dtype=np.int32) - ), - SampleBatch.RETURNS_TO_GO: np.array( - [[0.0, 0.0, config["target_return"]]], dtype=np.float32 - ), - SampleBatch.REWARDS: np.array([10.0], dtype=np.float32), - SampleBatch.T: np.array([[-1, -1, 0]], dtype=np.int32), - } - ) - - # Run compute_actions_from_input_dict - actions, _, extras = policy.compute_actions_from_input_dict( - input_dict, - explore=False, - timestep=None, - ) - - # Check actions - assert actions.shape == ( - 1, - *action_space.shape, - ), "actions has incorrect shape." - - # Check extras - assert ( - SampleBatch.RETURNS_TO_GO in extras - ), "extras should contain returns_to_go." - assert extras[SampleBatch.RETURNS_TO_GO].shape == ( - 1, - ), "extras['returns_to_go'] has incorrect shape." - assert np.isclose( - extras[SampleBatch.RETURNS_TO_GO], - np.asarray([config["target_return"] - 10.0], dtype=np.float32), - ), "extras['returns_to_go'] should contain target_return." - - def test_loss(self): - """Test loss function.""" - config = _default_config() - config["embed_pdrop"] = 0 - config["resid_pdrop"] = 0 - config["attn_pdrop"] = 0 - - observation_space = gym.spaces.Box(-1.0, 1.0, shape=(3,)) - action_spaces = [ - gym.spaces.Box(-1.0, 1.0, shape=(1,)), - gym.spaces.Discrete(4), - ] - - for action_space in action_spaces: - # Create policy - policy = DTTorchPolicy(observation_space, action_space, config) - - # Run loss functions on batches with different items in the mask to make - # sure the masks are working and making the loss the same. 
- batch1 = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [ - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 1.0, 2.0], - [3.0, 4.0, 5.0], - ] - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: ( - np.array([[[0.0], [0.0], [1.0], [0.5]]], dtype=np.float32) - if isinstance(action_space, gym.spaces.Box) - else np.array([[0, 0, 1, 3]], dtype=np.int64) - ), - SampleBatch.RETURNS_TO_GO: np.array( - [[[0.0], [0.0], [100.0], [90.0], [80.0]]], dtype=np.float32 - ), - SampleBatch.T: np.array([[0, 0, 0, 1]], dtype=np.int32), - SampleBatch.ATTENTION_MASKS: np.array( - [[0.0, 0.0, 1.0, 1.0]], dtype=np.float32 - ), - } - ) - - batch2 = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [ - [1.0, 1.0, -1.0], - [1.0, 10.0, 12.0], - [0.0, 1.0, 2.0], - [3.0, 4.0, 5.0], - ] - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: ( - np.array([[[1.0], [-0.5], [1.0], [0.5]]], dtype=np.float32) - if isinstance(action_space, gym.spaces.Box) - else np.array([[2, 1, 1, 3]], dtype=np.int64) - ), - SampleBatch.RETURNS_TO_GO: np.array( - [[[200.0], [-10.0], [100.0], [90.0], [80.0]]], dtype=np.float32 - ), - SampleBatch.T: np.array([[9, 3, 0, 1]], dtype=np.int32), - SampleBatch.ATTENTION_MASKS: np.array( - [[0.0, 0.0, 1.0, 1.0]], dtype=np.float32 - ), - } - ) - - loss1 = policy.loss(policy.model, policy.dist_class, batch1) - loss2 = policy.loss(policy.model, policy.dist_class, batch2) - - loss1 = loss1.detach().cpu().item() - loss2 = loss2.detach().cpu().item() - - assert np.isclose(loss1, loss2), "Masks are not working for losses." - - # Run loss on a widely different batch and make sure the loss is different. - batch3 = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [ - [1.0, 1.0, -20.0], - [0.1, 10.0, 12.0], - [1.4, 12.0, -9.0], - [6.0, 40.0, -2.0], - ] - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: ( - np.array([[[2.0], [-1.5], [0.2], [0.1]]], dtype=np.float32) - if isinstance(action_space, gym.spaces.Box) - else np.array([[1, 3, 0, 2]], dtype=np.int64) - ), - SampleBatch.RETURNS_TO_GO: np.array( - [[[90.0], [80.0], [70.0], [60.0], [50.0]]], dtype=np.float32 - ), - SampleBatch.T: np.array([[3, 4, 5, 6]], dtype=np.int32), - SampleBatch.ATTENTION_MASKS: np.array( - [[1.0, 1.0, 1.0, 1.0]], dtype=np.float32 - ), - } - ) - - loss3 = policy.loss(policy.model, policy.dist_class, batch3) - loss3 = loss3.detach().cpu().item() - - assert not np.isclose( - loss1, loss3 - ), "Widely different inputs are giving the same loss value." 
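The two-batch equality check above works because the removed DT torch policy multiplies per-element squared errors by the attention mask before averaging (the `F.mse_loss(..., reduction="none")` helper at the start of this hunk, which also broadcasts the mask over trailing prediction dimensions), so values at masked positions cannot move the scalar loss. A tiny self-contained illustration of that property with hypothetical tensors, not the removed policy code:

```python
import torch
import torch.nn.functional as F

def masked_mse(preds, targets, masks):
    # Same idea as the removed loss helper: zero out masked positions,
    # then reduce to a scalar.
    losses = F.mse_loss(preds, targets, reduction="none")
    return (losses * masks).mean()

masks = torch.tensor([0.0, 0.0, 1.0, 1.0])
targets = torch.tensor([1.0, 2.0, 3.0, 4.0])
preds_a = torch.tensor([9.0, -9.0, 3.5, 4.5])   # differs from preds_b only at masked slots
preds_b = torch.tensor([0.0, 100.0, 3.5, 4.5])

assert torch.isclose(masked_mse(preds_a, targets, masks),
                     masked_mse(preds_b, targets, masks))
```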
- - def test_loss_coef(self): - """Test the loss_coef_{key} config options.""" - - config = _default_config() - config["embed_pdrop"] = 0 - config["resid_pdrop"] = 0 - config["attn_pdrop"] = 0 - # set initial action coef to 0 - config["loss_coef_actions"] = 0 - - observation_space = gym.spaces.Box(-1.0, 1.0, shape=(3,)) - action_spaces = [ - gym.spaces.Box(-1.0, 1.0, shape=(1,)), - gym.spaces.Discrete(4), - ] - - for action_space in action_spaces: - batch = SampleBatch( - { - SampleBatch.OBS: np.array( - [ - [ - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 1.0, 2.0], - [3.0, 4.0, 5.0], - ] - ], - dtype=np.float32, - ), - SampleBatch.ACTIONS: ( - np.array([[[0.0], [0.0], [1.0], [0.5]]], dtype=np.float32) - if isinstance(action_space, gym.spaces.Box) - else np.array([[0, 0, 1, 3]], dtype=np.int64) - ), - SampleBatch.RETURNS_TO_GO: np.array( - [[[0.0], [0.0], [100.0], [90.0], [80.0]]], dtype=np.float32 - ), - SampleBatch.T: np.array([[0, 0, 0, 1]], dtype=np.int32), - SampleBatch.ATTENTION_MASKS: np.array( - [[0.0, 0.0, 1.0, 1.0]], dtype=np.float32 - ), - } - ) - - keys = [SampleBatch.ACTIONS, SampleBatch.OBS, SampleBatch.RETURNS_TO_GO] - for key in keys: - # create policy and run loss with different coefs - # create policy 1 with coef = 1 - config1 = config.copy() - config1[f"loss_coef_{key}"] = 1.0 - policy1 = DTTorchPolicy(observation_space, action_space, config1) - - loss1 = policy1.loss(policy1.model, policy1.dist_class, batch) - loss1 = loss1.detach().cpu().item() - - # create policy 2 with coef = 10 - config2 = config.copy() - config2[f"loss_coef_{key}"] = 10.0 - policy2 = DTTorchPolicy(observation_space, action_space, config2) - # Copy the weights over so they output the same loss without scaling. - policy2.set_weights(policy1.get_weights()) - - loss2 = policy2.loss(policy2.model, policy2.dist_class, batch) - loss2 = loss2.detach().cpu().item() - - # Compare loss, should be factor of 10 difference. - self.assertAlmostEqual( - loss2 / loss1, - 10.0, - places=3, - msg="the two losses should be different to a factor of 10.", - ) - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/dt/tests/test_segmentation_buffer.py b/rllib_contrib/dt/tests/test_segmentation_buffer.py deleted file mode 100644 index c9036e28ff9a8..0000000000000 --- a/rllib_contrib/dt/tests/test_segmentation_buffer.py +++ /dev/null @@ -1,421 +0,0 @@ -import unittest -from typing import List, Union - -import numpy as np -from rllib_dt.dt.segmentation_buffer import ( - MultiAgentSegmentationBuffer, - SegmentationBuffer, -) - -import ray -from ray.rllib.policy.sample_batch import ( - DEFAULT_POLICY_ID, - MultiAgentBatch, - SampleBatch, - concat_samples, -) -from ray.rllib.utils import test_utils -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.typing import PolicyID - -tf1, tf, tfv = try_import_tf() -torch, _ = try_import_torch() - - -def _generate_episode_batch(ep_len, eps_id, obs_dim=8, act_dim=3): - """Generate a batch containing one episode.""" - # These values are not actually correct as usual. But using eps_id - # as the values allow us to identify them in the tests. 
- batch = SampleBatch( - { - SampleBatch.OBS: np.full((ep_len, obs_dim), eps_id, dtype=np.float32), - SampleBatch.ACTIONS: np.full( - (ep_len, act_dim), eps_id + 100, dtype=np.float32 - ), - SampleBatch.REWARDS: np.ones((ep_len,), dtype=np.float32), - SampleBatch.RETURNS_TO_GO: np.arange( - ep_len, -1, -1, dtype=np.float32 - ).reshape((ep_len + 1, 1)), - SampleBatch.EPS_ID: np.full((ep_len,), eps_id, dtype=np.int32), - SampleBatch.T: np.arange(ep_len, dtype=np.int32), - SampleBatch.ATTENTION_MASKS: np.ones(ep_len, dtype=np.float32), - SampleBatch.TERMINATEDS: np.array([False] * (ep_len - 1) + [True]), - SampleBatch.TRUNCATEDS: np.array([False] * ep_len), - } - ) - return batch - - -def _assert_sample_batch_keys(batch: SampleBatch): - """Assert sampled batch has the requisite keys.""" - assert SampleBatch.OBS in batch - assert SampleBatch.ACTIONS in batch - assert SampleBatch.RETURNS_TO_GO in batch - assert SampleBatch.T in batch - assert SampleBatch.ATTENTION_MASKS in batch - - -def _assert_sample_batch_not_equal(b1: SampleBatch, b2: SampleBatch): - """Assert that the two batches are not equal.""" - for key in b1.keys() & b2.keys(): - if b1[key].shape == b2[key].shape: - assert not np.allclose( - b1[key], b2[key] - ), f"Key {key} contain the same value when they should not." - - -def _assert_is_segment(segment: SampleBatch, episode: SampleBatch): - """Assert that the sampled segment is a segment of episode.""" - timesteps = segment[SampleBatch.T] - masks = segment[SampleBatch.ATTENTION_MASKS] > 0.5 - seq_len = timesteps.shape[0] - episode_segment = episode.slice(timesteps[0], timesteps[-1] + 1) - assert np.allclose( - segment[SampleBatch.OBS][masks], episode_segment[SampleBatch.OBS] - ) - assert np.allclose( - segment[SampleBatch.ACTIONS][masks], episode_segment[SampleBatch.ACTIONS] - ) - assert np.allclose( - segment[SampleBatch.RETURNS_TO_GO][:seq_len][masks], - episode_segment[SampleBatch.RETURNS_TO_GO], - ) - - -def _get_internal_buffer( - buffer: Union[SegmentationBuffer, MultiAgentSegmentationBuffer], - policy_id: PolicyID = DEFAULT_POLICY_ID, -) -> List[SampleBatch]: - """Get the internal buffer list from the buffer. If MultiAgent then return the - internal buffer corresponding to the given policy_id. - """ - if type(buffer) == SegmentationBuffer: - return buffer._buffer - elif type(buffer) == MultiAgentSegmentationBuffer: - return buffer.buffers[policy_id]._buffer - else: - raise NotImplementedError - - -def _as_sample_batch( - batch: Union[SampleBatch, MultiAgentBatch], - policy_id: PolicyID = DEFAULT_POLICY_ID, -) -> SampleBatch: - """Returns a SampleBatch. If MultiAgentBatch then return the SampleBatch - corresponding to the given policy_id. 
- """ - if type(batch) == SampleBatch: - return batch - elif type(batch) == MultiAgentBatch: - return batch.policy_batches[policy_id] - else: - raise NotImplementedError - - -class TestSegmentationBuffer(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_add(self): - """Test adding to segmentation buffer.""" - for buffer_cls in [SegmentationBuffer, MultiAgentSegmentationBuffer]: - max_seq_len = 3 - max_ep_len = 10 - capacity = 1 - buffer = buffer_cls(capacity, max_seq_len, max_ep_len) - - # generate batch - episode_batches = [] - for i in range(4): - episode_batches.append(_generate_episode_batch(max_ep_len, i)) - batch = concat_samples(episode_batches) - - # add to buffer and check that only last one is kept (due to replacement) - buffer.add(batch) - - self.assertEqual( - len(_get_internal_buffer(buffer)), - 1, - "The internal buffer should only contain one SampleBatch since" - " the capacity is 1.", - ) - test_utils.check(episode_batches[-1], _get_internal_buffer(buffer)[0]) - - # add again - buffer.add(episode_batches[0]) - - test_utils.check(episode_batches[0], _get_internal_buffer(buffer)[0]) - - # make buffer of enough capacity - capacity = len(episode_batches) - buffer = buffer_cls(capacity, max_seq_len, max_ep_len) - - # add to buffer and make sure all are in - buffer.add(batch) - self.assertEqual( - len(_get_internal_buffer(buffer)), - len(episode_batches), - "internal buffer doesn't have the right number of episodes.", - ) - for i in range(len(episode_batches)): - test_utils.check(episode_batches[i], _get_internal_buffer(buffer)[i]) - - # add another one and make sure it replaced one of them - new_batch = _generate_episode_batch(max_ep_len, 12345) - buffer.add(new_batch) - self.assertEqual( - len(_get_internal_buffer(buffer)), - len(episode_batches), - "internal buffer doesn't have the right number of episodes.", - ) - found = False - for episode_batch in _get_internal_buffer(buffer): - if episode_batch[SampleBatch.EPS_ID][0] == 12345: - test_utils.check(episode_batch, new_batch) - found = True - break - assert found, "new_batch not added to buffer." 
- - # test that adding too long an episode errors - long_batch = _generate_episode_batch(max_ep_len + 1, 123) - with self.assertRaises(ValueError): - buffer.add(long_batch) - - def test_sample_basic(self): - """Test sampling from a segmentation buffer.""" - for buffer_cls in (SegmentationBuffer, MultiAgentSegmentationBuffer): - max_seq_len = 5 - max_ep_len = 15 - capacity = 4 - obs_dim = 10 - act_dim = 2 - - buffer = buffer_cls(capacity, max_seq_len, max_ep_len) - - # generate batch and add to buffer - episode_batches = [] - for i in range(8): - episode_batches.append( - _generate_episode_batch(max_ep_len, i, obs_dim, act_dim) - ) - batch = concat_samples(episode_batches) - buffer.add(batch) - - # sample a few times and check shape - for bs in range(10, 20): - batch = _as_sample_batch(buffer.sample(bs)) - # check the keys exist - _assert_sample_batch_keys(batch) - - # check the shapes - self.assertEquals( - batch[SampleBatch.OBS].shape, (bs, max_seq_len, obs_dim) - ) - self.assertEquals( - batch[SampleBatch.ACTIONS].shape, (bs, max_seq_len, act_dim) - ) - self.assertEquals( - batch[SampleBatch.RETURNS_TO_GO].shape, - ( - bs, - max_seq_len + 1, - 1, - ), - ) - self.assertEquals(batch[SampleBatch.T].shape, (bs, max_seq_len)) - self.assertEquals( - batch[SampleBatch.ATTENTION_MASKS].shape, (bs, max_seq_len) - ) - - def test_sample_content(self): - """Test that the content of the sampling are valid.""" - for buffer_cls in (SegmentationBuffer, MultiAgentSegmentationBuffer): - max_seq_len = 5 - max_ep_len = 200 - capacity = 1 - obs_dim = 11 - act_dim = 1 - - buffer = buffer_cls(capacity, max_seq_len, max_ep_len) - - # generate single episode and add to buffer - episode = _generate_episode_batch(max_ep_len, 123, obs_dim, act_dim) - buffer.add(episode) - - # sample twice and make sure they are not equal. - # with a 200 max_ep_len and 200 samples, the probability that the two - # samples are equal by chance is (1/200)**200 which is basically zero. - sample1 = _as_sample_batch(buffer.sample(200)) - sample2 = _as_sample_batch(buffer.sample(200)) - _assert_sample_batch_keys(sample1) - _assert_sample_batch_keys(sample2) - _assert_sample_batch_not_equal(sample1, sample2) - - # sample and make sure the segments are actual segments of the episode - batch = _as_sample_batch(buffer.sample(1000)) - _assert_sample_batch_keys(batch) - for elem in batch.rows(): - _assert_is_segment(SampleBatch(elem), episode) - - def test_sample_capacity(self): - """Test that sampling from buffer of capacity > 1 works.""" - for buffer_cls in (SegmentationBuffer, MultiAgentSegmentationBuffer): - max_seq_len = 3 - max_ep_len = 10 - capacity = 100 - obs_dim = 1 - act_dim = 1 - - buffer = buffer_cls(capacity, max_seq_len, max_ep_len) - - # Generate batch and add to buffer - episode_batches = [] - for i in range(capacity): - episode_batches.append( - _generate_episode_batch(max_ep_len, i, obs_dim, act_dim) - ) - buffer.add(concat_samples(episode_batches)) - - # Sample 100 times and check that samples are from at least 2 different - # episodes. The [robability of all sampling from 1 episode by chance is - # (1/100)**99 which is basically zero. - batch = _as_sample_batch(buffer.sample(100)) - eps_ids = set() - for i in range(100): - # obs generated by _generate_episode_batch contains eps_id - # use -1 because there might be front padding - eps_id = int(batch[SampleBatch.OBS][i, -1, 0]) - eps_ids.add(eps_id) - - self.assertGreater( - len(eps_ids), 1, "buffer.sample is always returning the same episode." 
- ) - - def test_padding(self): - """Test that sample will front pad segments.""" - for buffer_cls in (SegmentationBuffer, MultiAgentSegmentationBuffer): - max_seq_len = 10 - max_ep_len = 100 - capacity = 1 - obs_dim = 3 - act_dim = 2 - - buffer = buffer_cls(capacity, max_seq_len, max_ep_len) - - for ep_len in range(1, max_seq_len): - # generate batch with episode lengths that are shorter than - # max_seq_len to test padding. - batch = _generate_episode_batch(ep_len, 123, obs_dim, act_dim) - buffer.add(batch) - - samples = _as_sample_batch(buffer.sample(50)) - for i in range(50): - # calculate number of pads based on the attention mask. - num_pad = int( - ep_len - samples[SampleBatch.ATTENTION_MASKS][i].sum() - ) - for key in samples.keys(): - # make sure padding are added. - assert np.allclose( - samples[key][i, :num_pad], 0.0 - ), "samples were not padded correctly." - - def test_multi_agent(self): - max_seq_len = 5 - max_ep_len = 20 - capacity = 10 - obs_dim = 3 - act_dim = 5 - - ma_buffer = MultiAgentSegmentationBuffer(capacity, max_seq_len, max_ep_len) - - policy_id1 = "1" - policy_id2 = "2" - policy_id3 = "3" - policy_ids = {policy_id1, policy_id2, policy_id3} - - policy1_batches = [] - for i in range(0, 10): - policy1_batches.append( - _generate_episode_batch(max_ep_len, i, obs_dim, act_dim) - ) - policy2_batches = [] - for i in range(10, 20): - policy2_batches.append( - _generate_episode_batch(max_ep_len, i, obs_dim, act_dim) - ) - policy3_batches = [] - for i in range(20, 30): - policy3_batches.append( - _generate_episode_batch(max_ep_len, i, obs_dim, act_dim) - ) - - batches_mapping = { - policy_id1: policy1_batches, - policy_id2: policy2_batches, - policy_id3: policy3_batches, - } - - ma_batch = MultiAgentBatch( - { - policy_id1: concat_samples(policy1_batches), - policy_id2: concat_samples(policy2_batches), - policy_id3: concat_samples(policy3_batches), - }, - max_ep_len * 10, - ) - - ma_buffer.add(ma_batch) - - # check all are added properly - for policy_id in policy_ids: - assert policy_id in ma_buffer.buffers.keys() - - for policy_id, buffer in ma_buffer.buffers.items(): - assert policy_id in policy_ids - for i in range(10): - test_utils.check( - batches_mapping[policy_id][i], _get_internal_buffer(buffer)[i] - ) - - # check that sampling are proper - for _ in range(50): - ma_sample = ma_buffer.sample(100) - for policy_id in policy_ids: - assert policy_id in ma_sample.policy_batches.keys() - - for policy_id, batch in ma_sample.policy_batches.items(): - eps_id_start = (int(policy_id) - 1) * 10 - eps_id_end = eps_id_start + 10 - - _assert_sample_batch_keys(batch) - - for i in range(100): - # Obs generated by _generate_episode_batch contains eps_id. - # Use -1 index because there might be front padding - eps_id = int(batch[SampleBatch.OBS][i, -1, 0]) - assert ( - eps_id_start <= eps_id < eps_id_end - ), "batch within multi agent batch has the wrong agent's episode." 
- - # sample twice and make sure they are not equal (probability equal almost zero) - ma_sample1 = ma_buffer.sample(200) - ma_sample2 = ma_buffer.sample(200) - for policy_id in policy_ids: - _assert_sample_batch_not_equal( - ma_sample1.policy_batches[policy_id], - ma_sample2.policy_batches[policy_id], - ) - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/dt/tuned_examples/__init__.py b/rllib_contrib/dt/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/dt/tuned_examples/cartpole-v1-dt.yaml b/rllib_contrib/dt/tuned_examples/cartpole-v1-dt.yaml deleted file mode 100644 index ba1632d4b43f5..0000000000000 --- a/rllib_contrib/dt/tuned_examples/cartpole-v1-dt.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole_dt: - env: CartPole-v1 - run: DT - stop: - evaluation/sampler_results/episode_reward_mean: 200 - training_iteration: 100 - config: - input: dataset - input_config: - paths: s3://anonymous@air-example-data/rllib/cartpole/large.json - format: json - num_workers: 3 - actions_in_input_normalized: True - clip_actions: False - # training - framework: torch - train_batch_size: 512 - min_train_timesteps_per_iteration: 5000 - target_return: 200 - lr: 0.01 - optimizer: - weight_decay: 0.1 - betas: [0.9, 0.999] - replay_buffer_config: - capacity: 20 - # model - model: - max_seq_len: 3 - num_layers: 1 - num_heads: 1 - embed_dim: 64 - # Episode horizon: Must match environment's time limit, if any. - horizon: 500 - # evaluation - evaluation_config: - explore: False - input: sampler - evaluation_duration: 10 - evaluation_duration_unit: episodes - evaluation_interval: 1 - evaluation_num_workers: 1 - evaluation_parallel_to_training: True diff --git a/rllib_contrib/dt/tuned_examples/pendulum-v1-dt.yaml b/rllib_contrib/dt/tuned_examples/pendulum-v1-dt.yaml deleted file mode 100644 index 5d6bef2df96b3..0000000000000 --- a/rllib_contrib/dt/tuned_examples/pendulum-v1-dt.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -pendulum_dt: - env: Pendulum-v1 - run: DT - stop: - evaluation/sampler_results/episode_reward_mean: -300 - timesteps_total: 20000000 - config: - input: dataset - input_config: - # Get this from: https://github.com/ray-project/ray/raw/releases/2.5.1/rllib/tests/data/pendulum/pendulum_expert_sac_50eps.zip - paths: rllib_contrib/dt/tuned_examples/pendulum_expert_sac_50eps.zip - format: json - num_workers: 3 - actions_in_input_normalized: True - clip_actions: True - normalize_actions: True - # training - framework: torch - train_batch_size: 512 - min_train_timesteps_per_iteration: 5000 - target_return: -120.0 - lr: 0.0 - lr_schedule: [[0, 0.0], [10000, 0.01]] - grad_clip: 1.0 - optimizer: - weight_decay: 0.1 - betas: [0.9, 0.999] - replay_buffer_config: - capacity: 20 - # model - model: - max_seq_len: 3 - num_layers: 1 - num_heads: 1 - embed_dim: 64 - # rollout - horizon: 200 - # evaluation - evaluation_config: - explore: False - input: sampler - evaluation_duration: 10 - evaluation_duration_unit: episodes - evaluation_interval: 1 - evaluation_num_workers: 1 - evaluation_parallel_to_training: True \ No newline at end of file diff --git a/rllib_contrib/dt/tuned_examples/pendulum-v1-medium-expert-dt.yaml b/rllib_contrib/dt/tuned_examples/pendulum-v1-medium-expert-dt.yaml deleted file mode 100644 index dd08c88c3902f..0000000000000 
--- a/rllib_contrib/dt/tuned_examples/pendulum-v1-medium-expert-dt.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -pendulum_medium_expert_dt: - env: Pendulum-v1 - run: DT - stop: - # We could make this higher, but given that we have 4 cpus for our tests, we will have to settle for -350. - evaluation/sampler_results/episode_reward_mean: -350 - timesteps_total: 20000000 - config: - input: dataset - input_config: - paths: - # Get these from: https://github.com/ray-project/ray/raw/releases/2.5.1/rllib/tests/data/pendulum/pendulum_[expert|medium]_sac_50eps.zip - - rllib_contrib/dt/tuned_examples/pendulum_expert_sac_50eps.zip - - rllib_contrib/dt/tuned_examples/pendulum_medium_sac_50eps.zip - format: json - num_workers: 3 - actions_in_input_normalized: True - clip_actions: True - normalize_actions: True - # training - framework: torch - train_batch_size: 512 - min_train_timesteps_per_iteration: 5000 - target_return: -120.0 - lr: 0.0 - lr_schedule: [[0, 0.0], [100000, 0.01]] - grad_clip: 1.0 - optimizer: - weight_decay: 0.1 - betas: [0.9, 0.999] - replay_buffer_config: - capacity: 20 - # model - model: - max_seq_len: 3 - num_layers: 1 - num_heads: 1 - embed_dim: 64 - # rollout - horizon: 200 - # evaluation - evaluation_config: - explore: False - input: sampler - evaluation_duration: 10 - evaluation_duration_unit: episodes - evaluation_interval: 1 - evaluation_num_workers: 1 - evaluation_parallel_to_training: True \ No newline at end of file diff --git a/rllib_contrib/es/BUILD b/rllib_contrib/es/BUILD deleted file mode 100644 index c16acee8b5dfd..0000000000000 --- a/rllib_contrib/es/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -# Examples - -py_test( - name = "example_es_cartpole_v1", - main = "es_cartpole_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/es_cartpole_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -# Compilation Tests - -py_test( - name = "test_es", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_es.py"] -) diff --git a/rllib_contrib/es/README.md b/rllib_contrib/es/README.md deleted file mode 100644 index 2df12ec1d57b4..0000000000000 --- a/rllib_contrib/es/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# ES (Evolution Strategies) - -[ES](https://arxiv.org/abs/1703.03864) is a class of black box optimization algorithms, as an alternative to popular MDP-based RL techniques such as Q-learning and Policy Gradients. It is invariant to action frequency and delayed rewards, tolerant of extremely long horizons, and does not need temporal discounting or value function approximation. 
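To make the paragraph above concrete: the core ES recipe is to perturb the policy parameters with Gaussian noise, score each perturbation by its episode return, and move the parameters along the return-weighted average of the noise. Below is a minimal, self-contained sketch of one such update on a toy quadratic objective; it is illustrative only and not the removed `rllib_es` implementation, which additionally uses a shared noise table, antithetic +/- perturbations, and observation filtering:

```python
import numpy as np

def es_step(theta, objective, rng, sigma=0.1, lr=0.05, pop_size=50):
    """One Evolution Strategies update: evaluate Gaussian perturbations of
    theta and step along the return-weighted average of the noise."""
    noise = rng.standard_normal((pop_size, theta.size))
    returns = np.array([objective(theta + sigma * eps) for eps in noise])
    centered = returns - returns.mean()  # simple baseline for variance reduction
    grad_estimate = (centered[:, None] * noise).mean(axis=0) / sigma
    return theta + lr * grad_estimate

# Toy "return": negative squared distance to a target parameter vector.
target = np.array([1.0, -2.0, 0.5])
theta = np.zeros_like(target)
rng = np.random.default_rng(0)
for _ in range(300):
    theta = es_step(theta, lambda p: -np.sum((p - target) ** 2), rng)
print(theta)  # ends up close to [1.0, -2.0, 0.5]
```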
- - -## Installation - -``` -conda create -n rllib-es python=3.10 -conda activate rllib-es -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[ES Example]() \ No newline at end of file diff --git a/rllib_contrib/es/examples/es_cartpole_v1.py b/rllib_contrib/es/examples/es_cartpole_v1.py deleted file mode 100644 index 04cfb6aa166ee..0000000000000 --- a/rllib_contrib/es/examples/es_cartpole_v1.py +++ /dev/null @@ -1,50 +0,0 @@ -import argparse - -from rllib_es.es import ES, ESConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - ESConfig() - .rollouts(num_rollout_workers=2) - .framework("torch") - .environment("CartPole-v1") - .training(noise_size=25000000, episodes_per_batch=50) - ) - - stop_reward = 100 - - tuner = tune.Tuner( - ES, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 500000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved( - results, stop_reward, metric="sampler_results/episode_reward_mean" - ) diff --git a/rllib_contrib/es/pyproject.toml b/rllib_contrib/es/pyproject.toml deleted file mode 100644 index 57a3427d18510..0000000000000 --- a/rllib_contrib/es/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-es" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/es/requirements.txt b/rllib_contrib/es/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/es/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/es/src/rllib_es/es/__init__.py b/rllib_contrib/es/src/rllib_es/es/__init__.py deleted file mode 100644 index 80145e8c83d4f..0000000000000 --- a/rllib_contrib/es/src/rllib_es/es/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from rllib_es.es.es import ES, ESConfig -from rllib_es.es.es_tf_policy import ESTFPolicy -from rllib_es.es.es_torch_policy import ESTorchPolicy - -from ray.tune.registry import register_trainable - -__all__ = ["ES", "ESConfig", "ESTFPolicy", "ESTorchPolicy"] - -register_trainable("rllib-contrib-es", ES) diff --git a/rllib_contrib/es/src/rllib_es/es/es.py b/rllib_contrib/es/src/rllib_es/es/es.py deleted file mode 100644 index 31b5637b5664b..0000000000000 --- a/rllib_contrib/es/src/rllib_es/es/es.py +++ /dev/null @@ -1,623 +0,0 @@ -# Code in this file is copied and adapted from -# https://github.com/openai/evolution-strategies-starter. 
- -import logging -import random -import time -from collections import namedtuple -from typing import Dict, List, Optional - -import numpy as np -from rllib_es.es import optimizers, utils -from rllib_es.es.es_tf_policy import ESTFPolicy, rollout - -import ray -from ray.rllib.algorithms import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.env.env_context import EnvContext -from ray.rllib.evaluation.worker_set import WorkerSet -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils import FilterManager -from ray.rllib.utils.actor_manager import FaultAwareApply -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import Deprecated -from ray.rllib.utils.metrics import ( - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, -) -from ray.rllib.utils.torch_utils import set_torch_seed -from ray.rllib.utils.typing import PolicyID - -logger = logging.getLogger(__name__) - -Result = namedtuple( - "Result", - [ - "noise_indices", - "noisy_returns", - "sign_noisy_returns", - "noisy_lengths", - "eval_returns", - "eval_lengths", - ], -) - - -class ESConfig(AlgorithmConfig): - """Defines a configuration class from which an ES Algorithm can be built. - - Example: - >>> from ray.rllib.algorithms.es import ESConfig - >>> config = ESConfig() # doctest: +SKIP - >>> config = config.training(sgd_stepsize=0.02, report_length=20)#doctest: +SKIP - >>> config = config.resources(num_gpus=0) # doctest: +SKIP - >>> config = config.rollouts(num_rollout_workers=4) # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.es import ESConfig - >>> from ray import tune - >>> config = ESConfig() - >>> # Print out some default values. - >>> print(config.action_noise_std) # doctest: +SKIP - >>> # Update the config object. - >>> config = config.training( # doctest: +SKIP - ... rollouts_used=tune.grid_search([32, 64]), eval_prob=0.5) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "ES", - ... run_config=ray.air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - - """ - - def __init__(self): - """Initializes a ESConfig instance.""" - super().__init__(algo_class=ES) - - # fmt: off - # __sphinx_doc_begin__ - - # ES specific settings: - self.action_noise_std = 0.01 - self.l2_coeff = 0.005 - self.noise_stdev = 0.02 - self.episodes_per_batch = 1000 - self.eval_prob = 0.03 - # self.return_proc_mode = "centered_rank" # only supported return_proc_mode - self.stepsize = 0.01 - self.noise_size = 250000000 - self.report_length = 10 - self.tf_single_threaded = True - - # Override some of AlgorithmConfig's default values with ES-specific values. - self.train_batch_size = 10000 - self.num_rollout_workers = 10 - self.observation_filter = "MeanStdFilter" - - # ES will use Algorithm's evaluation WorkerSet (if evaluation_interval > 0). - # Therefore, we must be careful not to use more than 1 env per eval worker - # (would break ESPolicy's compute_single_action method) and to not do - # obs-filtering. 
- self.evaluation( - evaluation_config=AlgorithmConfig.overrides( - num_envs_per_worker=1, - observation_filter="NoFilter", - ) - ) - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). - } - # __sphinx_doc_end__ - # fmt: on - - @override(AlgorithmConfig) - def training( - self, - *, - action_noise_std: Optional[float] = NotProvided, - l2_coeff: Optional[float] = NotProvided, - noise_stdev: Optional[int] = NotProvided, - episodes_per_batch: Optional[int] = NotProvided, - eval_prob: Optional[float] = NotProvided, - # return_proc_mode: Optional[int] = NotProvided, - stepsize: Optional[float] = NotProvided, - noise_size: Optional[int] = NotProvided, - report_length: Optional[int] = NotProvided, - tf_single_threaded: Optional[bool] = NotProvided, - **kwargs, - ) -> "ESConfig": - """Sets the training related configuration. - - Args: - action_noise_std: Std. deviation to be used when adding (standard normal) - noise to computed actions. Action noise is only added, if - `compute_actions` is called with the `add_noise` arg set to True. - l2_coeff: Coefficient to multiply current weights with inside the globalg - optimizer update term. - noise_stdev: Std. deviation of parameter noise. - episodes_per_batch: Minimum number of episodes to pack into the train batch. - eval_prob: Probability of evaluating the parameter rewards. - stepsize: SGD step-size used for the Adam optimizer. - noise_size: Number of rows in the noise table (shared across workers). - Each row contains a gaussian noise value for each model parameter. - report_length: How many of the last rewards we average over. - tf_single_threaded: Whether the tf-session should be generated without any - parallelism options. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if action_noise_std is not NotProvided: - self.action_noise_std = action_noise_std - if l2_coeff is not NotProvided: - self.l2_coeff = l2_coeff - if noise_stdev is not NotProvided: - self.noise_stdev = noise_stdev - if episodes_per_batch is not NotProvided: - self.episodes_per_batch = episodes_per_batch - if eval_prob is not NotProvided: - self.eval_prob = eval_prob - # Only supported return_proc mode is "centered_rank" right now. No need to - # configure this. - # if return_proc_mode is not NotProvided: - # self.return_proc_mode = return_proc_mode - if stepsize is not NotProvided: - self.stepsize = stepsize - if noise_size is not NotProvided: - self.noise_size = noise_size - if report_length is not NotProvided: - self.report_length = report_length - if tf_single_threaded is not NotProvided: - self.tf_single_threaded = tf_single_threaded - - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call super's validation method. 
- super().validate() - - if self.num_gpus > 1: - raise ValueError("`num_gpus` > 1 not yet supported for ES!") - if self.num_rollout_workers <= 0: - raise ValueError("`num_rollout_workers` must be > 0 for ES!") - if ( - self.evaluation_config is not None - and self.evaluation_config.get("num_envs_per_worker") != 1 - ): - raise ValueError( - "`evaluation_config.num_envs_per_worker` must always be 1 for " - "ES! To parallelize evaluation, increase " - "`evaluation_num_workers` to > 1." - ) - if ( - self.evaluation_config is not None - and self.evaluation_config.get("observation_filter") != "NoFilter" - ): - raise ValueError( - "`evaluation_config.observation_filter` must always be " - "`NoFilter` for ES!" - ) - - -@ray.remote -def create_shared_noise(count): - """Create a large array of noise to be shared by all workers.""" - seed = 123 - noise = np.random.RandomState(seed).randn(count).astype(np.float32) - return noise - - -class SharedNoiseTable: - def __init__(self, noise): - self.noise = noise - assert self.noise.dtype == np.float32 - - def get(self, i, dim): - return self.noise[i : i + dim] - - def sample_index(self, dim): - return np.random.randint(0, len(self.noise) - dim + 1) - - -@ray.remote(max_restarts=-1) -class Worker(FaultAwareApply): - def __init__( - self, - config: AlgorithmConfig, - policy_params, - env_creator, - noise, - worker_index, - min_task_runtime=0.2, - ): - - # Set Python random, numpy, env, and torch/tf seeds. - seed = config.seed - if seed is not None: - # Python random module. - random.seed(seed) - # Numpy. - np.random.seed(seed) - # Torch. - if config.framework_str == "torch": - set_torch_seed(seed) - - self.min_task_runtime = min_task_runtime - self.config = config - self.config.update_from_dict(policy_params) - self.noise = SharedNoiseTable(noise) - - env_context = EnvContext(config.env_config, worker_index) - self.env = env_creator(env_context) - # Seed the env, if gym.Env. - if not hasattr(self.env, "seed"): - logger.info("Env doesn't support env.seed(): {}".format(self.env)) - # Gym.env. - else: - self.env.seed(seed) - - from ray.rllib import models - - self.preprocessor = models.ModelCatalog.get_preprocessor(self.env, config.model) - - _policy_class = get_policy_class(config) - self.policy = _policy_class( - self.env.observation_space, self.env.action_space, config.to_dict() - ) - - @property - def filters(self): - return {DEFAULT_POLICY_ID: self.policy.observation_filter} - - def sync_filters(self, new_filters): - for k in self.filters: - self.filters[k].sync(new_filters[k]) - - def get_filters(self, flush_after=False): - return_filters = {} - for k, f in self.filters.items(): - return_filters[k] = f.as_serializable() - if flush_after: - f.reset_buffer() - return return_filters - - def rollout(self, timestep_limit, add_noise=True): - rollout_rewards, rollout_fragment_length = rollout( - self.policy, self.env, timestep_limit=timestep_limit, add_noise=add_noise - ) - return rollout_rewards, rollout_fragment_length - - def do_rollouts(self, params, timestep_limit=None): - # Set the network weights. - self.policy.set_flat_weights(params) - - noise_indices, returns, sign_returns, lengths = [], [], [], [] - eval_returns, eval_lengths = [], [] - - # Perform some rollouts with noise. - task_tstart = time.time() - while ( - len(noise_indices) == 0 or time.time() - task_tstart < self.min_task_runtime - ): - - if np.random.uniform() < self.config.eval_prob: - # Do an evaluation run with no perturbation. 
- self.policy.set_flat_weights(params) - rewards, length = self.rollout(timestep_limit, add_noise=False) - eval_returns.append(rewards.sum()) - eval_lengths.append(length) - else: - # Do a regular run with parameter perturbations. - noise_index = self.noise.sample_index(self.policy.num_params) - - perturbation = self.config.noise_stdev * self.noise.get( - noise_index, self.policy.num_params - ) - - # These two sampling steps could be done in parallel on - # different actors letting us update twice as frequently. - self.policy.set_flat_weights(params + perturbation) - rewards_pos, lengths_pos = self.rollout(timestep_limit) - - self.policy.set_flat_weights(params - perturbation) - rewards_neg, lengths_neg = self.rollout(timestep_limit) - - noise_indices.append(noise_index) - returns.append([rewards_pos.sum(), rewards_neg.sum()]) - sign_returns.append( - [np.sign(rewards_pos).sum(), np.sign(rewards_neg).sum()] - ) - lengths.append([lengths_pos, lengths_neg]) - - return Result( - noise_indices=noise_indices, - noisy_returns=returns, - sign_noisy_returns=sign_returns, - noisy_lengths=lengths, - eval_returns=eval_returns, - eval_lengths=eval_lengths, - ) - - def stop(self): - """Releases all resources used by this RolloutWorker.""" - pass - - -def get_policy_class(config: AlgorithmConfig): - if config.framework_str == "torch": - from rllib_es.es.es_torch_policy import ESTorchPolicy - - policy_cls = ESTorchPolicy - else: - policy_cls = ESTFPolicy - return policy_cls - - -class ES(Algorithm): - """Large-scale implementation of Evolution Strategies in Ray.""" - - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return ESConfig() - - @override(Algorithm) - def setup(self, config): - # Setup our config: Merge the user-supplied config (which could - # be a partial config dict with the class' default). - if isinstance(config, dict): - self.config = self.get_default_config().update_from_dict(config) - - # Call super's validation method. - self.config.validate() - - # Generate the local env. - env_context = EnvContext(self.config.env_config or {}, worker_index=0) - env = self.env_creator(env_context) - - self.callbacks = self.config.callbacks_class() - - self._policy_class = get_policy_class(self.config) - self.policy = self._policy_class( - obs_space=env.observation_space, - action_space=env.action_space, - config=self.config, - ) - self.optimizer = optimizers.Adam(self.policy, self.config.stepsize) - self.report_length = self.config.report_length - - # Create the shared noise table. - logger.info("Creating shared noise table.") - noise_id = create_shared_noise.remote(self.config.noise_size) - self.noise = SharedNoiseTable(ray.get(noise_id)) - - # Create the actors. - logger.info("Creating actors.") - remote_workers = [ - Worker.remote(self.config, {}, self.env_creator, noise_id, idx + 1) - for idx in range(self.config.num_rollout_workers) - ] - self.workers = WorkerSet._from_existing( - local_worker=None, - remote_workers=remote_workers, - ) - - self.episodes_so_far = 0 - self.reward_list = [] - self.tstart = time.time() - - @override(Algorithm) - def get_policy(self, policy=DEFAULT_POLICY_ID): - if policy != DEFAULT_POLICY_ID: - raise ValueError( - "ES has no policy '{}'! 
Use {} " - "instead.".format(policy, DEFAULT_POLICY_ID) - ) - return self.policy - - @override(Algorithm) - def step(self): - config = self.config - - theta = self.policy.get_flat_weights() - assert theta.dtype == np.float32 - assert len(theta.shape) == 1 - - # Put the current policy weights in the object store. - theta_id = ray.put(theta) - # Use the actors to do rollouts. Note that we pass in the ID of the - # policy weights as these are shared. - results, num_episodes, num_timesteps = self._collect_results( - theta_id, config["episodes_per_batch"], config["train_batch_size"] - ) - # Update our sample steps counters. - self._counters[NUM_AGENT_STEPS_SAMPLED] += num_timesteps - self._counters[NUM_ENV_STEPS_SAMPLED] += num_timesteps - - all_noise_indices = [] - all_training_returns = [] - all_training_lengths = [] - all_eval_returns = [] - all_eval_lengths = [] - - # Loop over the results. - for result in results: - all_eval_returns += result.eval_returns - all_eval_lengths += result.eval_lengths - - all_noise_indices += result.noise_indices - all_training_returns += result.noisy_returns - all_training_lengths += result.noisy_lengths - - assert len(all_eval_returns) == len(all_eval_lengths) - assert ( - len(all_noise_indices) - == len(all_training_returns) - == len(all_training_lengths) - ) - - self.episodes_so_far += num_episodes - - # Assemble the results. - eval_returns = np.array(all_eval_returns) - eval_lengths = np.array(all_eval_lengths) - noise_indices = np.array(all_noise_indices) - noisy_returns = np.array(all_training_returns) - noisy_lengths = np.array(all_training_lengths) - - # Process the returns. - proc_noisy_returns = utils.compute_centered_ranks(noisy_returns) - - # Compute and take a step. - g, count = utils.batched_weighted_sum( - proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1], - (self.noise.get(index, self.policy.num_params) for index in noise_indices), - batch_size=500, - ) - g /= noisy_returns.size - assert ( - g.shape == (self.policy.num_params,) - and g.dtype == np.float32 - and count == len(noise_indices) - ) - # Compute the new weights theta. - theta, update_ratio = self.optimizer.update(-g + config["l2_coeff"] * theta) - - # Update our train steps counters. - self._counters[NUM_AGENT_STEPS_TRAINED] += num_timesteps - self._counters[NUM_ENV_STEPS_TRAINED] += num_timesteps - - # Set the new weights in the local copy of the policy. - self.policy.set_flat_weights(theta) - # Store the rewards - if len(all_eval_returns) > 0: - self.reward_list.append(np.mean(eval_returns)) - - # Bring restored workers back if necessary. - # We will sync filters right next. 
- self.workers.probe_unhealthy_workers() - - # Now sync the filters - FilterManager.synchronize( - {DEFAULT_POLICY_ID: self.policy.observation_filter}, self.workers - ) - - info = { - "weights_norm": np.square(theta).sum(), - "grad_norm": np.square(g).sum(), - "update_ratio": update_ratio, - "episodes_this_iter": noisy_lengths.size, - "episodes_so_far": self.episodes_so_far, - } - - reward_mean = np.mean(self.reward_list[-self.report_length :]) - result = { - "sampler_results": { - "episode_reward_mean": reward_mean, - "episode_len_mean": eval_lengths.mean(), - }, - "timesteps_this_iter": noisy_lengths.sum(), - "info": info, - } - - return result - - @override(Algorithm) - def restore_workers(self, workers: WorkerSet): - restored = self.workers.probe_unhealthy_workers() - if restored: - self._sync_weights_to_workers(worker_set=self.workers, worker_ids=restored) - - @override(Algorithm) - def compute_single_action(self, observation, *args, **kwargs): - action, _, _ = self.policy.compute_actions([observation], update=False) - if kwargs.get("full_fetch"): - return action[0], [], {} - return action[0] - - @Deprecated(new="compute_single_action", error=True) - def compute_action(self, observation, *args, **kwargs): - return self.compute_single_action(observation, *args, **kwargs) - - @override(Algorithm) - def _sync_weights_to_workers(self, *, worker_set=None, worker_ids=None): - # Broadcast the new policy weights to all evaluation workers. - assert worker_set is not None - logger.info("Synchronizing weights to evaluation workers.") - weights = ray.put(self.policy.get_flat_weights()) - worker_set.foreach_worker( - lambda w: w.foreach_policy( - lambda p, _: p.set_flat_weights(ray.get(weights)) - ), - local_worker=False, - remote_worker_ids=worker_ids, - ) - - @override(Algorithm) - def cleanup(self): - self.workers.stop() - - def _collect_results(self, theta_id, min_episodes, min_timesteps): - num_episodes, num_timesteps = 0, 0 - results = [] - while num_episodes < min_episodes or num_timesteps < min_timesteps: - logger.info( - "Collected {} episodes {} timesteps so far this iter".format( - num_episodes, num_timesteps - ) - ) - rollout_ids = self.workers.foreach_worker( - func=lambda w: w.do_rollouts(ray.get(theta_id)), - local_worker=False, - ) - # Get the results of the rollouts. - for result in rollout_ids: - results.append(result) - # Update the number of episodes and the number of timesteps - # keeping in mind that result.noisy_lengths is a list of lists, - # where the inner lists have length 2. 
- num_episodes += sum(len(pair) for pair in result.noisy_lengths) - num_timesteps += sum(sum(pair) for pair in result.noisy_lengths) - - return results, num_episodes, num_timesteps - - def get_weights(self, policies: Optional[List[PolicyID]] = None) -> dict: - return self.policy.get_flat_weights() - - def set_weights(self, weights: Dict[PolicyID, dict]): - self.policy.set_flat_weights(weights) - - def __getstate__(self): - return { - "weights": self.get_weights(), - "filter": self.policy.observation_filter, - "episodes_so_far": self.episodes_so_far, - } - - def __setstate__(self, state): - self.episodes_so_far = state["episodes_so_far"] - self.set_weights(state["weights"]) - self.policy.observation_filter = state["filter"] - FilterManager.synchronize( - {DEFAULT_POLICY_ID: self.policy.observation_filter}, self.workers - ) diff --git a/rllib_contrib/es/src/rllib_es/es/es_tf_policy.py b/rllib_contrib/es/src/rllib_es/es/es_tf_policy.py deleted file mode 100644 index 1014ed2c884fc..0000000000000 --- a/rllib_contrib/es/src/rllib_es/es/es_tf_policy.py +++ /dev/null @@ -1,219 +0,0 @@ -# Code in this file is copied and adapted from -# https://github.com/openai/evolution-strategies-starter. - -from typing import Optional - -import gymnasium as gym -import numpy as np -import tree # pip install dm_tree - -import ray -import ray.experimental.tf_utils -from ray.rllib.models import ModelCatalog -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils import deprecation_warning -from ray.rllib.utils.annotations import override -from ray.rllib.utils.filter import get_filter -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space, unbatch - -tf1, tf, tfv = try_import_tf() - - -def rollout( - policy: Policy, - env: gym.Env, - timestep_limit: Optional[int] = None, - add_noise: bool = False, - offset: float = 0.0, -): - """Do a rollout. - - If add_noise is True, the rollout will take noisy actions with - noise drawn from that stream. Otherwise, no action noise will be added. - - Args: - policy: RLlib Policy from which to draw actions. - env: Environment from which to draw rewards, done, and - next state. - timestep_limit: Steps after which to end the rollout. - If None, use `env.spec.max_episode_steps` or 999999. - add_noise: Indicates whether exploratory action noise should be - added. - offset: Value to subtract from the reward (e.g. survival bonus - from humanoid). 
- """ - max_timestep_limit = 999999 - env_timestep_limit = ( - env.spec.max_episode_steps - if (hasattr(env, "spec") and hasattr(env.spec, "max_episode_steps")) - else max_timestep_limit - ) - timestep_limit = ( - env_timestep_limit - if timestep_limit is None - else min(timestep_limit, env_timestep_limit) - ) - rewards = [] - t = 0 - observation, _ = env.reset() - for _ in range(timestep_limit or max_timestep_limit): - ac, _, _ = policy.compute_actions( - [observation], add_noise=add_noise, update=True - ) - ac = ac[0] - observation, r, terminated, truncated, _ = env.step(ac) - if offset != 0.0: - r -= np.abs(offset) - rewards.append(r) - t += 1 - if terminated or truncated: - break - rewards = np.array(rewards, dtype=np.float32) - return rewards, t - - -def make_session(single_threaded): - if not single_threaded: - return tf1.Session() - return tf1.Session( - config=tf1.ConfigProto( - inter_op_parallelism_threads=1, intra_op_parallelism_threads=1 - ) - ) - - -class ESTFPolicy(Policy): - def __init__(self, obs_space, action_space, config): - super().__init__(obs_space, action_space, config) - self.action_space_struct = get_base_struct_from_space(action_space) - self.action_noise_std = self.config["action_noise_std"] - self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space) - self.observation_filter = get_filter( - self.config["observation_filter"], self.preprocessor.shape - ) - if self.config["framework"] == "tf": - self.sess = make_session( - single_threaded=self.config.get("tf_single_threaded", True) - ) - - # Set graph-level seed. - if config.get("seed") is not None: - with self.sess.as_default(): - tf1.set_random_seed(config["seed"]) - - self.inputs = tf1.placeholder( - tf.float32, [None] + list(self.preprocessor.shape) - ) - else: - if not tf1.executing_eagerly(): - tf1.enable_eager_execution() - self.sess = self.inputs = None - if config.get("seed") is not None: - # Tf2.x. - if tfv == 2: - tf.random.set_seed(config["seed"]) - # Tf1.x. - else: - tf1.set_random_seed(config["seed"]) - - # Policy network. - self.dist_class, dist_dim = ModelCatalog.get_action_dist( - self.action_space, self.config["model"], dist_type="deterministic" - ) - - self.model = ModelCatalog.get_model_v2( - obs_space=self.preprocessor.observation_space, - action_space=action_space, - num_outputs=dist_dim, - model_config=self.config["model"], - ) - - self.sampler = None - if self.sess: - dist_inputs, _ = self.model({SampleBatch.CUR_OBS: self.inputs}) - dist = self.dist_class(dist_inputs, self.model) - self.sampler = dist.sample() - self.variables = ray.experimental.tf_utils.TensorFlowVariables( - dist_inputs, self.sess - ) - self.sess.run(tf1.global_variables_initializer()) - else: - self.variables = ray.experimental.tf_utils.TensorFlowVariables( - [], None, self.model.variables() - ) - - self.num_params = sum( - np.prod(variable.shape.as_list()) - for _, variable in self.variables.variables.items() - ) - - @override(Policy) - def compute_actions(self, obs_batch=None, add_noise=False, update=True, **kwargs): - if "observation" in kwargs: - assert obs_batch is None, ( - "You can not use both arguments, " - "`observation` and `obs_batch`. `observation` " - "is deprecated." - ) - deprecation_warning( - old="ESTFPolicy.compute_actions(observation=...)`", - new="ESTFPolicy.compute_actions(obs_batch=...)", - ) - obs_batch = kwargs["observation"] - else: - assert obs_batch is not None - # Squeeze batch dimension (we always calculate actions for only a - # single obs). 
- observation = obs_batch[0] - observation = self.preprocessor.transform(observation) - observation = self.observation_filter(observation[None], update=update) - # `actions` is a list of (component) batches. - # Eager mode. - if not self.sess: - dist_inputs, _ = self.model({SampleBatch.CUR_OBS: observation}) - dist = self.dist_class(dist_inputs, self.model) - actions = dist.sample() - actions = tree.map_structure(lambda a: a.numpy(), actions) - # Graph mode. - else: - actions = self.sess.run(self.sampler, feed_dict={self.inputs: observation}) - - if add_noise: - actions = tree.map_structure( - self._add_noise, actions, self.action_space_struct - ) - # Convert `flat_actions` to a list of lists of action components - # (list of single actions). - actions = unbatch(actions) - return actions, [], {} - - def compute_single_action( - self, observation, add_noise=False, update=True, **kwargs - ): - action, state_outs, extra_fetches = self.compute_actions( - [observation], add_noise=add_noise, update=update, **kwargs - ) - return action[0], state_outs, extra_fetches - - def _add_noise(self, single_action, single_action_space): - if isinstance( - single_action_space, gym.spaces.Box - ) and single_action_space.dtype.name.startswith("float"): - single_action += ( - np.random.randn(*single_action.shape) * self.action_noise_std - ) - return single_action - - def get_state(self): - return {"state": self.get_flat_weights()} - - def set_state(self, state): - return self.set_flat_weights(state["state"]) - - def set_flat_weights(self, x): - self.variables.set_flat(x) - - def get_flat_weights(self): - return self.variables.get_flat() diff --git a/rllib_contrib/es/src/rllib_es/es/es_torch_policy.py b/rllib_contrib/es/src/rllib_es/es/es_torch_policy.py deleted file mode 100644 index 4028702e0ef2e..0000000000000 --- a/rllib_contrib/es/src/rllib_es/es/es_torch_policy.py +++ /dev/null @@ -1,132 +0,0 @@ -# Code in this file is adapted from: -# https://github.com/openai/evolution-strategies-starter. - -import gymnasium as gym -import numpy as np -import tree # pip install dm_tree - -import ray -from ray.rllib.models import ModelCatalog -from ray.rllib.policy.policy_template import build_policy_class -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.filter import get_filter -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space, unbatch -from ray.rllib.utils.torch_utils import convert_to_torch_tensor - -torch, _ = try_import_torch() - - -def before_init(policy, observation_space, action_space, config): - policy.action_noise_std = config["action_noise_std"] - policy.action_space_struct = get_base_struct_from_space(action_space) - policy.preprocessor = ModelCatalog.get_preprocessor_for_space(observation_space) - policy.observation_filter = get_filter( - config["observation_filter"], policy.preprocessor.shape - ) - - def _set_flat_weights(policy, theta): - pos = 0 - theta_dict = policy.model.state_dict() - new_theta_dict = {} - - for k in sorted(theta_dict.keys()): - shape = policy.param_shapes[k] - num_params = int(np.prod(shape)) - new_theta_dict[k] = torch.from_numpy( - np.reshape(theta[pos : pos + num_params], shape) - ) - pos += num_params - policy.model.load_state_dict(new_theta_dict) - - def _get_flat_weights(policy): - # Get the parameter tensors. - theta_dict = policy.model.state_dict() - # Flatten it into a single np.ndarray. 
- theta_list = [] - for k in sorted(theta_dict.keys()): - theta_list.append(torch.reshape(theta_dict[k], (-1,))) - cat = torch.cat(theta_list, dim=0) - return cat.cpu().numpy() - - type(policy).set_flat_weights = _set_flat_weights - type(policy).get_flat_weights = _get_flat_weights - - def _compute_actions(policy, obs_batch, add_noise=False, update=True, **kwargs): - # Batch is given as list -> Try converting to numpy first. - if isinstance(obs_batch, list) and len(obs_batch) == 1: - obs_batch = obs_batch[0] - observation = policy.preprocessor.transform(obs_batch) - observation = policy.observation_filter(observation[None], update=update) - - observation = convert_to_torch_tensor(observation, policy.device) - dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: observation}, [], None) - dist = policy.dist_class(dist_inputs, policy.model) - action = dist.sample() - - def _add_noise(single_action, single_action_space): - single_action = single_action.detach().cpu().numpy() - if ( - add_noise - and isinstance(single_action_space, gym.spaces.Box) - and single_action_space.dtype.name.startswith("float") - ): - single_action += ( - np.random.randn(*single_action.shape) * policy.action_noise_std - ) - return single_action - - action = tree.map_structure(_add_noise, action, policy.action_space_struct) - action = unbatch(action) - return action, [], {} - - def _compute_single_action( - policy, observation, add_noise=False, update=True, **kwargs - ): - action, state_outs, extra_fetches = policy.compute_actions( - [observation], add_noise=add_noise, update=update, **kwargs - ) - return action[0], state_outs, extra_fetches - - type(policy).compute_actions = _compute_actions - type(policy).compute_single_action = _compute_single_action - - -def after_init(policy, observation_space, action_space, config): - state_dict = policy.model.state_dict() - policy.param_shapes = { - k: tuple(state_dict[k].size()) for k in sorted(state_dict.keys()) - } - policy.num_params = sum(np.prod(s) for s in policy.param_shapes.values()) - - -def make_model_and_action_dist(policy, observation_space, action_space, config): - # Policy network. - dist_class, dist_dim = ModelCatalog.get_action_dist( - action_space, - config["model"], # model_options - dist_type="deterministic", - framework="torch", - ) - model = ModelCatalog.get_model_v2( - policy.preprocessor.observation_space, - action_space, - num_outputs=dist_dim, - model_config=config["model"], - framework="torch", - ) - # Make all model params not require any gradients. - for p in model.parameters(): - p.requires_grad = False - return model, dist_class - - -ESTorchPolicy = build_policy_class( - name="ESTorchPolicy", - framework="torch", - loss_fn=None, - get_default_config=lambda: ray.rllib.algorithms.es.es.ESConfig(), - before_init=before_init, - after_init=after_init, - make_model_and_action_dist=make_model_and_action_dist, -) diff --git a/rllib_contrib/es/src/rllib_es/es/optimizers.py b/rllib_contrib/es/src/rllib_es/es/optimizers.py deleted file mode 100644 index 7bc39e98effca..0000000000000 --- a/rllib_contrib/es/src/rllib_es/es/optimizers.py +++ /dev/null @@ -1,53 +0,0 @@ -# Code in this file is copied and adapted from -# https://github.com/openai/evolution-strategies-starter. 
- -import numpy as np - - -class Optimizer: - def __init__(self, policy): - self.policy = policy - self.dim = policy.num_params - self.t = 0 - - def update(self, globalg): - self.t += 1 - step = self._compute_step(globalg) - theta = self.policy.get_flat_weights() - ratio = np.linalg.norm(step) / np.linalg.norm(theta) - return theta + step, ratio - - def _compute_step(self, globalg): - raise NotImplementedError - - -class SGD(Optimizer): - def __init__(self, policy, stepsize, momentum=0.0): - Optimizer.__init__(self, policy) - self.v = np.zeros(self.dim, dtype=np.float32) - self.stepsize, self.momentum = stepsize, momentum - - def _compute_step(self, globalg): - self.v = self.momentum * self.v + (1.0 - self.momentum) * globalg - step = -self.stepsize * self.v - return step - - -class Adam(Optimizer): - def __init__(self, policy, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08): - Optimizer.__init__(self, policy) - self.stepsize = stepsize - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.m = np.zeros(self.dim, dtype=np.float32) - self.v = np.zeros(self.dim, dtype=np.float32) - - def _compute_step(self, globalg): - a = self.stepsize * ( - np.sqrt(1 - self.beta2**self.t) / (1 - self.beta1**self.t) - ) - self.m = self.beta1 * self.m + (1 - self.beta1) * globalg - self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) - step = -a * self.m / (np.sqrt(self.v) + self.epsilon) - return step diff --git a/rllib_contrib/es/src/rllib_es/es/utils.py b/rllib_contrib/es/src/rllib_es/es/utils.py deleted file mode 100644 index 3c6bb03ea1a22..0000000000000 --- a/rllib_contrib/es/src/rllib_es/es/utils.py +++ /dev/null @@ -1,50 +0,0 @@ -# Code in this file is copied and adapted from -# https://github.com/openai/evolution-strategies-starter. - -import numpy as np - - -def compute_ranks(x): - """Returns ranks in [0, len(x)) - - Note: This is different from scipy.stats.rankdata, which returns ranks in - [1, len(x)]. 
- """ - assert x.ndim == 1 - ranks = np.empty(len(x), dtype=int) - ranks[x.argsort()] = np.arange(len(x)) - return ranks - - -def compute_centered_ranks(x): - y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32) - y /= x.size - 1 - y -= 0.5 - return y - - -def itergroups(items, group_size): - assert group_size >= 1 - group = [] - for x in items: - group.append(x) - if len(group) == group_size: - yield tuple(group) - del group[:] - if group: - yield tuple(group) - - -def batched_weighted_sum(weights, vecs, batch_size): - total = 0 - num_items_summed = 0 - for batch_weights, batch_vecs in zip( - itergroups(weights, batch_size), itergroups(vecs, batch_size) - ): - assert len(batch_weights) == len(batch_vecs) <= batch_size - total += np.dot( - np.asarray(batch_weights, dtype=np.float32), - np.asarray(batch_vecs, dtype=np.float32), - ) - num_items_summed += len(batch_weights) - return total, num_items_summed diff --git a/rllib_contrib/es/tests/test_es.py b/rllib_contrib/es/tests/test_es.py deleted file mode 100644 index 8be96387c87bf..0000000000000 --- a/rllib_contrib/es/tests/test_es.py +++ /dev/null @@ -1,80 +0,0 @@ -import unittest - -import numpy as np -import rllib_es.es.es as es - -import ray -from ray.rllib.utils.test_utils import check_compute_single_action, framework_iterator - - -class TestES(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init(num_cpus=4) - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_es_compilation(self): - """Test whether an ESAlgorithm can be built on all frameworks.""" - config = es.ESConfig() - # Keep it simple. - config.training( - model={ - "fcnet_hiddens": [10], - "fcnet_activation": None, - }, - noise_size=2500000, - episodes_per_batch=10, - train_batch_size=100, - ) - config.rollouts(num_rollout_workers=1) - # Test eval workers ("normal" WorkerSet, unlike ES' list of - # RolloutWorkers used for collecting train batches). - config.evaluation(evaluation_interval=1, evaluation_num_workers=2) - - num_iterations = 1 - - for _ in framework_iterator(config): - for env in ["CartPole-v1", "Pendulum-v1"]: - algo = config.build(env=env) - for i in range(num_iterations): - results = algo.train() - print(results) - - check_compute_single_action(algo) - algo.stop() - ray.shutdown() - - def test_es_weights(self): - """Test whether an ESAlgorithm can be built on all frameworks.""" - config = es.ESConfig() - # Keep it simple. 
- config.training( - model={ - "fcnet_hiddens": [10], - "fcnet_activation": None, - }, - noise_size=2500000, - episodes_per_batch=10, - train_batch_size=100, - ) - config.rollouts(num_rollout_workers=1) - - for _ in framework_iterator(config): - algo = config.build(env="CartPole-v1") - - weights = np.zeros_like(algo.get_weights()) - algo.set_weights(weights=weights) - new_weights = algo.get_weights() - - self.assertTrue(np.array_equal(weights, new_weights)) - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/es/tuned_examples/__init__.py b/rllib_contrib/es/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/es/tuned_examples/cartpole-es.yaml b/rllib_contrib/es/tuned_examples/cartpole-es.yaml deleted file mode 100644 index 0115f80aab8c6..0000000000000 --- a/rllib_contrib/es/tuned_examples/cartpole-es.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-es: - env: CartPole-v1 - run: ES - stop: - sampler_results/episode_reward_mean: 100 - timesteps_total: 500000 - config: - # Works for both torch and tf. - framework: torch - num_workers: 2 - noise_size: 25000000 - episodes_per_batch: 50 diff --git a/rllib_contrib/es/tuned_examples/humanoid-es.yaml b/rllib_contrib/es/tuned_examples/humanoid-es.yaml deleted file mode 100644 index 5b2145041f68a..0000000000000 --- a/rllib_contrib/es/tuned_examples/humanoid-es.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -humanoid-v2-es: - env: Humanoid-v2 - run: ES - stop: - sampler_results/episode_reward_mean: 6000 - config: - # Works for both torch and tf. - framework: torch - num_workers: 100 diff --git a/rllib_contrib/leela_chess_zero/BUILD b/rllib_contrib/leela_chess_zero/BUILD deleted file mode 100644 index a665404c05f8f..0000000000000 --- a/rllib_contrib/leela_chess_zero/BUILD +++ /dev/null @@ -1,32 +0,0 @@ -# Examples - -py_test( - name = "example_leela_chess_zero_connect_4", - main = "leela_chess_zero_connect_4.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/leela_chess_zero_connect_4.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_multi_agent_chess_leela_chess_zero", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], - size = "large", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/multi-agent-chess-leela-chess-zero.py"], - args = ["--dir=leela_chess_zero/tuned_examples/", "--framework=torch"] -) - - -# Compilation Tests - -py_test( - name = "test_leela_chess_zero", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_leela_chess_zero.py"] -) diff --git a/rllib_contrib/leela_chess_zero/README.md b/rllib_contrib/leela_chess_zero/README.md deleted file mode 100644 index d7e2bbaa4629e..0000000000000 --- a/rllib_contrib/leela_chess_zero/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Leela Chess Zero - -[Leela Chess Zero](https://lczero.org/) Leela chess zero is an algorithm made to train agents on the Leela Chess Engine. The Leela Chess Zero’s neural network is largely based on the DeepMind’s AlphaGo Zero and AlphaZero architecture. There are however some changes. It should be trained in a competition with multiple versions of its past self. 
- -The policy/model assumes that the environment is a MultiAgent Chess environment, that has a discrete action space and returns an observation as a dictionary with two keys: - - - `obs` that contains an observation under either the form of a state vector or an image - - `action_mask` that contains a mask over the legal actions - - It should also implement a `get_state`and a `set_state` function, used in the MCTS implementation. - - The model used in AlphaZero trainer should extend `TorchModelV2` and implement the method `compute_priors_and_value`. - - -## References - -- AlphaZero: https://arxiv.org/abs/1712.01815 -- LeelaChessZero: https://github.com/LeelaChessZero/lc0 - - - -## Installation - -``` -conda create -n rllib-leela-chess python=3.10 -conda activate rllib-leela-chess -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[Leela Chess Zero Example]() \ No newline at end of file diff --git a/rllib_contrib/leela_chess_zero/examples/custom_model.py b/rllib_contrib/leela_chess_zero/examples/custom_model.py deleted file mode 100644 index c9fd80ddd2ba6..0000000000000 --- a/rllib_contrib/leela_chess_zero/examples/custom_model.py +++ /dev/null @@ -1,52 +0,0 @@ -import torch.nn as nn - -from ray.rllib.algorithms.alpha_zero.models.custom_torch_models import ( - ActorCriticModel, - Flatten, -) - - -class DenseModel(ActorCriticModel): - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - ActorCriticModel.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - in_shape = 84 - self.shared_layers = nn.Sequential( - Flatten(), - nn.Linear(in_features=in_shape, out_features=2048), - nn.ReLU(), - nn.Linear(in_features=2048, out_features=2048), - nn.ReLU(), - ) - self.actor_layers = nn.Sequential( - nn.Linear(in_features=2048, out_features=action_space.n) - ) - self.critic_layers = nn.Sequential( - nn.Linear(in_features=2048, out_features=1), nn.Tanh() - ) - self._value_out = None - - def forward(self, input_dict, state, seq_lens): - try: - x = input_dict["obs"] - except KeyError: - x = input_dict["observation"] - x = self.shared_layers(x) - # actor outputs - logits = self.actor_layers(x) - - # compute value - self._value_out = self.critic_layers(x) - return logits, None - - -class PolicyMappingFn: - """Example for a callable class specifyable in yaml files as `policy_mapping_fn`.""" - - def __call__(self, agent_id, episode, worker, **kwargs): - return "p_" + ( - str("0" if int(agent_id.split("_")[-1]) % 2 == 0 else "1") - if (episode.episode_id) % 2 == 0 - else str("1" if int(agent_id.split("_")[-1]) % 2 == 0 else "0") - ) diff --git a/rllib_contrib/leela_chess_zero/examples/leela_chess_zero_connect_4.py b/rllib_contrib/leela_chess_zero/examples/leela_chess_zero_connect_4.py deleted file mode 100644 index 769a668df3d7e..0000000000000 --- a/rllib_contrib/leela_chess_zero/examples/leela_chess_zero_connect_4.py +++ /dev/null @@ -1,54 +0,0 @@ -import argparse - -from custom_model import DenseModel, PolicyMappingFn -from rllib_leela_chess_zero.leela_chess_zero import LeelaChessZero, LeelaChessZeroConfig - -import ray -from ray import air, tune -from ray.rllib.examples.env.pettingzoo_connect4 import MultiAgentConnect4 - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == 
"__main__": - args = get_cli_args() - - ray.init() - - config = ( - LeelaChessZeroConfig() - .rollouts(num_rollout_workers=7) - .framework("torch") - .environment(MultiAgentConnect4) - .training(model={"custom_model": DenseModel, "max_seq_len": 0}) - .multi_agent( - policies=["p_0", "p_1"], - policies_to_train=["p_0"], - policy_mapping_fn={"type": PolicyMappingFn}, - ) - ) - - if args.run_as_test: - stop = {"timesteps_total": 10000} - else: - stop = { - "policy_reward_mean/p_0": 0.9, - "timesteps_total": 1000000, - } - - tuner = tune.Tuner( - LeelaChessZero, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop=stop, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() diff --git a/rllib_contrib/leela_chess_zero/pyproject.toml b/rllib_contrib/leela_chess_zero/pyproject.toml deleted file mode 100644 index 07d0946ea75c1..0000000000000 --- a/rllib_contrib/leela_chess_zero/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-leela-chess-zero" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium==0.26.3", "pettingzoo==1.22.4", "chess==1.10.0", "ray[rllib]==2.5.0", "pygame"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/leela_chess_zero/requirements.txt b/rllib_contrib/leela_chess_zero/requirements.txt deleted file mode 100644 index b07006a1b4ec6..0000000000000 --- a/rllib_contrib/leela_chess_zero/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/__init__.py b/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/__init__.py deleted file mode 100644 index 564fa8a09f87a..0000000000000 --- a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from rllib_leela_chess_zero.leela_chess_zero.leela_chess_zero import ( - LeelaChessZero, - LeelaChessZeroConfig, -) -from rllib_leela_chess_zero.leela_chess_zero.leela_chess_zero_model import ( - LeelaChessZeroModel, -) -from rllib_leela_chess_zero.leela_chess_zero.leela_chess_zero_policy import ( - LeelaChessZeroPolicy, -) - -from ray.tune.registry import register_trainable - -__all__ = [ - "LeelaChessZero", - "LeelaChessZeroConfig", - "LeelaChessZeroModel", - "LeelaChessZeroPolicy", -] - -register_trainable("rllib-contrib-leela-chess-zero", LeelaChessZero) diff --git a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/leela_chess_zero.py b/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/leela_chess_zero.py deleted file mode 100644 index 7c6ab8f7cb91b..0000000000000 --- a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/leela_chess_zero.py +++ /dev/null @@ -1,416 +0,0 @@ -import logging -from typing import List, Optional, Type, Union - -from rllib_leela_chess_zero.leela_chess_zero.leela_chess_zero_model import ( - LeelaChessZeroModel, -) -from rllib_leela_chess_zero.leela_chess_zero.leela_chess_zero_policy import ( - LeelaChessZeroPolicy, -) -from rllib_leela_chess_zero.leela_chess_zero.mcts import MCTS - -from ray.rllib.algorithms.algorithm import Algorithm -from 
ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample -from ray.rllib.execution.train_ops import train_one_step -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import restore_original_dimensions -from ray.rllib.models.torch.torch_action_dist import TorchCategorical -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import concat_samples -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics import ( - NUM_AGENT_STEPS_SAMPLED, - NUM_ENV_STEPS_SAMPLED, - SYNCH_WORKER_WEIGHTS_TIMER, -) -from ray.rllib.utils.replay_buffers import PrioritizedReplayBuffer -from ray.rllib.utils.replay_buffers.utils import validate_buffer_config -from ray.rllib.utils.typing import ResultDict - -torch, nn = try_import_torch() - -logger = logging.getLogger(__name__) - - -class LeelaChessZeroDefaultCallbacks(DefaultCallbacks): - """LeelaChessZero callbacks. - If you use custom callbacks, you must extend this class and call super() - for on_episode_start. - """ - - def __init__(self): - super().__init__() - - @override(DefaultCallbacks) - def on_episode_start(self, worker, base_env, policies, episode, **kwargs): - # save env state when an episode starts - env = base_env.get_sub_environments()[0] - state = env.get_state() - episode.user_data["initial_state"] = state - episode.user_data["current_state"] = [state] - - @override(DefaultCallbacks) - def on_episode_step(self, worker, base_env, policies, episode, **kwargs) -> None: - env = base_env.get_sub_environments()[0] - state = env.get_state() - episode.user_data["current_state"].append(state) - - -class LeelaChessZeroConfig(AlgorithmConfig): - """Defines a configuration class from which a LeelaChessZero Algorithm can be built. - - Example: - >>> from ray.rllib.algorithms.leela_chess_zero as lc0 # doctest: +SKIP - >>> from lc0 import LeelaChessZeroConfig # doctest: +SKIP - >>> config = LeelaChessZeroConfig() # doctest: +SKIP - >>> config = config.training(sgd_minibatch_size=256) # doctest: +SKIP - >>> config = config..resources(num_gpus=0) # doctest: +SKIP - >>> config = config..rollouts(num_rollout_workers=4) # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.leela_chess_zero as lc0 # doctest: +SKIP - >>> from lc0 import LeelaChessZeroConfig # doctest: +SKIP - >>> from ray import air # doctest: +SKIP - >>> from ray import tune # doctest: +SKIP - >>> config = LeelaChessZeroConfig() # doctest: +SKIP - >>> # Print out some default values. - >>> print(config.shuffle_sequences) # doctest: +SKIP - >>> # Update the config object. - >>> config.training(lr=tune.grid_search([0.001, 0.0001])) # doctest: +SKIP - >>> # Set the config object's env. - >>> config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "LeelaChessZero", # doctest: +SKIP - ... run_config=air.RunConfig(stop={ # doctest: +SKIP - "episode_reward_mean": 200}), # doctest: +SKIP - ... 
param_space=config.to_dict(), # doctest: +SKIP - ... ).fit() # doctest: +SKIP - """ - - def __init__(self, algo_class=None): - """Initializes a LeelaChessZeroConfig instance.""" - super().__init__(algo_class=algo_class or LeelaChessZero) - - # fmt: off - # __sphinx_doc_begin__ - # LeelaChessZero specific config settings: - self.sgd_minibatch_size = 256 - self.shuffle_sequences = True - self.num_sgd_iter = 30 - self.replay_buffer_config = { - "_enable_replay_buffer_api": True, - "type": "MultiAgentReplayBuffer", - "underlying_replay_buffer_config": { - "type": PrioritizedReplayBuffer, - "capacity": 10000, "storage_unit": "episodes", - "prioritized_replay_alpha": 0.6, "prioritized_replay_beta": 0.4, - "prioritized_replay_eps": 1e-6, - }, - } - # Number of timesteps to collect from rollout workers before we start - # sampling from replay buffers for learning. Whether we count this in agent - # steps or environment steps depends on config.multi_agent(count_steps_by=..). - self.num_steps_sampled_before_learning_starts = 1000 - self.lr_schedule = None - self.vf_share_layers = False - self.mcts_config = { - "puct_coefficient": 2**0.5, - "num_simulations": 25, - "temperature": 1.5, - "dirichlet_epsilon": 0.25, - "dirichlet_noise": 0.03, - "argmax_tree_policy": True, - "add_dirichlet_noise": True, - "epsilon": 0.05, - "turn_based_flip": True, - "argmax_child_value": True, - } - self.model = {"custom_model" : LeelaChessZeroModel} - - # Override some of AlgorithmConfig's default values with AlphaZero-specific - # values. - self.framework_str = "torch" - self.callbacks_class = LeelaChessZeroDefaultCallbacks - self.lr = 1e-3 - self.num_rollout_workers = 8 - self.rollout_fragment_length = 200 - self.train_batch_size = 2048 - self.batch_mode = "complete_episodes" - # Extra configuration for eval that disables exploration. - self.evaluation(evaluation_config={ - "mcts_config": { - "argmax_tree_policy": True, - "add_dirichlet_noise": False, - }, - }) - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). - } - # __sphinx_doc_end__ - # fmt: on - - self.buffer_size = DEPRECATED_VALUE - - @override(AlgorithmConfig) - def callbacks( - self, *, callbacks_class: Optional[DefaultCallbacks] = NotProvided, **kwargs - ) -> "LeelaChessZeroConfig": - super().callbacks(callbacks_class, **kwargs) - - if callbacks_class is not NotProvided: - self.callbacks_class = callbacks_class - return self - - @override(AlgorithmConfig) - def training( - self, - *, - sgd_minibatch_size: Optional[int] = NotProvided, - shuffle_sequences: Optional[bool] = NotProvided, - num_sgd_iter: Optional[int] = NotProvided, - replay_buffer_config: Optional[dict] = NotProvided, - lr: Optional[float] = NotProvided, - lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, - vf_share_layers: Optional[bool] = NotProvided, - mcts_config: Optional[dict] = NotProvided, - num_steps_sampled_before_learning_starts: Optional[int] = NotProvided, - model: Optional[dict] = NotProvided, - **kwargs, - ) -> "LeelaChessZeroConfig": - """Sets the training related configuration. - - Args: - sgd_minibatch_size: Total SGD batch size across all devices for SGD. 
-            shuffle_sequences: Whether to shuffle sequences in the batch when training
-                (recommended).
-            num_sgd_iter: Number of SGD iterations in each outer loop.
-            replay_buffer_config: Replay buffer config.
-                Examples:
-                {
-                "_enable_replay_buffer_api": True,
-                "type": "MultiAgentReplayBuffer",
-                "learning_starts": 1000,
-                "capacity": 50000,
-                "replay_sequence_length": 1,
-                }
-                - OR -
-                {
-                "_enable_replay_buffer_api": True,
-                "type": "MultiAgentPrioritizedReplayBuffer",
-                "capacity": 50000,
-                "prioritized_replay_alpha": 0.6,
-                "prioritized_replay_beta": 0.4,
-                "prioritized_replay_eps": 1e-6,
-                "replay_sequence_length": 1,
-                }
-                Where -
-                prioritized_replay_alpha: Alpha parameter controls the degree of
-                prioritization in the buffer. In other words, when a buffer sample has
-                a higher temporal-difference error, with how much higher probability
-                should it be drawn for updating the parameterized Q-network. 0.0
-                corresponds to uniform probability. Setting it much above 1.0 may
-                quickly make the sampling distribution very peaked ("pointy"), i.e.
-                low-entropy.
-                prioritized_replay_beta: Beta parameter controls the degree of
-                importance sampling, which suppresses the influence of gradient updates
-                from samples that have a higher probability of being sampled via the
-                alpha parameter and the temporal-difference error.
-                prioritized_replay_eps: Epsilon parameter sets the baseline probability
-                for sampling so that when the temporal-difference error of a sample is
-                zero, there is still a chance of drawing the sample.
-            lr_schedule: Learning rate schedule. In the format of
-                [[timestep, lr-value], [timestep, lr-value], ...]
-                Intermediary timesteps will be assigned to interpolated learning rate
-                values. A schedule should normally start from timestep 0.
-            vf_share_layers: Share layers for value function. If you set this to True,
-                it's important to tune vf_loss_coeff.
-            mcts_config: MCTS specific settings.
-            num_steps_sampled_before_learning_starts: Number of timesteps to collect
-                from rollout workers before we start sampling from replay buffers for
-                learning. Whether we count this in agent steps or environment steps
-                depends on config.multi_agent(count_steps_by=..).
-
-        Returns:
-            This updated AlgorithmConfig object.
-        """
-        # Pass kwargs onto super's `training()` method.
- super().training(**kwargs) - - if sgd_minibatch_size is not NotProvided: - self.sgd_minibatch_size = sgd_minibatch_size - if shuffle_sequences is not NotProvided: - self.shuffle_sequences = shuffle_sequences - if num_sgd_iter is not NotProvided: - self.num_sgd_iter = num_sgd_iter - if replay_buffer_config is not NotProvided: - self.replay_buffer_config = replay_buffer_config - if lr is not NotProvided: - self.lr = lr - if lr_schedule is not NotProvided: - self.lr_schedule = lr_schedule - if vf_share_layers is not NotProvided: - self.vf_share_layers = vf_share_layers - if mcts_config is not NotProvided: - # only assign provided keys - for k, v in mcts_config.items(): - self.mcts_config[k] = v - if num_steps_sampled_before_learning_starts is not NotProvided: - self.num_steps_sampled_before_learning_starts = ( - num_steps_sampled_before_learning_starts - ) - if model is not NotProvided: - self.model = model - - return self - - @override(AlgorithmConfig) - def update_from_dict(self, config_dict) -> "LeelaChessZeroConfig": - config_dict = config_dict.copy() - - if "ranked_rewards" in config_dict: - value = config_dict.pop("ranked_rewards") - self.training(ranked_rewards=value) - - return super().update_from_dict(config_dict) - - @override(AlgorithmConfig) - def validate(self) -> None: - """Checks and updates the config based on settings.""" - # Call super's validation method. - super().validate() - validate_buffer_config(self) - - -def leela_chess_zero_loss(policy, model, dist_class, train_batch): - # get inputs unflattened inputs - input_dict = restore_original_dimensions( - train_batch["obs"], policy.observation_space, "torch" - ) - # forward pass in model - model_out = model.forward(input_dict, None, [1]) - logits, _ = model_out - values = model.value_function() - logits, values = torch.squeeze(logits), torch.squeeze(values) - priors = nn.Softmax(dim=-1)(logits) - # compute actor and critic losses - policy_loss = torch.mean( - -torch.sum(train_batch["mcts_policies"] * torch.log(priors), dim=-1) - ) - value_loss = torch.mean(torch.pow(values - train_batch["value_label"], 2)) - # compute total loss - total_loss = (policy_loss + value_loss) / 2 - return total_loss, policy_loss, value_loss - - -class LeelaChessZeroPolicyWrapperClass(LeelaChessZeroPolicy): - def __init__(self, obs_space, action_space, config): - model = ModelCatalog.get_model_v2( - obs_space, action_space, action_space.n, config["model"], "torch" - ) - _, env_creator = Algorithm._get_env_id_and_creator(config["env"], config) - - def _env_creator(): - return env_creator(config["env_config"]) - - def mcts_creator(): - mcts_params = config["mcts_config"] - return MCTS(model, mcts_params) - - super().__init__( - obs_space, - action_space, - config, - model, - leela_chess_zero_loss, - TorchCategorical, - mcts_creator, - _env_creator, - ) - - -class LeelaChessZero(Algorithm): - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return LeelaChessZeroConfig() - - @override(Algorithm) - def get_default_policy_class(self, *args, **kwargs) -> Optional[Type[Policy]]: - return LeelaChessZeroPolicyWrapperClass - - @override(Algorithm) - def training_step(self) -> ResultDict: - """TODO: - - Returns: - The results dict from executing the training iteration. - """ - - # Sample n MultiAgentBatches from n workers. - new_sample_batches = synchronous_parallel_sample( - worker_set=self.workers, concat=False - ) - - for batch in new_sample_batches: - # Update sampling step counters. 
- self._counters[NUM_ENV_STEPS_SAMPLED] += batch.env_steps() - self._counters[NUM_AGENT_STEPS_SAMPLED] += batch.agent_steps() - # Store new samples in the replay buffer - if self.local_replay_buffer is not None: - self.local_replay_buffer.add(batch) - - if self.local_replay_buffer is not None: - # Update target network every `target_network_update_freq` sample steps. - cur_ts = self._counters[ - NUM_AGENT_STEPS_SAMPLED - if self.config.count_steps_by == "agent_steps" - else NUM_ENV_STEPS_SAMPLED - ] - - if cur_ts > self.config.num_steps_sampled_before_learning_starts: - train_batch = self.local_replay_buffer.sample( - self.config.train_batch_size - ) - else: - train_batch = None - else: - train_batch = concat_samples(new_sample_batches) - - # Learn on the training batch. - # Use simple optimizer (only for multi-agent or tf-eager; all other - # cases should use the multi-GPU optimizer, even if only using 1 GPU) - train_results = {} - if train_batch is not None: - train_results = train_one_step(self, train_batch) - - # TODO: Move training steps counter update outside of `train_one_step()` method. - # # Update train step counters. - # self._counters[NUM_ENV_STEPS_TRAINED] += train_batch.env_steps() - # self._counters[NUM_AGENT_STEPS_TRAINED] += train_batch.agent_steps() - - # Update weights and global_vars - after learning on the local worker - on all - # remote workers. - global_vars = { - "timestep": self._counters[NUM_ENV_STEPS_SAMPLED], - } - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - self.workers.sync_weights(global_vars=global_vars) - - # Return all collected metrics for the iteration. - return train_results diff --git a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/leela_chess_zero_model.py b/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/leela_chess_zero_model.py deleted file mode 100644 index bfe8c7e293920..0000000000000 --- a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/leela_chess_zero_model.py +++ /dev/null @@ -1,174 +0,0 @@ -import numpy as np - -from ray.rllib.models.preprocessors import get_preprocessor -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import ModelConfigDict, TensorType - -torch, nn = try_import_torch() -F = nn.functional - - -def convert_to_tensor(arr): - tensor = torch.from_numpy(np.asarray(arr)) - if tensor.dtype == torch.double: - tensor = tensor.float() - return tensor - - -class LeelaChessZeroModel(TorchModelV2, nn.Module): - def __init__( - self, - obs_space, - action_space, - num_outputs: int, - model_config: ModelConfigDict, - name: str, - ): - TorchModelV2.__init__( - self, obs_space, action_space, num_outputs, model_config, name - ) - nn.Module.__init__(self) - try: - self.preprocessor = get_preprocessor(obs_space.original_space)( - obs_space.original_space - ) - except Exception: - self.preprocessor = get_preprocessor(obs_space)(obs_space) - - self.action_masking = False - self.alpha_zero_obs = True - if self.alpha_zero_obs: - self.input_channel_size = 111 - else: - self.input_channel_size = 19 - - filters = 32 - res_blocks = 3 - se_channels = 0 - policy_conv_size = 73 - policy_output_size = 4672 - self.num_outputs = 4672 - self.name = name - self.obs_space = obs_space - self.action_space = action_space - self.model_config = model_config - - self.filters = filters - self.res_blocks = res_blocks - self.se_channels 
= se_channels - self.policy_conv_size = policy_conv_size - self.policy_output_size = policy_output_size - self.pre_conv = nn.Conv2d( - self.input_channel_size, self.filters, 3, padding="same" - ) - self.conv1 = nn.Conv2d(self.filters, self.filters, 3, padding="same") - self.conv2 = nn.Conv2d(self.filters, self.filters, 3, padding="same") - self.pool = nn.AvgPool2d(8) - self.se1 = nn.Linear(self.filters, self.se_channels) - self.se2 = nn.Linear(self.se_channels, self.filters * 2) - self.fc_head = nn.Linear(self.filters * 64, 128) - self.value_head = nn.Linear(128, 1) - self.policy_conv1 = nn.Conv2d( - self.filters, self.policy_conv_size, 3, padding="same" - ) - self.policy_fc = nn.Linear(self.policy_conv_size * 64, self.policy_output_size) - self._value = None - - @override(TorchModelV2) - def forward(self, input_dict, state, seq_lens): - try: - obs = input_dict["obs"]["observation"] - action_mask = input_dict["obs"]["action_mask"] - except KeyError: - try: - obs = input_dict["obs"] - action_mask = input_dict["action_mask"] - except KeyError: - try: - obs = input_dict["observation"] - action_mask = input_dict["action_mask"] - except KeyError: - print(input_dict) - raise Exception("No observation in input_dict") - if self.alpha_zero_obs: - if not type(obs) == torch.Tensor: - obs = torch.from_numpy(obs.astype(np.float32)) - action_mask = torch.from_numpy(action_mask.astype(np.float32)) - try: - obs = torch.transpose(obs, 3, 1) - obs = torch.transpose(obs, 3, 2) - except IndexError: - obs = torch.reshape(obs, (1, 8, 8, self.input_channel_size)) - obs = torch.transpose(obs, 3, 1) - obs = torch.transpose(obs, 3, 2) - - x = self.pre_conv(obs) - residual = x - for i in range(self.res_blocks): - x = self.conv1(x) - x = self.conv2(x) - if self.se_channels > 0: - input = x - se = self.pool(x) - se = torch.flatten(se, 1) - se = F.relu(self.se1(se)) - se = self.se2(se) - w, b = torch.tensor_split(se, 2, dim=-1) - z = torch.sigmoid(w) - input = torch.reshape(input, (-1, self.filters, 64)) - z = torch.reshape(z, (-1, self.filters, 1)) - se = torch.mul(z, input) - se = torch.reshape(se, (-1, self.filters, 8, 8)) - se += b - x += residual - residual = x - x = torch.relu(x) - value = torch.flatten(x, 1) - value = torch.relu(self.fc_head(value)) - value = torch.tanh(self.value_head(value)) - policy = self.policy_conv1(x) - policy = torch.flatten(policy, 1) - policy = self.policy_fc(policy) - self._value = value.squeeze(1) - - if self.action_masking: - masked_policy = self.apply_action_mask(policy, action_mask) - return masked_policy, state - else: - return policy, state - - @override(TorchModelV2) - def value_function(self) -> TensorType: - return self._value - - def apply_action_mask(self, policy, action_mask): - masked_policy = torch.mul(policy, action_mask) - action_mask = torch.clamp(torch.log(action_mask), -1e10, 3.4e38) - return masked_policy + action_mask - - def get_board_evaluation(self, obs): - return self.compute_priors_and_value(obs) - - def compute_priors_and_value(self, obs): - new_obs = torch.from_numpy( - obs["observation"] - .astype(np.float32) - .reshape([1, 8, 8, self.input_channel_size]) - ) - new_action_mask = torch.from_numpy( - obs["action_mask"].astype(np.float32).reshape([1, self.num_outputs]) - ) - input_dict = {"obs": {"observation": new_obs, "action_mask": new_action_mask}} - with torch.no_grad(): - model_out = self.forward(input_dict, None, [1]) - logits, _ = model_out - value = self.value_function() - logits, value = torch.squeeze(logits), torch.squeeze(value) - priors = 
nn.Softmax(dim=-1)(logits) - value = nn.Tanh()(value) - - priors = priors.cpu().numpy() - value = value.cpu().numpy() - return priors, value diff --git a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/leela_chess_zero_policy.py b/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/leela_chess_zero_policy.py deleted file mode 100644 index e02769783f867..0000000000000 --- a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/leela_chess_zero_policy.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -from rllib_leela_chess_zero.leela_chess_zero.mcts import Node, RootParentNode - -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.torch_policy import TorchPolicy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY - -torch, _ = try_import_torch() - - -class LeelaChessZeroPolicy(TorchPolicy): - def __init__( - self, - observation_space, - action_space, - config, - model, - loss, - action_distribution_class, - mcts_creator, - env_creator, - **kwargs - ): - super().__init__( - observation_space, - action_space, - config, - model=model, - loss=loss, - action_distribution_class=action_distribution_class, - ) - # we maintain an env copy in the policy that is used during mcts - # simulations - self.env_creator = env_creator - self.mcts = mcts_creator() - self.env = self.env_creator() - self.obs_space = observation_space - # only used in multi policy competitive environments - self.elo = 400 - - @override(TorchPolicy) - def compute_actions( - self, - obs_batch, - state_batches=None, - prev_action_batch=None, - prev_reward_batch=None, - info_batch=None, - episodes=None, - **kwargs - ): - input_dict = {"obs": obs_batch} - if prev_action_batch is not None: - input_dict["prev_actions"] = prev_action_batch - if prev_reward_batch is not None: - input_dict["prev_rewards"] = prev_reward_batch - - return self.compute_actions_from_input_dict( - input_dict=input_dict, - episodes=episodes, - state_batches=state_batches, - ) - - @override(Policy) - def compute_actions_from_input_dict( - self, input_dict, explore=None, timestep=None, episodes=None, **kwargs - ): - with torch.no_grad(): - actions = [] - for i, episode in enumerate(episodes): - env_state = episode.user_data["current_state"][-1] - # create tree root node - obs = self.env.set_state(env_state) - tree_node = Node( - state=env_state, - obs=obs, - reward=0, - done=False, - action=None, - parent=RootParentNode(env=self.env), - mcts=self.mcts, - ) - - # run monte carlo simulations to compute the actions - # and record the tree - mcts_policy, action, tree_node = self.mcts.compute_action(tree_node) - - # record action - actions.append(action) - # store new node - episode.user_data["tree_node"] = tree_node - - # store mcts policies vectors and current tree root node - if episode.length == 0: - episode.user_data["mcts_policies"] = [mcts_policy] - else: - episode.user_data["mcts_policies"].append(mcts_policy) - break - return ( - np.array(actions), - [], - self.extra_action_out( - input_dict, kwargs.get("state_batches", []), self.model, None - ), - ) - - @override(Policy) - def postprocess_trajectory( - self, sample_batch, other_agent_batches=None, episode=None - ): - # add mcts policies to sample batch - sample_batch["mcts_policies"] = np.array(episode.user_data["mcts_policies"])[ - sample_batch["t"] - ] - # final episode reward corresponds to the 
value (if not discounted) - # for all transitions in episode - final_reward = sample_batch["rewards"][-1] - sample_batch["value_label"] = final_reward * np.ones_like(sample_batch["t"]) - return sample_batch - - @override(TorchPolicy) - def learn_on_batch(self, postprocessed_batch): - train_batch = self._lazy_tensor_dict(postprocessed_batch) - - loss_out, policy_loss, value_loss = self._loss( - self, self.model, self.dist_class, train_batch - ) - self._optimizers[0].zero_grad() - loss_out.backward() - - grad_process_info = self.extra_grad_process(self._optimizers[0], loss_out) - self._optimizers[0].step() - - grad_info = self.extra_grad_info(train_batch) - grad_info.update(grad_process_info) - grad_info.update( - { - "total_loss": loss_out.detach().cpu().numpy(), - "policy_loss": policy_loss.detach().cpu().numpy(), - "value_loss": value_loss.detach().cpu().numpy(), - } - ) - - return {LEARNER_STATS_KEY: grad_info} diff --git a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/mcts.py b/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/mcts.py deleted file mode 100644 index b0151f02a82d9..0000000000000 --- a/rllib_contrib/leela_chess_zero/src/rllib_leela_chess_zero/leela_chess_zero/mcts.py +++ /dev/null @@ -1,214 +0,0 @@ -""" -Mcts implementation modified from -https://github.com/brilee/python_uct/blob/master/numpy_impl.py -""" -import collections -import copy -import math - -import numpy as np - - -class Node: - def __init__( - self, action, obs, done, reward, state, mcts, parent=None, multi_agent=False - ): - self.env = parent.env - self.action = action # Action used to go to this state - - self.is_expanded = False - self.parent = parent - self.children = {} - - self.action_space_size = self.env.action_space.n - self.child_total_value = np.zeros( - [self.action_space_size], dtype=np.float32 - ) # Q - self.child_priors = np.zeros([self.action_space_size], dtype=np.float32) # P - self.child_number_visits = np.zeros( - [self.action_space_size], dtype=np.float32 - ) # N - - self.reward = reward - self.done = done - self.state = state - self.obs = obs - - current_agents = list(obs.keys()) - current_agent = current_agents[0] - for key in current_agents: - if "player_" in key: - multi_agent = True - if multi_agent: - current_agent = self.state.agent_selection - if type(self.reward) == dict: - self.reward = self.reward[current_agent] - if type(self.done) == dict: - self.done = self.done[current_agent] - if type(self.obs) == dict: - self.valid_actions = obs[current_agent]["action_mask"].astype(bool) - self.obs = obs[current_agent] - else: - self.valid_actions = obs["action_mask"].astype(bool) - self.obs = obs - - self.mcts = mcts - - self.multi_agent = multi_agent - - @property - def number_visits(self): - return self.parent.child_number_visits[self.action] - - @number_visits.setter - def number_visits(self, value): - self.parent.child_number_visits[self.action] = value - - @property - def total_value(self): - return self.parent.child_total_value[self.action] - - @total_value.setter - def total_value(self, value): - self.parent.child_total_value[self.action] = value - - def child_Q(self): - # TODO (weak todo) add "softmax" version of the Q-value - return self.child_total_value / (1 + self.child_number_visits) - - def child_U(self): - return ( - math.sqrt(self.number_visits) - * self.child_priors - / (1 + self.child_number_visits) - ) - - def best_action(self): - """ - :return: action - """ - child_score = self.child_Q() + self.mcts.c_puct * 
self.child_U() - masked_child_score = child_score - if self.mcts.exploit_child_value: - masked_child_score[~self.valid_actions] = -1e22 - action = np.argmax(masked_child_score) - assert self.valid_actions[action] == 1 - return action - else: - masked_child_score[~self.valid_actions] = 0 - masked_child_score[self.valid_actions] += 1 + abs( - np.min(masked_child_score) - ) - p = masked_child_score / np.sum(masked_child_score) - action = np.random.choice( - np.arange(len(masked_child_score)), - p=p, - ) - assert self.valid_actions[action] == 1 - return action - - def select(self): - current_node = self - while current_node.is_expanded: - best_action = current_node.best_action() - current_node = current_node.get_child(best_action) - return current_node - - def expand(self, child_priors): - self.is_expanded = True - self.total_value = 0 - self.parent.child_total_value[self.action] = 0 - self.child_priors = child_priors - - def get_child(self, action): - if action not in self.children: - self.env.set_state(self.state) - obs, reward, done, _, _ = self.env.step(action) - next_state = self.env.get_state() - self.children[action] = Node( - state=next_state, - action=action, - parent=self, - reward=reward, - done=done, - obs=obs, - mcts=self.mcts, - ) - return self.children[action] - - def backup(self, value): - current = self - - while current.parent is not None: - if self.mcts.turn_based_flip: - value = -value - current.number_visits += 1 - current.total_value += value - current = current.parent - - -class RootParentNode: - def __init__(self, env, state=None): - self.parent = None - self.child_total_value = collections.defaultdict(float) - self.child_number_visits = collections.defaultdict(float) - self.env = env - if state is None: - self.state = env.get_state() - else: - self.state = state - - -class MCTS: - def __init__(self, model, mcts_param): - self.model = model - self.temperature = mcts_param["temperature"] - self.dir_epsilon = mcts_param["dirichlet_epsilon"] - self.dir_noise = mcts_param["dirichlet_noise"] - self.num_sims = mcts_param["num_simulations"] - self.exploit = mcts_param["argmax_tree_policy"] - self.add_dirichlet_noise = mcts_param["add_dirichlet_noise"] - self.c_puct = mcts_param["puct_coefficient"] - self.epsilon = mcts_param["epsilon"] - self.turn_based_flip = mcts_param["turn_based_flip"] - self.exploit_child_value = mcts_param["argmax_child_value"] - - def compute_action(self, node): - initial_state = copy.deepcopy(node.state) - for _ in range(self.num_sims): - node.env.set_state(copy.deepcopy(initial_state)) - leaf = node.select() - if leaf.done: - value = -leaf.reward * 10 - else: - child_priors, value = self.model.compute_priors_and_value(leaf.obs) - if self.add_dirichlet_noise: - child_priors = (1 - self.dir_epsilon) * child_priors - child_priors += self.dir_epsilon * np.random.dirichlet( - [self.dir_noise] * child_priors.size - ) - - leaf.expand(child_priors) - leaf.backup(value) - - # Tree policy target (TPT) - tree_policy = node.child_number_visits / node.number_visits - tree_policy = tree_policy / np.max( - tree_policy - ) # to avoid overflows when computing softmax - tree_policy = np.power(tree_policy, self.temperature) - tree_policy *= node.valid_actions - tree_policy = tree_policy / np.sum(tree_policy) - epsilon_exploration = np.random.choice( - [True, False], p=[self.epsilon, 1 - self.epsilon] - ) - if self.exploit and not epsilon_exploration: - # if exploit then choose action that has the maximum - # tree policy probability - action = np.argmax(tree_policy) - 
else: - # otherwise sample an action according to tree policy probabilities - action = np.random.choice(np.arange(node.action_space_size), p=tree_policy) - assert node.valid_actions[action] == 1 - node.env.set_state(initial_state) - return tree_policy, action, node.children[action] diff --git a/rllib_contrib/leela_chess_zero/tests/test_leela_chess_zero.py b/rllib_contrib/leela_chess_zero/tests/test_leela_chess_zero.py deleted file mode 100644 index 67886acac20c1..0000000000000 --- a/rllib_contrib/leela_chess_zero/tests/test_leela_chess_zero.py +++ /dev/null @@ -1,52 +0,0 @@ -import unittest - -import rllib_leela_chess_zero.leela_chess_zero.leela_chess_zero as lz -from rllib_leela_chess_zero.leela_chess_zero.leela_chess_zero_model import ( - LeelaChessZeroModel, -) - -import ray -from ray.rllib.examples.env.pettingzoo_chess import MultiAgentChess -from ray.rllib.utils.test_utils import check_train_results, framework_iterator - - -class TestLeelaChessZero(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_leela_chess_zero_compilation(self): - """Test whether LeelaChessZero can be built with PyTorch frameworks.""" - config = ( - lz.LeelaChessZeroConfig() - .environment(env=MultiAgentChess) - .training( - sgd_minibatch_size=256, - train_batch_size=256, - num_sgd_iter=1, - model={"custom_model": LeelaChessZeroModel, "max_seq_len": 200}, - mcts_config={"num_simulations": 2}, - ) - .resources(num_gpus=0) - ) - num_iterations = 1 - # Only working for torch right now. - for _ in framework_iterator(config, frameworks="torch"): - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/leela_chess_zero/tuned_examples/__init__.py b/rllib_contrib/leela_chess_zero/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/leela_chess_zero/tuned_examples/multi-agent-leela-chess-zero.py b/rllib_contrib/leela_chess_zero/tuned_examples/multi-agent-leela-chess-zero.py deleted file mode 100644 index 13b85814c2f54..0000000000000 --- a/rllib_contrib/leela_chess_zero/tuned_examples/multi-agent-leela-chess-zero.py +++ /dev/null @@ -1,50 +0,0 @@ -from ray.rllib.algorithms.leela_chess_zero import LeelaChessZeroConfig -from ray.rllib.examples.env.pettingzoo_chess import MultiAgentChess -from ray.rllib.policy.policy import PolicySpec - -config = ( - LeelaChessZeroConfig() - .environment(MultiAgentChess) - .rollouts(num_rollout_workers=11) - .resources(num_gpus=1, num_cpus_per_worker=1, num_gpus_per_worker=0.0) - .framework("torch") - .multi_agent( - # 2 agents per env. - # p_0 represent a trainable policy that should get better with training - # p_1 represents a policy which avoids and/or pursues most trivial - # checkmates in 1, but doesn't learn. - policies={ - "p_0": PolicySpec( - config=LeelaChessZeroConfig.overrides( - mcts_config={ - "num_simulations": 20, - "turn_based_flip": True, - "argmax_tree_policy": True, - "argmax_child_value": True, - } - ) - ), - "p_1": PolicySpec( - config=LeelaChessZeroConfig.overrides( - mcts_config={ - "num_simulations": 3, - "epsilon": 1, - } - ) - ), - }, - # Train only the first policy. 
- policies_to_train=["p_0"], - policy_mapping_fn=( - lambda aid, eps, worker, **kw: "p_" - + str("0" if int(aid.split("_")[-1]) % 2 == 0 else "1") - ), - ) -) - -# this stops when -stop = { - "policy_reward_mean/p_0": 0.6, - "timesteps_total": 2000000, - "time_total_s": 180000, -} diff --git a/rllib_contrib/maddpg/BUILD b/rllib_contrib/maddpg/BUILD deleted file mode 100644 index bdbaff3656a97..0000000000000 --- a/rllib_contrib/maddpg/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -# Examples - -py_test( - name = "example_two_step_game_maddpg", - main = "two_step_game.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/two_step_game.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_two_step_game_maddpg", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "tf_only"], - size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/two-step-game-maddpg.yaml"], - args = ["--dir=maddpg/tuned_examples/", "--framework=tf"] -) - -# Compilation Tests - -py_test( - name = "test_maddpg", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_maddpg.py"] -) diff --git a/rllib_contrib/maddpg/README.md b/rllib_contrib/maddpg/README.md deleted file mode 100644 index 5f29291d3307a..0000000000000 --- a/rllib_contrib/maddpg/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# MADDPG (Multi-Agent Deep Deterministic Policy Gradient) - -[MADDPG](https://arxiv.org/abs/1706.02275) is a DDPG centralized/shared critic algorithm. Code here is adapted from https://github.com/openai/maddpg to integrate with RLlib multi-agent APIs. Please check justinkterry/maddpg-rllib for examples and more information. Note that the implementation here is based on OpenAI’s, and is intended for use with the discrete MPE environments. Please also note that people typically find this method difficult to get to work, even with all applicable optimizations for their environment applied. This method should be viewed as for research purposes, and for reproducing the results of the paper introducing it. 
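As a quick orientation for anyone reading this removal, below is a minimal sketch of how the deleted `rllib_maddpg` package was typically wired up. It is adapted from the two-step-game example and the compilation test elsewhere in this patch, not an addition to the original README; it assumes the package is installed as described under Installation, and the stopping point and hyperparameters are illustrative only.

```python
# Illustrative sketch only: names and values are taken from the examples/tests
# removed in this patch (examples/two_step_game.py, tests/test_maddpg.py).
from rllib_maddpg.maddpg import MADDPGConfig

from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.policy.policy import PolicySpec

config = (
    MADDPGConfig()
    # TwoStepGame can emit actions as logits, matching MADDPG's assumption
    # that discrete actions are sent pre-softmax.
    .environment(TwoStepGame, env_config={"actions_are_logits": True})
    .framework("tf")  # the removed policy is TF1-style, so tf only
    .training(num_steps_sampled_before_learning_starts=200)
    .multi_agent(
        # One policy per agent; MADDPG requires an integer `agent_id` so the
        # shared critic can index each agent's observations and actions.
        policies={
            "pol1": PolicySpec(config=MADDPGConfig.overrides(agent_id=0)),
            "pol2": PolicySpec(config=MADDPGConfig.overrides(agent_id=1)),
        },
        policy_mapping_fn=lambda agent_id, episode, worker, **kw: (
            "pol2" if agent_id else "pol1"
        ),
    )
)

algo = config.build()
print(algo.train())
algo.stop()
```

The integer `agent_id` matters because the centralized critic concatenates every agent's observation and action inputs and needs to know which slot belongs to the policy being trained.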
- - -## Installation - -``` -conda create -n rllib-maddpg python=3.10 -conda activate rllib-maddpg -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[MADDPG Example]() \ No newline at end of file diff --git a/rllib_contrib/maddpg/examples/two_step_game.py b/rllib_contrib/maddpg/examples/two_step_game.py deleted file mode 100644 index 22e1f4020de0b..0000000000000 --- a/rllib_contrib/maddpg/examples/two_step_game.py +++ /dev/null @@ -1,124 +0,0 @@ -# The two-step game from QMIX: https://arxiv.org/pdf/1803.11485.pdf - -import argparse -import logging - -from gymnasium.spaces import Dict, Discrete, MultiDiscrete, Tuple -from rllib_maddpg.maddpg import MADDPG, MADDPGConfig - -import ray -from ray import air, tune -from ray.rllib.env.multi_agent_env import ENV_STATE -from ray.rllib.examples.env.two_step_game import TwoStepGame -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune import register_env - -logger = logging.getLogger(__name__) - -parser = argparse.ArgumentParser() -parser.add_argument( - "--mixer", - type=str, - default="qmix", - choices=["qmix", "vdn", "none"], - help="The mixer model to use.", -) -parser.add_argument( - "--run-as-test", - action="store_true", -) - -parser.add_argument( - "--stop-timesteps", type=int, default=20000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=7.2, help="Reward at which we stop training." -) - - -if __name__ == "__main__": - args = parser.parse_args() - - ray.init() - - grouping = { - "group_1": [0, 1], - } - obs_space = Tuple( - [ - Dict( - { - "obs": MultiDiscrete([2, 2, 2, 3]), - ENV_STATE: MultiDiscrete([2, 2, 2]), - } - ), - Dict( - { - "obs": MultiDiscrete([2, 2, 2, 3]), - ENV_STATE: MultiDiscrete([2, 2, 2]), - } - ), - ] - ) - act_space = Tuple( - [ - TwoStepGame.action_space, - TwoStepGame.action_space, - ] - ) - register_env( - "grouped_twostep", - lambda config: TwoStepGame(config).with_agent_groups( - grouping, obs_space=obs_space, act_space=act_space - ), - ) - - config = ( - MADDPGConfig() - .environment(TwoStepGame) - .framework("torch") - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- .resources() - ) - - obs_space = Discrete(6) - act_space = TwoStepGame.action_space - ( - config.framework("tf") - .environment(env_config={"actions_are_logits": True}) - .training(num_steps_sampled_before_learning_starts=200) - .multi_agent( - policies={ - "pol1": PolicySpec( - observation_space=obs_space, - action_space=act_space, - config=config.overrides(agent_id=0), - ), - "pol2": PolicySpec( - observation_space=obs_space, - action_space=act_space, - config=config.overrides(agent_id=1), - ), - }, - policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "pol2" - if agent_id - else "pol1", - ) - ) - - stop = { - "episode_reward_mean": args.stop_reward, - "timesteps_total": args.stop_timesteps, - } - - results = tune.Tuner( - MADDPG, - run_config=air.RunConfig(stop=stop, verbose=2), - param_space=config, - ).fit() - - if args.run_as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() diff --git a/rllib_contrib/maddpg/pyproject.toml b/rllib_contrib/maddpg/pyproject.toml deleted file mode 100644 index b67761f6aeedc..0000000000000 --- a/rllib_contrib/maddpg/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-maddpg" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium==0.26.3", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "numpy<2"] diff --git a/rllib_contrib/maddpg/requirements.txt b/rllib_contrib/maddpg/requirements.txt deleted file mode 100644 index ea076922983bc..0000000000000 --- a/rllib_contrib/maddpg/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -numpy<2 diff --git a/rllib_contrib/maddpg/src/rllib_maddpg/maddpg/__init__.py b/rllib_contrib/maddpg/src/rllib_maddpg/maddpg/__init__.py deleted file mode 100644 index 13bdae6ee0e57..0000000000000 --- a/rllib_contrib/maddpg/src/rllib_maddpg/maddpg/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from rllib_maddpg.maddpg.maddpg import MADDPG, MADDPGConfig - -from ray.tune.registry import register_trainable - -__all__ = ["MADDPGConfig", "MADDPG"] - -register_trainable("rllib-contrib-maddpg", MADDPG) diff --git a/rllib_contrib/maddpg/src/rllib_maddpg/maddpg/maddpg.py b/rllib_contrib/maddpg/src/rllib_maddpg/maddpg/maddpg.py deleted file mode 100644 index 4aa3eb7c9571d..0000000000000 --- a/rllib_contrib/maddpg/src/rllib_maddpg/maddpg/maddpg.py +++ /dev/null @@ -1,322 +0,0 @@ -"""Contributed port of MADDPG from OpenAI baselines. - -The implementation has a couple assumptions: -- The number of agents is fixed and known upfront. -- Each agent is bound to a policy of the same name. -- Discrete actions are sent as logits (pre-softmax). - -For a minimal example, see rllib/examples/two_step_game.py, -and the README for how to run with the multi-agent particle envs. 
-""" - -import logging -from typing import List, Optional, Type - -from rllib_maddpg.maddpg.maddpg_tf_policy import MADDPGTFPolicy - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.dqn.dqn import DQN -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class MADDPGConfig(AlgorithmConfig): - """Defines a configuration class from which a MADDPG Algorithm can be built. - - Example: - >>> from ray.rllib.algorithms.maddpg.maddpg import MADDPGConfig - >>> config = MADDPGConfig() - >>> print(config.replay_buffer_config) # doctest: +SKIP - >>> replay_config = config.replay_buffer_config.update( # doctest: +SKIP - ... { - ... "capacity": 100000, - ... "prioritized_replay_alpha": 0.8, - ... "prioritized_replay_beta": 0.45, - ... "prioritized_replay_eps": 2e-6, - ... } - ... ) - >>> config.training(replay_buffer_config=replay_config) # doctest: +SKIP - >>> config = config.resources(num_gpus=0) # doctest: +SKIP - >>> config = config.rollouts(num_rollout_workers=4) # doctest: +SKIP - >>> config = config.environment("CartPole-v1") # doctest: +SKIP - >>> algo = config.build() # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.maddpg.maddpg import MADDPGConfig - >>> from ray import air - >>> from ray import tune - >>> config = MADDPGConfig() - >>> config.training(n_step=tune.grid_search([3, 5])) # doctest: +SKIP - >>> config.environment(env="CartPole-v1") # doctest: +SKIP - >>> tune.Tuner( # doctest: +SKIP - ... "MADDPG", - ... run_config=air.RunConfig(stop={"episode_reward_mean":200}), - ... param_space=config.to_dict() - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a DQNConfig instance.""" - super().__init__(algo_class=algo_class or MADDPG) - - # fmt: off - # __sphinx_doc_begin__ - # MADDPG specific config settings: - self.agent_id = None - self.use_local_critic = False - self.use_state_preprocessor = False - self.actor_hiddens = [64, 64] - self.actor_hidden_activation = "relu" - self.critic_hiddens = [64, 64] - self.critic_hidden_activation = "relu" - self.n_step = 1 - self.good_policy = "maddpg" - self.adv_policy = "maddpg" - self.replay_buffer_config = { - "type": "MultiAgentReplayBuffer", - # Specify prioritized replay by supplying a buffer type that supports - # prioritization, for example: MultiAgentPrioritizedReplayBuffer. - "prioritized_replay": DEPRECATED_VALUE, - "capacity": int(1e6), - # Force lockstep replay mode for MADDPG. - "replay_mode": "lockstep", - } - self.training_intensity = None - self.num_steps_sampled_before_learning_starts = 1024 * 25 - self.critic_lr = 1e-2 - self.actor_lr = 1e-2 - self.target_network_update_freq = 0 - self.tau = 0.01 - self.actor_feature_reg = 0.001 - self.grad_norm_clipping = 0.5 - - # Changes to Algorithm's default: - self.rollout_fragment_length = 100 - self.train_batch_size = 1024 - self.num_rollout_workers = 1 - self.min_time_s_per_iteration = 0 - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). 
- "type": "StochasticSampling", - # Add constructor kwargs here (if any). - } - # fmt: on - # __sphinx_doc_end__ - - @override(AlgorithmConfig) - def training( - self, - *, - agent_id: Optional[str] = NotProvided, - use_local_critic: Optional[bool] = NotProvided, - use_state_preprocessor: Optional[bool] = NotProvided, - actor_hiddens: Optional[List[int]] = NotProvided, - actor_hidden_activation: Optional[str] = NotProvided, - critic_hiddens: Optional[List[int]] = NotProvided, - critic_hidden_activation: Optional[str] = NotProvided, - n_step: Optional[int] = NotProvided, - good_policy: Optional[str] = NotProvided, - adv_policy: Optional[str] = NotProvided, - replay_buffer_config: Optional[dict] = NotProvided, - training_intensity: Optional[float] = NotProvided, - num_steps_sampled_before_learning_starts: Optional[int] = NotProvided, - critic_lr: Optional[float] = NotProvided, - actor_lr: Optional[float] = NotProvided, - target_network_update_freq: Optional[int] = NotProvided, - tau: Optional[float] = NotProvided, - actor_feature_reg: Optional[float] = NotProvided, - grad_norm_clipping: Optional[float] = NotProvided, - **kwargs, - ) -> "MADDPGConfig": - """Sets the training related configuration. - - Args: - agent_id: ID of the agent controlled by this policy. - use_local_critic: Use a local critic for this policy. - use_state_preprocessor: Apply a state preprocessor with spec given by the - "model" config option (like other RL algorithms). This is mostly useful - if you have a weird observation shape, like an image. Disabled by - default. - actor_hiddens: Postprocess the policy network model output with these hidden - layers. If `use_state_preprocessor` is False, then these will be the - *only* hidden layers in the network. - actor_hidden_activation: Hidden layers activation of the postprocessing - stage of the policy network. - critic_hiddens: Postprocess the critic network model output with these - hidden layers; again, if use_state_preprocessor is True, then the state - will be preprocessed by the model specified with the "model" config - option first. - critic_hidden_activation: Hidden layers activation of the postprocessing - state of the critic. - n_step: N-step for Q-learning. - good_policy: Algorithm for good policies. - adv_policy: Algorithm for adversary policies. - replay_buffer_config: Replay buffer config. - Examples: - { - "_enable_replay_buffer_api": True, - "type": "MultiAgentReplayBuffer", - "capacity": 50000, - "replay_sequence_length": 1, - } - - OR - - { - "_enable_replay_buffer_api": True, - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 50000, - "prioritized_replay_alpha": 0.6, - "prioritized_replay_beta": 0.4, - "prioritized_replay_eps": 1e-6, - "replay_sequence_length": 1, - } - - Where - - prioritized_replay_alpha: Alpha parameter controls the degree of - prioritization in the buffer. In other words, when a buffer sample has - a higher temporal-difference error, with how much more probability - should it drawn to use to update the parametrized Q-network. 0.0 - corresponds to uniform probability. Setting much above 1.0 may quickly - result as the sampling distribution could become heavily “pointy” with - low entropy. - prioritized_replay_beta: Beta parameter controls the degree of - importance sampling which suppresses the influence of gradient updates - from samples that have higher probability of being sampled via alpha - parameter and the temporal-difference error. 
- prioritized_replay_eps: Epsilon parameter sets the baseline probability - for sampling so that when the temporal-difference error of a sample is - zero, there is still a chance of drawing the sample. - training_intensity: If set, this will fix the ratio of replayed from a - buffer and learned on timesteps to sampled from an environment and - stored in the replay buffer timesteps. Otherwise, the replay will - proceed at the native ratio determined by - `(train_batch_size / rollout_fragment_length)`. - num_steps_sampled_before_learning_starts: Number of timesteps to collect - from rollout workers before we start sampling from replay buffers for - learning. Whether we count this in agent steps or environment steps - depends on config.multi_agent(count_steps_by=..). - critic_lr: Learning rate for the critic (Q-function) optimizer. - actor_lr: Learning rate for the actor (policy) optimizer. - target_network_update_freq: Update the target network every - `target_network_update_freq` sample steps. - tau: Update the target by \tau * policy + (1-\tau) * target_policy. - actor_feature_reg: Weights for feature regularization for the actor. - grad_norm_clipping: If not None, clip gradients during optimization at this - value. - - Returns: - This updated AlgorithmConfig object. - """ - - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if agent_id is not NotProvided: - self.agent_id = agent_id - if use_local_critic is not NotProvided: - self.use_local_critic = use_local_critic - if use_state_preprocessor is not NotProvided: - self.use_state_preprocessor = use_state_preprocessor - if actor_hiddens is not NotProvided: - self.actor_hiddens = actor_hiddens - if actor_hidden_activation is not NotProvided: - self.actor_hidden_activation = actor_hidden_activation - if critic_hiddens is not NotProvided: - self.critic_hiddens = critic_hiddens - if critic_hidden_activation is not NotProvided: - self.critic_hidden_activation = critic_hidden_activation - if n_step is not NotProvided: - self.n_step = n_step - if good_policy is not NotProvided: - self.good_policy = good_policy - if adv_policy is not NotProvided: - self.adv_policy = adv_policy - if replay_buffer_config is not NotProvided: - self.replay_buffer_config = replay_buffer_config - if training_intensity is not NotProvided: - self.training_intensity = training_intensity - if num_steps_sampled_before_learning_starts is not NotProvided: - self.num_steps_sampled_before_learning_starts = ( - num_steps_sampled_before_learning_starts - ) - if critic_lr is not NotProvided: - self.critic_lr = critic_lr - if actor_lr is not NotProvided: - self.actor_lr = actor_lr - if target_network_update_freq is not NotProvided: - self.target_network_update_freq = target_network_update_freq - if tau is not NotProvided: - self.tau = tau - if actor_feature_reg is not NotProvided: - self.actor_feature_reg = actor_feature_reg - if grad_norm_clipping is not NotProvided: - self.grad_norm_clipping = grad_norm_clipping - - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - """Adds the `before_learn_on_batch` hook to the config. - - This hook is called explicitly prior to `train_one_step()` in the - `training_step()` methods of DQN and APEX. - """ - # Call super's validation method. 
- super().validate() - - def f(batch, workers, config): - policies = dict( - workers.local_worker().foreach_policy_to_train(lambda p, i: (i, p)) - ) - return before_learn_on_batch(batch, policies, config["train_batch_size"]) - - self.before_learn_on_batch = f - - -def before_learn_on_batch(multi_agent_batch, policies, train_batch_size): - samples = {} - - # Modify keys. - for pid, p in policies.items(): - i = p.config["agent_id"] - keys = multi_agent_batch.policy_batches[pid].keys() - keys = ["_".join([k, str(i)]) for k in keys] - samples.update(dict(zip(keys, multi_agent_batch.policy_batches[pid].values()))) - - # Make ops and feed_dict to get "new_obs" from target action sampler. - new_obs_ph_n = [p.new_obs_ph for p in policies.values()] - new_obs_n = list() - for k, v in samples.items(): - if "new_obs" in k: - new_obs_n.append(v) - - for i, p in enumerate(policies.values()): - feed_dict = {new_obs_ph_n[i]: new_obs_n[i]} - new_act = p.get_session().run(p.target_act_sampler, feed_dict) - samples.update({"new_actions_%d" % i: new_act}) - - # Share samples among agents. - policy_batches = {pid: SampleBatch(samples) for pid in policies.keys()} - return MultiAgentBatch(policy_batches, train_batch_size) - - -class MADDPG(DQN): - @classmethod - @override(DQN) - def get_default_config(cls) -> AlgorithmConfig: - return MADDPGConfig() - - @classmethod - @override(DQN) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - return MADDPGTFPolicy diff --git a/rllib_contrib/maddpg/src/rllib_maddpg/maddpg/maddpg_tf_policy.py b/rllib_contrib/maddpg/src/rllib_maddpg/maddpg/maddpg_tf_policy.py deleted file mode 100644 index ccec726721a2b..0000000000000 --- a/rllib_contrib/maddpg/src/rllib_maddpg/maddpg/maddpg_tf_policy.py +++ /dev/null @@ -1,400 +0,0 @@ -import logging - -import numpy as np -from gymnasium.spaces import Box, Discrete - -from ray.rllib.algorithms.dqn.dqn_tf_policy import minimize_and_clip -from ray.rllib.evaluation.postprocessing import adjust_nstep -from ray.rllib.models import ModelCatalog -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_policy import TFPolicy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.utils.framework import try_import_tf, try_import_tfp -from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY - -logger = logging.getLogger(__name__) - -tf1, tf, tfv = try_import_tf() -tfp = try_import_tfp() - - -class MADDPGPostprocessing: - """Implements agentwise termination signal and n-step learning.""" - - @override(Policy) - def postprocess_trajectory( - self, sample_batch, other_agent_batches=None, episode=None - ): - # FIXME: Get done from info is required since agentwise done is not - # supported now. - sample_batch[SampleBatch.TERMINATEDS] = self.get_done_from_info( - sample_batch[SampleBatch.INFOS] - ) - - # N-step Q adjustments - if self.config["n_step"] > 1: - adjust_nstep(self.config["n_step"], self.config["gamma"], sample_batch) - - return sample_batch - - -class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy): - def __init__(self, obs_space, act_space, config): - # _____ Initial Configuration - self.config = config - self.global_step = tf1.train.get_or_create_global_step() - - # FIXME: Get done from info is required since agentwise done is not - # supported now. 
- self.get_done_from_info = np.vectorize(lambda info: info.get("done", False)) - - agent_id = config["agent_id"] - if agent_id is None: - raise ValueError("Must set `agent_id` in the policy config.") - if type(agent_id) is not int: - raise ValueError("Agent ids must be integers for MADDPG.") - - # _____ Environment Setting - def _make_continuous_space(space): - if isinstance(space, Box): - return space - elif isinstance(space, Discrete): - return Box(low=np.zeros((space.n,)), high=np.ones((space.n,))) - else: - raise UnsupportedSpaceException( - "Space {} is not supported.".format(space) - ) - - from ray.rllib.algorithms.maddpg.maddpg import MADDPGConfig - - policies, _ = ( - MADDPGConfig.from_dict(config) - .environment(observation_space=obs_space, action_space=act_space) - .get_multi_agent_setup() - ) - obs_space_n = [ - _make_continuous_space(spec.observation_space) - for _, spec in policies.items() - ] - act_space_n = [ - _make_continuous_space(spec.action_space) for _, spec in policies.items() - ] - - # _____ Placeholders - # Placeholders for policy evaluation and updates - def _make_ph_n(space_n, name=""): - return [ - tf1.placeholder( - tf.float32, shape=(None,) + space.shape, name=name + "_%d" % i - ) - for i, space in enumerate(space_n) - ] - - obs_ph_n = _make_ph_n(obs_space_n, SampleBatch.OBS) - act_ph_n = _make_ph_n(act_space_n, SampleBatch.ACTIONS) - new_obs_ph_n = _make_ph_n(obs_space_n, SampleBatch.NEXT_OBS) - new_act_ph_n = _make_ph_n(act_space_n, "new_actions") - rew_ph = tf1.placeholder( - tf.float32, shape=None, name="rewards_{}".format(agent_id) - ) - done_ph = tf1.placeholder( - tf.float32, shape=None, name="terminateds_{}".format(agent_id) - ) - - if config["use_local_critic"]: - obs_space_n, act_space_n = [obs_space_n[agent_id]], [act_space_n[agent_id]] - obs_ph_n, act_ph_n = [obs_ph_n[agent_id]], [act_ph_n[agent_id]] - new_obs_ph_n, new_act_ph_n = [new_obs_ph_n[agent_id]], [ - new_act_ph_n[agent_id] - ] - agent_id = 0 - - # _____ Value Network - # Build critic network for t. - critic, _, critic_model_n, critic_vars = self._build_critic_network( - obs_ph_n, - act_ph_n, - obs_space_n, - act_space_n, - config["use_state_preprocessor"], - config["critic_hiddens"], - getattr(tf.nn, config["critic_hidden_activation"]), - scope="critic", - ) - - # Build critic network for t + 1. - target_critic, _, _, target_critic_vars = self._build_critic_network( - new_obs_ph_n, - new_act_ph_n, - obs_space_n, - act_space_n, - config["use_state_preprocessor"], - config["critic_hiddens"], - getattr(tf.nn, config["critic_hidden_activation"]), - scope="target_critic", - ) - - # Build critic loss. - td_error = tf.subtract( - tf.stop_gradient( - rew_ph - + (1.0 - done_ph) - * (config["gamma"] ** config["n_step"]) - * target_critic[:, 0] - ), - critic[:, 0], - ) - critic_loss = tf.reduce_mean(td_error**2) - - # _____ Policy Network - # Build actor network for t. - act_sampler, actor_feature, actor_model, actor_vars = self._build_actor_network( - obs_ph_n[agent_id], - obs_space_n[agent_id], - act_space_n[agent_id], - config["use_state_preprocessor"], - config["actor_hiddens"], - getattr(tf.nn, config["actor_hidden_activation"]), - scope="actor", - ) - - # Build actor network for t + 1. 
- self.new_obs_ph = new_obs_ph_n[agent_id] - self.target_act_sampler, _, _, target_actor_vars = self._build_actor_network( - self.new_obs_ph, - obs_space_n[agent_id], - act_space_n[agent_id], - config["use_state_preprocessor"], - config["actor_hiddens"], - getattr(tf.nn, config["actor_hidden_activation"]), - scope="target_actor", - ) - - # Build actor loss. - act_n = act_ph_n.copy() - act_n[agent_id] = act_sampler - critic, _, _, _ = self._build_critic_network( - obs_ph_n, - act_n, - obs_space_n, - act_space_n, - config["use_state_preprocessor"], - config["critic_hiddens"], - getattr(tf.nn, config["critic_hidden_activation"]), - scope="critic", - ) - actor_loss = -tf.reduce_mean(critic) - if config["actor_feature_reg"] is not None: - actor_loss += config["actor_feature_reg"] * tf.reduce_mean( - actor_feature**2 - ) - - # _____ Losses - self.losses = {"critic": critic_loss, "actor": actor_loss} - - # _____ Optimizers - self.optimizers = { - "critic": tf1.train.AdamOptimizer(config["critic_lr"]), - "actor": tf1.train.AdamOptimizer(config["actor_lr"]), - } - - # _____ Build variable update ops. - self.tau = tf1.placeholder_with_default(config["tau"], shape=(), name="tau") - - def _make_target_update_op(vs, target_vs, tau): - return [ - target_v.assign(tau * v + (1.0 - tau) * target_v) - for v, target_v in zip(vs, target_vs) - ] - - self.update_target_vars = _make_target_update_op( - critic_vars + actor_vars, target_critic_vars + target_actor_vars, self.tau - ) - - def _make_set_weight_op(variables): - vs = list() - for v in variables.values(): - vs += v - phs = [ - tf1.placeholder( - tf.float32, shape=v.get_shape(), name=v.name.split(":")[0] + "_ph" - ) - for v in vs - ] - return tf.group(*[v.assign(ph) for v, ph in zip(vs, phs)]), phs - - self.vars = { - "critic": critic_vars, - "actor": actor_vars, - "target_critic": target_critic_vars, - "target_actor": target_actor_vars, - } - self.update_vars, self.vars_ph = _make_set_weight_op(self.vars) - - # _____ TensorFlow Initialization - - sess = tf1.get_default_session() - assert sess - - def _make_loss_inputs(placeholders): - return [(ph.name.split("/")[-1].split(":")[0], ph) for ph in placeholders] - - loss_inputs = _make_loss_inputs( - obs_ph_n + act_ph_n + new_obs_ph_n + new_act_ph_n + [rew_ph, done_ph] - ) - - TFPolicy.__init__( - self, - obs_space, - act_space, - config=config, - sess=sess, - obs_input=obs_ph_n[agent_id], - sampled_action=act_sampler, - loss=actor_loss + critic_loss, - loss_inputs=loss_inputs, - dist_inputs=actor_feature, - ) - - del self.view_requirements["prev_actions"] - del self.view_requirements["prev_rewards"] - - self.get_session().run(tf1.global_variables_initializer()) - - # Hard initial update - self.update_target(1.0) - - @override(TFPolicy) - def optimizer(self): - return None - - @override(TFPolicy) - def gradients(self, optimizer, loss): - self.gvs = { - k: minimize_and_clip( - optimizer, - self.losses[k], - self.vars[k], - self.config["grad_norm_clipping"], - ) - for k, optimizer in self.optimizers.items() - } - return self.gvs["critic"] + self.gvs["actor"] - - @override(TFPolicy) - def build_apply_op(self, optimizer, grads_and_vars): - critic_apply_op = self.optimizers["critic"].apply_gradients(self.gvs["critic"]) - - with tf1.control_dependencies([tf1.assign_add(self.global_step, 1)]): - with tf1.control_dependencies([critic_apply_op]): - actor_apply_op = self.optimizers["actor"].apply_gradients( - self.gvs["actor"] - ) - - return actor_apply_op - - @override(TFPolicy) - def 
extra_compute_action_feed_dict(self): - return {} - - @override(TFPolicy) - def extra_compute_grad_fetches(self): - return {LEARNER_STATS_KEY: {}} - - @override(TFPolicy) - def get_weights(self): - var_list = [] - for var in self.vars.values(): - var_list += var - return {"_state": self.get_session().run(var_list)} - - @override(TFPolicy) - def set_weights(self, weights): - self.get_session().run( - self.update_vars, feed_dict=dict(zip(self.vars_ph, weights["_state"])) - ) - - @override(Policy) - def get_state(self): - return TFPolicy.get_state(self) - - @override(Policy) - def set_state(self, state): - TFPolicy.set_state(self, state) - - def _build_critic_network( - self, - obs_n, - act_n, - obs_space_n, - act_space_n, - use_state_preprocessor, - hiddens, - activation=None, - scope=None, - ): - with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope: - if use_state_preprocessor: - model_n = [ - ModelCatalog.get_model_v2( - obs_space, - act_space, - 1, - self.config["model"], - ) - for obs, obs_space, act_space in zip( - obs_n, obs_space_n, act_space_n - ) - ] - out_n = [model.last_layer for model in model_n] - out = tf.concat(out_n + act_n, axis=1) - else: - model_n = [None] * len(obs_n) - out = tf.concat(obs_n + act_n, axis=1) - - for hidden in hiddens: - out = tf1.layers.dense(out, units=hidden, activation=activation) - feature = out - out = tf1.layers.dense(feature, units=1, activation=None) - - return out, feature, model_n, tf1.global_variables(scope.name) - - def _build_actor_network( - self, - obs, - obs_space, - act_space, - use_state_preprocessor, - hiddens, - activation=None, - scope=None, - ): - with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope: - if use_state_preprocessor: - model = ModelCatalog.get_model_v2( - obs_space, - act_space, - 1, - self.config["model"], - ) - out = model.last_layer - else: - model = None - out = obs - - for hidden in hiddens: - out = tf1.layers.dense(out, units=hidden, activation=activation) - feature = tf1.layers.dense(out, units=act_space.shape[0], activation=None) - sampler = tfp.distributions.RelaxedOneHotCategorical( - temperature=1.0, logits=feature - ).sample() - - return sampler, feature, model, tf1.global_variables(scope.name) - - def update_target(self, tau=None): - if tau is not None: - self.get_session().run(self.update_target_vars, {self.tau: tau}) - else: - self.get_session().run(self.update_target_vars) diff --git a/rllib_contrib/maddpg/tests/test_maddpg.py b/rllib_contrib/maddpg/tests/test_maddpg.py deleted file mode 100644 index c8e393c35f8ce..0000000000000 --- a/rllib_contrib/maddpg/tests/test_maddpg.py +++ /dev/null @@ -1,62 +0,0 @@ -import unittest - -import rllib_maddpg.maddpg.maddpg as maddpg - -import ray -from ray.rllib.examples.env.two_step_game import TwoStepGame -from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.test_utils import check_train_results, framework_iterator - - -class TestMADDPG(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_maddpg_compilation(self): - """Test whether MADDPG can be built with all frameworks.""" - config = ( - maddpg.MADDPGConfig() - .environment( - env=TwoStepGame, - env_config={ - "actions_are_logits": True, - }, - ) - .multi_agent( - policies={ - "pol1": PolicySpec( - config=maddpg.MADDPGConfig.overrides(agent_id=0), - ), - "pol2": PolicySpec( - config=maddpg.MADDPGConfig.overrides(agent_id=1), - ), - }, - policy_mapping_fn=lambda agent_id, 
episode, worker, **kwargs: "pol2" - if agent_id - else "pol1", - ) - ) - - num_iterations = 1 - - # Only working for tf right now. - for _ in framework_iterator(config, frameworks="tf"): - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/maddpg/tuned_examples/__init__.py b/rllib_contrib/maddpg/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/maddpg/tuned_examples/two-step-game-maddpg.yaml b/rllib_contrib/maddpg/tuned_examples/two-step-game-maddpg.yaml deleted file mode 100644 index ce63663d0ef2f..0000000000000 --- a/rllib_contrib/maddpg/tuned_examples/two-step-game-maddpg.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -two-step-game-maddpg: - env: ray.rllib.examples.env.two_step_game.TwoStepGame - run: MADDPG - stop: - sampler_results/episode_reward_mean: 7.2 - timesteps_total: 20000 - config: - # MADDPG only supports tf for now. - framework: torch - - env_config: - env_config: - actions_are_logits: true - - num_steps_sampled_before_learning_starts: 200 - - multiagent: - policies: - p0: - - null - - null - - null - - { - agent_id: 0 - } - p1: - - null - - null - - null - - { - agent_id: 1 - } - # YAML-capable policy_mapping_fn definition via providing a callable class here. - policy_mapping_fn: - type: ray.rllib.examples.multi_agent_and_self_play.policy_mapping_fn.PolicyMappingFn diff --git a/rllib_contrib/maml/BUILD b/rllib_contrib/maml/BUILD deleted file mode 100644 index 08d7109ad49a4..0000000000000 --- a/rllib_contrib/maml/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -# Examples - -py_test( - name = "example_cartpole_mass_maml", - main = "cartpole_mass_maml.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/cartpole_mass_maml.py"], - args = ["--run-as-test"] -) - -# Learning Tests - - -# Compilation Tests - -py_test( - name = "test_maml", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_maml.py"] -) diff --git a/rllib_contrib/maml/README.md b/rllib_contrib/maml/README.md deleted file mode 100644 index 694ef0fcb502b..0000000000000 --- a/rllib_contrib/maml/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# MAML (Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks) - -[MAML](https://arxiv.org/abs/1703.03400) is an on-policy meta RL algorithm. Unlike standard RL algorithms, which aim to maximize the sum of rewards into the future for a single task (e.g. HalfCheetah), meta RL algorithms seek to maximize the sum of rewards for *a given distribution of tasks*. - -On a high level, MAML seeks to learn quick adaptation across different tasks (e.g. different velocities for HalfCheetah). Quick adaptation is defined by the number of gradient steps it takes to adapt. MAML aims to maximize the RL objective for each task after `X` gradient steps. Doing this requires partitioning the algorithm into two steps. The first step is data collection. This involves collecting data for each task for each step of adaptation (from `1, 2, ..., X`). The second step is the meta-update step. This second step takes all the aggregated ddata from the first step and computes the meta-gradient. 
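To make the two phases above concrete, here is a minimal, illustrative sketch of how they map onto the configuration knobs removed in this patch: `inner_adaptation_steps` sets how many inner-loop adaptation steps of data are collected per task, `maml_optimizer_steps` sets how many meta-update (PPO) steps run on the aggregated data, and `inner_lr` is the inner-adaptation step size. This is not part of the original README; the values mirror the deleted `cartpole_mass_maml.py` example and should be treated as a starting point rather than tuned settings.

```python
# Illustrative sketch only: mirrors the cartpole_mass_maml.py example removed
# in this patch.
from gymnasium.wrappers import TimeLimit
from rllib_maml.envs import CartPoleMassEnv
from rllib_maml.maml import MAMLConfig

from ray.tune.registry import register_env

# A TaskSettableEnv: each sampled task re-weights the cart and pole masses.
register_env(
    "cartpole",
    lambda env_cfg: TimeLimit(CartPoleMassEnv(), max_episode_steps=200),
)

config = (
    MAMLConfig()
    .environment("cartpole", clip_actions=False)
    .framework("torch")
    .rollouts(num_rollout_workers=4, rollout_fragment_length=32)
    .training(
        inner_adaptation_steps=1,  # X: gradient steps of adaptation per task
        maml_optimizer_steps=5,    # meta-update (PPO) steps per iteration
        inner_lr=0.03,             # inner-adaptation step size
        train_batch_size=32,
    )
)

algo = config.build()
print(algo.train())
```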
- -Code here is adapted from https://github.com/jonasrothfuss, which outperforms vanilla MAML and avoids computation of the higher order gradients during the meta-update step. MAML is evaluated on custom environments that are described in greater detail here. - -MAML uses additional metrics to measure performance; episode_reward_mean measures the agent’s returns before adaptation, episode_reward_mean_adapt_N measures the agent’s returns after N gradient steps of inner adaptation, and adaptation_delta measures the difference in performance before and after adaptation. - - -## Installation - -``` -conda create -n rllib-maml python=3.10 -conda activate rllib-maml -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[MAML Example](examples/cartpole_mass_maml.py) \ No newline at end of file diff --git a/rllib_contrib/maml/examples/cartpole_mass_maml.py b/rllib_contrib/maml/examples/cartpole_mass_maml.py deleted file mode 100644 index 19584456d4119..0000000000000 --- a/rllib_contrib/maml/examples/cartpole_mass_maml.py +++ /dev/null @@ -1,52 +0,0 @@ -from gymnasium.wrappers import TimeLimit -from rllib_maml.maml import MAML, MAMLConfig - -import ray -from ray import air, tune -from ray.rllib.examples.env.cartpole_mass import CartPoleMassEnv -from ray.tune.registry import register_env - -if __name__ == "__main__": - ray.init() - register_env( - "cartpole", - lambda env_cfg: TimeLimit(CartPoleMassEnv(), max_episode_steps=200), - ) - - rollout_fragment_length = 32 - - config = ( - MAMLConfig() - .rollouts( - num_rollout_workers=4, rollout_fragment_length=rollout_fragment_length - ) - .framework("torch") - .environment("cartpole", clip_actions=False) - .training( - inner_adaptation_steps=1, - maml_optimizer_steps=5, - gamma=0.99, - lambda_=1.0, - lr=0.001, - vf_loss_coeff=0.5, - inner_lr=0.03, - use_meta_env=False, - clip_param=0.3, - kl_target=0.01, - kl_coeff=0.001, - model=dict(fcnet_hiddens=[64, 64]), - train_batch_size=rollout_fragment_length, - ) - ) - - num_iterations = 5 - - tuner = tune.Tuner( - MAML, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={"training_iteration": num_iterations}, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() diff --git a/rllib_contrib/maml/pyproject.toml b/rllib_contrib/maml/pyproject.toml deleted file mode 100644 index 4ab4de55883bc..0000000000000 --- a/rllib_contrib/maml/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-maml" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium[mujoco]==0.26.3", "higher", "ray[rllib]==2.3.1"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/maml/requirements.txt b/rllib_contrib/maml/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/maml/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/maml/src/rllib_maml/__init__.py b/rllib_contrib/maml/src/rllib_maml/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/maml/src/rllib_maml/envs/__init__.py 
b/rllib_contrib/maml/src/rllib_maml/envs/__init__.py deleted file mode 100644 index 1796db67d13e0..0000000000000 --- a/rllib_contrib/maml/src/rllib_maml/envs/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright 2023-onwards Anyscale, Inc. The use of this library is subject to the -# included LICENSE file. -from rllib_maml.envs.ant_rand_goal import AntRandGoalEnv -from rllib_maml.envs.cartpole_mass import CartPoleMassEnv -from rllib_maml.envs.pendulum_mass import PendulumMassEnv - -__all__ = [ - "AntRandGoalEnv", - "CartPoleMassEnv", - "PendulumMassEnv", -] diff --git a/rllib_contrib/maml/src/rllib_maml/envs/ant_rand_goal.py b/rllib_contrib/maml/src/rllib_maml/envs/ant_rand_goal.py deleted file mode 100644 index 5dd2f3c8e0265..0000000000000 --- a/rllib_contrib/maml/src/rllib_maml/envs/ant_rand_goal.py +++ /dev/null @@ -1,86 +0,0 @@ -import numpy as np -from gymnasium.envs.mujoco.mujoco_env import MujocoEnv -from gymnasium.utils import EzPickle - -from ray.rllib.env.apis.task_settable_env import TaskSettableEnv - - -class AntRandGoalEnv(EzPickle, MujocoEnv, TaskSettableEnv): - """Ant Environment that randomizes goals as tasks - - Goals are randomly sampled 2D positions - """ - - def __init__(self): - self.set_task(self.sample_tasks(1)[0]) - MujocoEnv.__init__(self, "ant.xml", 5) - EzPickle.__init__(self) - - def sample_tasks(self, n_tasks): - # Samples a goal position (2x1 position ector) - a = np.random.random(n_tasks) * 2 * np.pi - r = 3 * np.random.random(n_tasks) ** 0.5 - return np.stack((r * np.cos(a), r * np.sin(a)), axis=-1) - - def set_task(self, task): - """ - Args: - task: task of the meta-learning environment - """ - self.goal_pos = task - - def get_task(self): - """ - Returns: - task: task of the meta-learning environment - """ - return self.goal_pos - - def step(self, a): - self.do_simulation(a, self.frame_skip) - xposafter = self.get_body_com("torso") - goal_reward = -np.sum( - np.abs(xposafter[:2] - self.goal_pos) - ) # make it happy, not suicidal - ctrl_cost = 0.1 * np.square(a).sum() - contact_cost = ( - 0.5 * 1e-3 * np.sum(np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) - ) - # survive_reward = 1.0 - survive_reward = 0.0 - reward = goal_reward - ctrl_cost - contact_cost + survive_reward - # notdone = np.isfinite(state).all() and 1.0 >= state[2] >= 0. 
- # done = not notdone - done = False - ob = self._get_obs() - return ( - ob, - reward, - done, - dict( - reward_forward=goal_reward, - reward_ctrl=-ctrl_cost, - reward_contact=-contact_cost, - reward_survive=survive_reward, - ), - ) - - def _get_obs(self): - return np.concatenate( - [ - self.sim.data.qpos.flat, - self.sim.data.qvel.flat, - np.clip(self.sim.data.cfrc_ext, -1, 1).flat, - ] - ) - - def reset_model(self): - qpos = self.init_qpos + self.np_random.uniform( - size=self.model.nq, low=-0.1, high=0.1 - ) - qvel = self.init_qvel + self.np_random.randn(self.model.nv) * 0.1 - self.set_state(qpos, qvel) - return self._get_obs() - - def viewer_setup(self): - self.viewer.cam.distance = self.model.stat.extent * 0.5 diff --git a/rllib_contrib/maml/src/rllib_maml/envs/cartpole_mass.py b/rllib_contrib/maml/src/rllib_maml/envs/cartpole_mass.py deleted file mode 100644 index bfd481402eb7c..0000000000000 --- a/rllib_contrib/maml/src/rllib_maml/envs/cartpole_mass.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy as np -from gymnasium.envs.classic_control.cartpole import CartPoleEnv -from gymnasium.utils import EzPickle - -from ray.rllib.env.apis.task_settable_env import TaskSettableEnv - - -class CartPoleMassEnv(CartPoleEnv, EzPickle, TaskSettableEnv): - """CartPoleMassEnv varies the weights of the cart and the pole.""" - - def sample_tasks(self, n_tasks): - # Sample new cart- and pole masses (random floats between 0.5 and 2.0 - # (cart) and between 0.05 and 0.2 (pole)). - cart_masses = np.random.uniform(low=0.5, high=2.0, size=(n_tasks, 1)) - pole_masses = np.random.uniform(low=0.05, high=0.2, size=(n_tasks, 1)) - return np.concatenate([cart_masses, pole_masses], axis=-1) - - def set_task(self, task): - """ - Args: - task (Tuple[float]): Masses of the cart and the pole. - """ - self.masscart = task[0] - self.masspole = task[1] - - def get_task(self): - """ - Returns: - Tuple[float]: The current mass of the cart- and pole. - """ - return np.array([self.masscart, self.masspole]) diff --git a/rllib_contrib/maml/src/rllib_maml/envs/pendulum_mass.py b/rllib_contrib/maml/src/rllib_maml/envs/pendulum_mass.py deleted file mode 100644 index 2b4abdf20107e..0000000000000 --- a/rllib_contrib/maml/src/rllib_maml/envs/pendulum_mass.py +++ /dev/null @@ -1,33 +0,0 @@ -import numpy as np -from gymnasium.envs.classic_control.pendulum import PendulumEnv -from gymnasium.utils import EzPickle - -from ray.rllib.env.apis.task_settable_env import TaskSettableEnv - - -class PendulumMassEnv(PendulumEnv, EzPickle, TaskSettableEnv): - """PendulumMassEnv varies the weight of the pendulum - - Tasks are defined to be weight uniformly sampled between [0.5,2] - """ - - def sample_tasks(self, n_tasks): - # Sample new pendulum masses (random floats between 0.5 and 2). - return np.random.uniform(low=0.5, high=2.0, size=(n_tasks,)) - - def set_task(self, task): - """ - Args: - task: Task of the meta-learning environment (here: mass of - the pendulum). - """ - # self.m is the mass property of the pendulum. - self.m = task - - def get_task(self): - """ - Returns: - float: The current mass of the pendulum (self.m in the PendulumEnv - object). - """ - return self.m diff --git a/rllib_contrib/maml/src/rllib_maml/maml/__init__.py b/rllib_contrib/maml/src/rllib_maml/maml/__init__.py deleted file mode 100644 index 1ec07956fabd3..0000000000000 --- a/rllib_contrib/maml/src/rllib_maml/maml/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright 2023-onwards Anyscale, Inc. The use of this library is subject to the -# included LICENSE file. 
-from rllib_maml.maml.maml import MAML, MAMLConfig - -from ray.tune.registry import register_trainable - -__all__ = [ - "MAML", - "MAMLConfig", -] - -register_trainable("rllib-contrib-maml", MAML) diff --git a/rllib_contrib/maml/src/rllib_maml/maml/maml.py b/rllib_contrib/maml/src/rllib_maml/maml/maml.py deleted file mode 100644 index e03a7ff3f6caf..0000000000000 --- a/rllib_contrib/maml/src/rllib_maml/maml/maml.py +++ /dev/null @@ -1,388 +0,0 @@ -import logging -from typing import Optional, Type - -import numpy as np - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.evaluation.metrics import collect_metrics, get_learner_stats -from ray.rllib.evaluation.worker_set import WorkerSet -from ray.rllib.execution.common import ( - STEPS_SAMPLED_COUNTER, - STEPS_TRAINED_COUNTER, - STEPS_TRAINED_THIS_ITER_COUNTER, - _get_shared_metrics, -) -from ray.rllib.execution.metric_ops import CollectMetrics -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import ( - concat_samples, - convert_ma_batch_to_sample_batch, -) -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO -from ray.rllib.utils.sgd import standardized -from ray.util.iter import LocalIterator, from_actors - -logger = logging.getLogger(__name__) - - -class MAMLConfig(AlgorithmConfig): - """Defines a configuration class from which a MAML Algorithm can be built. - - Example: - >>> from ray.rllib.algorithms.maml import MAMLConfig - >>> config = MAMLConfig().training(use_gae=False).resources(num_gpus=1) - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build an Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.maml import MAMLConfig - >>> from ray import air - >>> from ray import tune - >>> config = MAMLConfig() - >>> # Print out some default values. - >>> print(config.lr) # doctest: +SKIP - >>> # Update the config object. - >>> config = config.training( # doctest: +SKIP - ... grad_clip=tune.grid_search([10.0, 40.0])) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "MAML", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a MAMLConfig instance.""" - super().__init__(algo_class=algo_class or MAML) - - # fmt: off - # __sphinx_doc_begin__ - # MAML-specific config settings. - self.use_gae = True - self.lambda_ = 1.0 - self.kl_coeff = 0.0005 - self.vf_loss_coeff = 0.5 - self.entropy_coeff = 0.0 - self.clip_param = 0.3 - self.vf_clip_param = 10.0 - self.grad_clip = None - self.kl_target = 0.01 - self.inner_adaptation_steps = 1 - self.maml_optimizer_steps = 5 - self.inner_lr = 0.1 - self.use_meta_env = True - - # Override some of AlgorithmConfig's default values with MAML-specific values. - self.num_rollout_workers = 2 - self.rollout_fragment_length = 200 - self.create_env_on_local_worker = True - self.lr = 1e-3 - - # Share layers for value function.
- self.model.update({ - "vf_share_layers": False, - }) - - self.batch_mode = "complete_episodes" - self._disable_execution_plan_api = False - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). - } - # __sphinx_doc_end__ - # fmt: on - - # Deprecated keys: - self.vf_share_layers = DEPRECATED_VALUE - - def training( - self, - *, - use_gae: Optional[bool] = NotProvided, - lambda_: Optional[float] = NotProvided, - kl_coeff: Optional[float] = NotProvided, - vf_loss_coeff: Optional[float] = NotProvided, - entropy_coeff: Optional[float] = NotProvided, - clip_param: Optional[float] = NotProvided, - vf_clip_param: Optional[float] = NotProvided, - grad_clip: Optional[float] = NotProvided, - kl_target: Optional[float] = NotProvided, - inner_adaptation_steps: Optional[int] = NotProvided, - maml_optimizer_steps: Optional[int] = NotProvided, - inner_lr: Optional[float] = NotProvided, - use_meta_env: Optional[bool] = NotProvided, - **kwargs, - ) -> "MAMLConfig": - """Sets the training related configuration. - - Args: - use_gae: If true, use the Generalized Advantage Estimator (GAE) - with a value function, see https://arxiv.org/pdf/1506.02438.pdf. - lambda_: The GAE (lambda) parameter. - kl_coeff: Initial coefficient for KL divergence. - vf_loss_coeff: Coefficient of the value function loss. - entropy_coeff: Coefficient of the entropy regularizer. - clip_param: PPO clip parameter. - vf_clip_param: Clip param for the value function. Note that this is - sensitive to the scale of the rewards. If your expected V is large, - increase this. - grad_clip: If specified, clip the global norm of gradients by this amount. - kl_target: Target value for KL divergence. - inner_adaptation_steps: Number of Inner adaptation steps for the MAML - algorithm. - maml_optimizer_steps: Number of MAML steps per meta-update iteration - (PPO steps). - inner_lr: Inner Adaptation Step size. - use_meta_env: Use Meta Env Template. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if use_gae is not NotProvided: - self.use_gae = use_gae - if lambda_ is not NotProvided: - self.lambda_ = lambda_ - if kl_coeff is not NotProvided: - self.kl_coeff = kl_coeff - if vf_loss_coeff is not NotProvided: - self.vf_loss_coeff = vf_loss_coeff - if entropy_coeff is not NotProvided: - self.entropy_coeff = entropy_coeff - if clip_param is not NotProvided: - self.clip_param = clip_param - if vf_clip_param is not NotProvided: - self.vf_clip_param = vf_clip_param - if grad_clip is not NotProvided: - self.grad_clip = grad_clip - if kl_target is not NotProvided: - self.kl_target = kl_target - if inner_adaptation_steps is not NotProvided: - self.inner_adaptation_steps = inner_adaptation_steps - if maml_optimizer_steps is not NotProvided: - self.maml_optimizer_steps = maml_optimizer_steps - if inner_lr is not NotProvided: - self.inner_lr = inner_lr - if use_meta_env is not NotProvided: - self.use_meta_env = use_meta_env - - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call super's validation method. 
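# Illustrative-only sketch of what the three MAML-specific knobs documented in
# `training()` above control: `inner_adaptation_steps` and `inner_lr` drive
# per-task SGD adaptation on the workers, `maml_optimizer_steps` is the number
# of meta-update passes. This toy loop meta-optimizes a single scalar on
# quadratic "task" losses; the helper names (toy_task_loss, toy_maml_update)
# are hypothetical, and for simplicity the sketch is first-order (it ignores
# the second-order term that the real MAML policies differentiate through).
def toy_task_loss(theta, task_center):
    return (theta - task_center) ** 2


def toy_maml_update(theta, tasks, inner_adaptation_steps=1, inner_lr=0.1,
                    maml_optimizer_steps=5, meta_lr=1e-3):
    for _ in range(maml_optimizer_steps):            # meta-update (outer) steps
        meta_grad = 0.0
        for task in tasks:                           # one "task" per worker
            adapted = theta
            for _ in range(inner_adaptation_steps):  # inner adaptation on workers
                grad = 2.0 * (adapted - task)        # d/dtheta of toy_task_loss
                adapted = adapted - inner_lr * grad  # plain SGD, like the workers
            meta_grad += 2.0 * (adapted - task)      # post-adaptation loss gradient
        theta = theta - meta_lr * meta_grad / len(tasks)
    return theta


print(toy_maml_update(theta=0.0, tasks=[1.0, -2.0, 0.5]))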
- super().validate() - - if self.num_gpus > 1: - raise ValueError("`num_gpus` > 1 not yet supported for MAML!") - if self.inner_adaptation_steps <= 0: - raise ValueError("Inner Adaptation Steps must be >=1!") - if self.maml_optimizer_steps <= 0: - raise ValueError("PPO steps for meta-update needs to be >=0!") - if self.entropy_coeff < 0: - raise ValueError("`entropy_coeff` must be >=0.0!") - if self.batch_mode != "complete_episodes": - raise ValueError("`batch_mode`=truncate_episodes not supported!") - if self.num_rollout_workers <= 0: - raise ValueError("Must have at least 1 worker/task!") - if self.create_env_on_local_worker is False: - raise ValueError( - "Must have an actual Env created on the driver " - "(local) worker! Try setting `config.environment(" - "create_env_on_local_worker=True)`." - ) - - -# @mluo: TODO -def set_worker_tasks(workers, use_meta_env): - if use_meta_env: - n_tasks = len(workers.remote_workers()) - tasks = workers.local_worker().foreach_env(lambda x: x)[0].sample_tasks(n_tasks) - for i, worker in enumerate(workers.remote_workers()): - worker.foreach_env.remote(lambda env: env.set_task(tasks[i])) - - -class MetaUpdate: - def __init__(self, workers, maml_steps, metric_gen, use_meta_env): - self.workers = workers - self.maml_optimizer_steps = maml_steps - self.metric_gen = metric_gen - self.use_meta_env = use_meta_env - - def __call__(self, data_tuple): - # Metaupdate Step - samples = data_tuple[0] - adapt_metrics_dict = data_tuple[1] - - # Metric Updating - metrics = _get_shared_metrics() - metrics.counters[STEPS_SAMPLED_COUNTER] += samples.count - fetches = None - for i in range(self.maml_optimizer_steps): - fetches = self.workers.local_worker().learn_on_batch(samples) - learner_stats = get_learner_stats(fetches) - - # Sync workers with meta policy - self.workers.sync_weights() - - # Set worker tasks - set_worker_tasks(self.workers, self.use_meta_env) - - # Update KLS - def update(pi, pi_id): - assert "inner_kl" not in learner_stats, ( - "inner_kl should be nested under policy id key", - learner_stats, - ) - if pi_id in learner_stats: - assert "inner_kl" in learner_stats[pi_id], (learner_stats, pi_id) - pi.update_kls(learner_stats[pi_id]["inner_kl"]) - else: - logger.warning("No data for {}, not updating kl".format(pi_id)) - - self.workers.local_worker().foreach_policy_to_train(update) - - # Modify Reporting Metrics - metrics = _get_shared_metrics() - metrics.info[LEARNER_INFO] = fetches - metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = samples.count - metrics.counters[STEPS_TRAINED_COUNTER] += samples.count - - res = self.metric_gen.__call__(None) - res.update(adapt_metrics_dict) - - return res - - -def post_process_metrics(adapt_iter, workers, metrics): - # Obtain Current Dataset Metrics and filter out - name = "_adapt_" + str(adapt_iter) if adapt_iter > 0 else "" - - # Only workers are collecting data - res = collect_metrics(workers=workers) - - metrics["episode_reward_max" + str(name)] = res["episode_reward_max"] - metrics["episode_reward_mean" + str(name)] = res["episode_reward_mean"] - metrics["episode_reward_min" + str(name)] = res["episode_reward_min"] - - return metrics - - -def inner_adaptation(workers, samples): - # Each worker performs one gradient descent - for i, e in enumerate(workers.remote_workers()): - e.learn_on_batch.remote(samples[i]) - - -class MAML(Algorithm): - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return MAMLConfig() - - @classmethod - @override(Algorithm) - def 
get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - from ray.rllib.algorithms.maml.maml_torch_policy import MAMLTorchPolicy - - return MAMLTorchPolicy - elif config["framework"] == "tf": - from ray.rllib.algorithms.maml.maml_tf_policy import MAMLTF1Policy - - return MAMLTF1Policy - else: - from ray.rllib.algorithms.maml.maml_tf_policy import MAMLTF2Policy - - return MAMLTF2Policy - - @staticmethod - @override(Algorithm) - def execution_plan( - workers: WorkerSet, config: AlgorithmConfig, **kwargs - ) -> LocalIterator[dict]: - assert ( - len(kwargs) == 0 - ), "MAML execution_plan does NOT take any additional parameters" - - # Sync workers with meta policy - workers.sync_weights() - - # Samples and sets worker tasks - use_meta_env = config.use_meta_env - set_worker_tasks(workers, use_meta_env) - - # Metric Collector - metric_collect = CollectMetrics( - workers, - min_history=config.metrics_num_episodes_for_smoothing, - timeout_seconds=config.metrics_episode_collection_timeout_s, - ) - - # Iterator for Inner Adaptation Data gathering (from pre->post - # adaptation) - inner_steps = config.inner_adaptation_steps - - def inner_adaptation_steps(itr): - buf = [] - split = [] - metrics = {} - for samples in itr: - # Processing Samples (Standardize Advantages) - split_lst = [] - for sample in samples: - sample = convert_ma_batch_to_sample_batch(sample) - sample["advantages"] = standardized(sample["advantages"]) - split_lst.append(sample.count) - buf.append(sample) - - split.append(split_lst) - - adapt_iter = len(split) - 1 - metrics = post_process_metrics(adapt_iter, workers, metrics) - if len(split) > inner_steps: - out = concat_samples(buf) - out["split"] = np.array(split) - buf = [] - split = [] - - # Reporting Adaptation Rew Diff - ep_rew_pre = metrics["episode_reward_mean"] - ep_rew_post = metrics[ - "episode_reward_mean_adapt_" + str(inner_steps) - ] - metrics["adaptation_delta"] = ep_rew_post - ep_rew_pre - yield out, metrics - metrics = {} - else: - inner_adaptation(workers, samples) - - rollouts = from_actors(workers.remote_workers()) - rollouts = rollouts.batch_across_shards() - rollouts = rollouts.transform(inner_adaptation_steps) - - # Metaupdate Step - train_op = rollouts.for_each( - MetaUpdate( - workers, config.maml_optimizer_steps, metric_collect, use_meta_env - ) - ) - return train_op diff --git a/rllib_contrib/maml/src/rllib_maml/maml/maml_tf_policy.py b/rllib_contrib/maml/src/rllib_maml/maml/maml_tf_policy.py deleted file mode 100644 index d81bf8d834ecc..0000000000000 --- a/rllib_contrib/maml/src/rllib_maml/maml/maml_tf_policy.py +++ /dev/null @@ -1,520 +0,0 @@ -import logging -from typing import Dict, List, Type, Union - -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.evaluation.postprocessing import ( - Postprocessing, - compute_gae_for_sample_batch, -) -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_action_dist import TFActionDistribution -from ray.rllib.models.utils import get_activation_fn -from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import ( - LocalOptimizer, - ModelGradients, - ValueNetworkMixin, - compute_gradients, -) -from ray.rllib.utils import try_import_tf -from ray.rllib.utils.annotations import override -from ray.rllib.utils.typing import TensorType - -tf1, 
tf, tfv = try_import_tf() - -logger = logging.getLogger(__name__) - - -def PPOLoss( - dist_class, - actions, - curr_logits, - behaviour_logits, - advantages, - value_fn, - value_targets, - vf_preds, - cur_kl_coeff, - entropy_coeff, - clip_param, - vf_clip_param, - vf_loss_coeff, - clip_loss=False, -): - def surrogate_loss( - actions, curr_dist, prev_dist, advantages, clip_param, clip_loss - ): - pi_new_logp = curr_dist.logp(actions) - pi_old_logp = prev_dist.logp(actions) - - logp_ratio = tf.math.exp(pi_new_logp - pi_old_logp) - if clip_loss: - return tf.minimum( - advantages * logp_ratio, - advantages - * tf.clip_by_value(logp_ratio, 1 - clip_param, 1 + clip_param), - ) - return advantages * logp_ratio - - def kl_loss(curr_dist, prev_dist): - return prev_dist.kl(curr_dist) - - def entropy_loss(dist): - return dist.entropy() - - def vf_loss(value_fn, value_targets, vf_preds, vf_clip_param=0.1): - # GAE Value Function Loss - vf_loss1 = tf.math.square(value_fn - value_targets) - vf_clipped = vf_preds + tf.clip_by_value( - value_fn - vf_preds, -vf_clip_param, vf_clip_param - ) - vf_loss2 = tf.math.square(vf_clipped - value_targets) - vf_loss = tf.maximum(vf_loss1, vf_loss2) - return vf_loss - - pi_new_dist = dist_class(curr_logits, None) - pi_old_dist = dist_class(behaviour_logits, None) - - surr_loss = tf.reduce_mean( - surrogate_loss( - actions, pi_new_dist, pi_old_dist, advantages, clip_param, clip_loss - ) - ) - kl_loss = tf.reduce_mean(kl_loss(pi_new_dist, pi_old_dist)) - vf_loss = tf.reduce_mean(vf_loss(value_fn, value_targets, vf_preds, vf_clip_param)) - entropy_loss = tf.reduce_mean(entropy_loss(pi_new_dist)) - - total_loss = -surr_loss + cur_kl_coeff * kl_loss - total_loss += vf_loss_coeff * vf_loss - entropy_coeff * entropy_loss - return total_loss, surr_loss, kl_loss, vf_loss, entropy_loss - - -# This is the computation graph for workers (inner adaptation steps) -class WorkerLoss(object): - def __init__( - self, - dist_class, - actions, - curr_logits, - behaviour_logits, - advantages, - value_fn, - value_targets, - vf_preds, - cur_kl_coeff, - entropy_coeff, - clip_param, - vf_clip_param, - vf_loss_coeff, - clip_loss=False, - ): - self.loss, surr_loss, kl_loss, vf_loss, ent_loss = PPOLoss( - dist_class=dist_class, - actions=actions, - curr_logits=curr_logits, - behaviour_logits=behaviour_logits, - advantages=advantages, - value_fn=value_fn, - value_targets=value_targets, - vf_preds=vf_preds, - cur_kl_coeff=cur_kl_coeff, - entropy_coeff=entropy_coeff, - clip_param=clip_param, - vf_clip_param=vf_clip_param, - vf_loss_coeff=vf_loss_coeff, - clip_loss=clip_loss, - ) - self.loss = tf1.Print(self.loss, ["Worker Adapt Loss", self.loss]) - - -# This is the Meta-Update computation graph for main (meta-update step) -class MAMLLoss(object): - def __init__( - self, - model, - config, - dist_class, - value_targets, - advantages, - actions, - behaviour_logits, - vf_preds, - cur_kl_coeff, - policy_vars, - obs, - num_tasks, - split, - inner_adaptation_steps=1, - entropy_coeff=0, - clip_param=0.3, - vf_clip_param=0.1, - vf_loss_coeff=1.0, - use_gae=True, - ): - self.config = config - self.num_tasks = num_tasks - self.inner_adaptation_steps = inner_adaptation_steps - self.clip_param = clip_param - self.dist_class = dist_class - self.cur_kl_coeff = cur_kl_coeff - - # Split episode tensors into [inner_adaptation_steps+1, num_tasks, -1] - self.obs = self.split_placeholders(obs, split) - self.actions = self.split_placeholders(actions, split) - self.behaviour_logits = 
self.split_placeholders(behaviour_logits, split) - self.advantages = self.split_placeholders(advantages, split) - self.value_targets = self.split_placeholders(value_targets, split) - self.vf_preds = self.split_placeholders(vf_preds, split) - - # Construct name to tensor dictionary for easier indexing - self.policy_vars = {} - for var in policy_vars: - self.policy_vars[var.name] = var - - # Calculate pi_new for PPO - pi_new_logits, current_policy_vars, value_fns = [], [], [] - for i in range(self.num_tasks): - pi_new, value_fn = self.feed_forward( - self.obs[0][i], self.policy_vars, policy_config=config["model"] - ) - pi_new_logits.append(pi_new) - value_fns.append(value_fn) - current_policy_vars.append(self.policy_vars) - - inner_kls = [] - inner_ppo_loss = [] - - # Recompute weights for inner-adaptation (same weights as workers) - for step in range(self.inner_adaptation_steps): - kls = [] - for i in range(self.num_tasks): - # PPO Loss Function (only Surrogate) - ppo_loss, _, kl_loss, _, _ = PPOLoss( - dist_class=dist_class, - actions=self.actions[step][i], - curr_logits=pi_new_logits[i], - behaviour_logits=self.behaviour_logits[step][i], - advantages=self.advantages[step][i], - value_fn=value_fns[i], - value_targets=self.value_targets[step][i], - vf_preds=self.vf_preds[step][i], - cur_kl_coeff=0.0, - entropy_coeff=entropy_coeff, - clip_param=clip_param, - vf_clip_param=vf_clip_param, - vf_loss_coeff=vf_loss_coeff, - clip_loss=False, - ) - adapted_policy_vars = self.compute_updated_variables( - ppo_loss, current_policy_vars[i] - ) - pi_new_logits[i], value_fns[i] = self.feed_forward( - self.obs[step + 1][i], - adapted_policy_vars, - policy_config=config["model"], - ) - current_policy_vars[i] = adapted_policy_vars - kls.append(kl_loss) - inner_ppo_loss.append(ppo_loss) - - self.kls = kls - inner_kls.append(kls) - - mean_inner_kl = tf.stack( - [tf.reduce_mean(tf.stack(inner_kl)) for inner_kl in inner_kls] - ) - self.mean_inner_kl = mean_inner_kl - - ppo_obj = [] - for i in range(self.num_tasks): - ppo_loss, surr_loss, kl_loss, val_loss, entropy_loss = PPOLoss( - dist_class=dist_class, - actions=self.actions[self.inner_adaptation_steps][i], - curr_logits=pi_new_logits[i], - behaviour_logits=self.behaviour_logits[self.inner_adaptation_steps][i], - advantages=self.advantages[self.inner_adaptation_steps][i], - value_fn=value_fns[i], - value_targets=self.value_targets[self.inner_adaptation_steps][i], - vf_preds=self.vf_preds[self.inner_adaptation_steps][i], - cur_kl_coeff=0.0, - entropy_coeff=entropy_coeff, - clip_param=clip_param, - vf_clip_param=vf_clip_param, - vf_loss_coeff=vf_loss_coeff, - clip_loss=True, - ) - ppo_obj.append(ppo_loss) - self.mean_policy_loss = surr_loss - self.mean_kl = kl_loss - self.mean_vf_loss = val_loss - self.mean_entropy = entropy_loss - self.inner_kl_loss = tf.reduce_mean( - tf.multiply(self.cur_kl_coeff, mean_inner_kl) - ) - self.loss = tf.reduce_mean(tf.stack(ppo_obj, axis=0)) + self.inner_kl_loss - self.loss = tf1.Print( - self.loss, ["Meta-Loss", self.loss, "Inner KL", self.mean_inner_kl] - ) - - def feed_forward(self, obs, policy_vars, policy_config): - # Hacky for now, reconstruct FC network with adapted weights - # @mluo: TODO for any network - def fc_network( - inp, network_vars, hidden_nonlinearity, output_nonlinearity, policy_config - ): - bias_added = False - x = inp - for name, param in network_vars.items(): - if "kernel" in name: - x = tf.matmul(x, param) - elif "bias" in name: - x = tf.add(x, param) - bias_added = True - else: - raise NameError - - if 
bias_added: - if "out" not in name: - x = hidden_nonlinearity(x) - elif "out" in name: - x = output_nonlinearity(x) - else: - raise NameError - bias_added = False - return x - - policyn_vars = {} - valuen_vars = {} - log_std = None - for name, param in policy_vars.items(): - if "value" in name: - valuen_vars[name] = param - elif "log_std" in name: - log_std = param - else: - policyn_vars[name] = param - - output_nonlinearity = tf.identity - hidden_nonlinearity = get_activation_fn(policy_config["fcnet_activation"]) - - pi_new_logits = fc_network( - obs, policyn_vars, hidden_nonlinearity, output_nonlinearity, policy_config - ) - if log_std is not None: - pi_new_logits = tf.concat([pi_new_logits, 0.0 * pi_new_logits + log_std], 1) - value_fn = fc_network( - obs, valuen_vars, hidden_nonlinearity, output_nonlinearity, policy_config - ) - - return pi_new_logits, tf.reshape(value_fn, [-1]) - - def compute_updated_variables(self, loss, network_vars): - grad = tf.gradients(loss, list(network_vars.values())) - adapted_vars = {} - for i, tup in enumerate(network_vars.items()): - name, var = tup - if grad[i] is None: - adapted_vars[name] = var - else: - adapted_vars[name] = var - self.config["inner_lr"] * grad[i] - return adapted_vars - - def split_placeholders(self, placeholder, split): - inner_placeholder_list = tf.split( - placeholder, tf.math.reduce_sum(split, axis=1), axis=0 - ) - placeholder_list = [] - for index, split_placeholder in enumerate(inner_placeholder_list): - placeholder_list.append(tf.split(split_placeholder, split[index], axis=0)) - return placeholder_list - - -class KLCoeffMixin: - def __init__(self, config): - self.kl_coeff_val = [config["kl_coeff"]] * config["inner_adaptation_steps"] - self.kl_target = self.config["kl_target"] - self.kl_coeff = tf1.get_variable( - initializer=tf.keras.initializers.Constant(self.kl_coeff_val), - name="kl_coeff", - shape=(config["inner_adaptation_steps"]), - trainable=False, - dtype=tf.float32, - ) - - def update_kls(self, sampled_kls): - for i, kl in enumerate(sampled_kls): - if kl < self.kl_target / 1.5: - self.kl_coeff_val[i] *= 0.5 - elif kl > 1.5 * self.kl_target: - self.kl_coeff_val[i] *= 2.0 - print(self.kl_coeff_val) - self.kl_coeff.load(self.kl_coeff_val, session=self.get_session()) - return self.kl_coeff_val - - -# We need this builder function because we want to share the same -# custom logics between TF1 dynamic and TF2 eager policies. -def get_maml_tf_policy(name: str, base: type) -> type: - """Construct a MAMLTFPolicy inheriting either dynamic or eager base policies. - - Args: - base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. - - Returns: - A TF Policy to be used with MAML. - """ - - class MAMLTFPolicy(KLCoeffMixin, ValueNetworkMixin, base): - def __init__( - self, - observation_space, - action_space, - config, - existing_model=None, - existing_inputs=None, - ): - # First thing first, enable eager execution if necessary. - base.enable_eager_execution_if_necessary() - - validate_config(config) - - # Initialize base class. - base.__init__( - self, - observation_space, - action_space, - config, - existing_inputs=existing_inputs, - existing_model=existing_model, - ) - - KLCoeffMixin.__init__(self, config) - ValueNetworkMixin.__init__(self, config) - - # Create the `split` placeholder before initialize loss. 
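# NumPy sketch of the clipped PPO surrogate objective that the PPOLoss helper
# above builds in TF (and that maml_torch_policy.py mirrors in torch):
#   loss = -E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)]
#          + kl_coeff * KL(pi_old, pi_new) + vf_coeff * L_vf - ent_coeff * H(pi_new)
# where r = exp(logp_new - logp_old). Purely illustrative; the function name
# `clipped_surrogate` is hypothetical and not used by the deleted modules.
import numpy as np


def clipped_surrogate(logp_new, logp_old, advantages, clip_param=0.3):
    ratio = np.exp(logp_new - logp_old)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # Objective to maximize (the total loss above negates it).
    return np.mean(np.minimum(unclipped, clipped))


adv = np.array([1.0, -0.5, 2.0])
print(clipped_surrogate(np.log([0.4, 0.3, 0.3]), np.log([0.25, 0.5, 0.25]), adv))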
- if self.framework == "tf": - self._loss_input_dict["split"] = tf1.placeholder( - tf.int32, - name="Meta-Update-Splitting", - shape=( - self.config["inner_adaptation_steps"] + 1, - self.config["num_workers"], - ), - ) - - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. - self.maybe_initialize_optimizer_and_loss() - - @override(base) - def loss( - self, - model: Union[ModelV2, "tf.keras.Model"], - dist_class: Type[TFActionDistribution], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - logits, state = model(train_batch) - self.cur_lr = self.config["lr"] - - if self.config["worker_index"]: - self.loss_obj = WorkerLoss( - dist_class=dist_class, - actions=train_batch[SampleBatch.ACTIONS], - curr_logits=logits, - behaviour_logits=train_batch[SampleBatch.ACTION_DIST_INPUTS], - advantages=train_batch[Postprocessing.ADVANTAGES], - value_fn=model.value_function(), - value_targets=train_batch[Postprocessing.VALUE_TARGETS], - vf_preds=train_batch[SampleBatch.VF_PREDS], - cur_kl_coeff=0.0, - entropy_coeff=self.config["entropy_coeff"], - clip_param=self.config["clip_param"], - vf_clip_param=self.config["vf_clip_param"], - vf_loss_coeff=self.config["vf_loss_coeff"], - clip_loss=False, - ) - else: - self.var_list = tf1.get_collection( - tf1.GraphKeys.TRAINABLE_VARIABLES, tf1.get_variable_scope().name - ) - self.loss_obj = MAMLLoss( - model=model, - dist_class=dist_class, - value_targets=train_batch[Postprocessing.VALUE_TARGETS], - advantages=train_batch[Postprocessing.ADVANTAGES], - actions=train_batch[SampleBatch.ACTIONS], - behaviour_logits=train_batch[SampleBatch.ACTION_DIST_INPUTS], - vf_preds=train_batch[SampleBatch.VF_PREDS], - cur_kl_coeff=self.kl_coeff, - policy_vars=self.var_list, - obs=train_batch[SampleBatch.CUR_OBS], - num_tasks=self.config["num_workers"], - split=train_batch["split"], - config=self.config, - inner_adaptation_steps=self.config["inner_adaptation_steps"], - entropy_coeff=self.config["entropy_coeff"], - clip_param=self.config["clip_param"], - vf_clip_param=self.config["vf_clip_param"], - vf_loss_coeff=self.config["vf_loss_coeff"], - use_gae=self.config["use_gae"], - ) - - return self.loss_obj.loss - - @override(base) - def optimizer( - self, - ) -> Union[ - "tf.keras.optimizers.Optimizer", List["tf.keras.optimizers.Optimizer"] - ]: - """ - Workers use simple SGD for inner adaptation - Meta-Policy uses Adam optimizer for meta-update - """ - if not self.config["worker_index"]: - return tf1.train.AdamOptimizer(learning_rate=self.config["lr"]) - return tf1.train.GradientDescentOptimizer( - learning_rate=self.config["inner_lr"] - ) - - @override(base) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - if self.config["worker_index"]: - return {"worker_loss": self.loss_obj.loss} - else: - return { - "cur_kl_coeff": tf.cast(self.kl_coeff, tf.float64), - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "total_loss": self.loss_obj.loss, - "policy_loss": self.loss_obj.mean_policy_loss, - "vf_loss": self.loss_obj.mean_vf_loss, - "kl": self.loss_obj.mean_kl, - "inner_kl": self.loss_obj.mean_inner_kl, - "entropy": self.loss_obj.mean_entropy, - } - - @override(base) - def postprocess_trajectory( - self, sample_batch, other_agent_batches=None, episode=None - ): - sample_batch = super().postprocess_trajectory(sample_batch) - return compute_gae_for_sample_batch( - self, sample_batch, other_agent_batches, episode - ) - - @override(base) - def compute_gradients_fn( - self, 
optimizer: LocalOptimizer, loss: TensorType - ) -> ModelGradients: - return compute_gradients(self, optimizer, loss) - - MAMLTFPolicy.__name__ = name - MAMLTFPolicy.__qualname__ = name - - return MAMLTFPolicy - - -MAMLTF1Policy = get_maml_tf_policy("MAMLTF1Policy", DynamicTFPolicyV2) -MAMLTF2Policy = get_maml_tf_policy("MAMLTF2Policy", EagerTFPolicyV2) diff --git a/rllib_contrib/maml/src/rllib_maml/maml/maml_torch_policy.py b/rllib_contrib/maml/src/rllib_maml/maml/maml_torch_policy.py deleted file mode 100644 index 4a16f5eb950a1..0000000000000 --- a/rllib_contrib/maml/src/rllib_maml/maml/maml_torch_policy.py +++ /dev/null @@ -1,449 +0,0 @@ -import logging -from typing import Dict, List, Type, Union - -import ray -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.evaluation.postprocessing import ( - Postprocessing, - compute_gae_for_sample_batch, -) -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import ValueNetworkMixin -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.torch_utils import apply_grad_clipping -from ray.rllib.utils.typing import TensorType - -torch, nn = try_import_torch() -logger = logging.getLogger(__name__) - -try: - import higher -except (ImportError, ModuleNotFoundError): - raise ImportError( - ( - "The MAML and MB-MPO algorithms require the `higher` module to be " - "installed! However, there was no installation found. You can install it " - "via `pip install higher`." 
- ) - ) - - -def PPOLoss( - dist_class, - actions, - curr_logits, - behaviour_logits, - advantages, - value_fn, - value_targets, - vf_preds, - cur_kl_coeff, - entropy_coeff, - clip_param, - vf_clip_param, - vf_loss_coeff, - clip_loss=False, -): - def surrogate_loss( - actions, curr_dist, prev_dist, advantages, clip_param, clip_loss - ): - pi_new_logp = curr_dist.logp(actions) - pi_old_logp = prev_dist.logp(actions) - - logp_ratio = torch.exp(pi_new_logp - pi_old_logp) - if clip_loss: - return torch.min( - advantages * logp_ratio, - advantages * torch.clamp(logp_ratio, 1 - clip_param, 1 + clip_param), - ) - return advantages * logp_ratio - - def kl_loss(curr_dist, prev_dist): - return prev_dist.kl(curr_dist) - - def entropy_loss(dist): - return dist.entropy() - - def vf_loss(value_fn, value_targets, vf_preds, vf_clip_param=0.1): - # GAE Value Function Loss - vf_loss1 = torch.pow(value_fn - value_targets, 2.0) - vf_clipped = vf_preds + torch.clamp( - value_fn - vf_preds, -vf_clip_param, vf_clip_param - ) - vf_loss2 = torch.pow(vf_clipped - value_targets, 2.0) - vf_loss = torch.max(vf_loss1, vf_loss2) - return vf_loss - - pi_new_dist = dist_class(curr_logits, None) - pi_old_dist = dist_class(behaviour_logits, None) - - surr_loss = torch.mean( - surrogate_loss( - actions, pi_new_dist, pi_old_dist, advantages, clip_param, clip_loss - ) - ) - kl_loss = torch.mean(kl_loss(pi_new_dist, pi_old_dist)) - vf_loss = torch.mean(vf_loss(value_fn, value_targets, vf_preds, vf_clip_param)) - entropy_loss = torch.mean(entropy_loss(pi_new_dist)) - - total_loss = -surr_loss + cur_kl_coeff * kl_loss - total_loss += vf_loss_coeff * vf_loss - total_loss -= entropy_coeff * entropy_loss - return total_loss, surr_loss, kl_loss, vf_loss, entropy_loss - - -# This is the computation graph for workers (inner adaptation steps) -class WorkerLoss(object): - def __init__( - self, - model, - dist_class, - actions, - curr_logits, - behaviour_logits, - advantages, - value_fn, - value_targets, - vf_preds, - cur_kl_coeff, - entropy_coeff, - clip_param, - vf_clip_param, - vf_loss_coeff, - clip_loss=False, - ): - self.loss, surr_loss, kl_loss, vf_loss, ent_loss = PPOLoss( - dist_class=dist_class, - actions=actions, - curr_logits=curr_logits, - behaviour_logits=behaviour_logits, - advantages=advantages, - value_fn=value_fn, - value_targets=value_targets, - vf_preds=vf_preds, - cur_kl_coeff=cur_kl_coeff, - entropy_coeff=entropy_coeff, - clip_param=clip_param, - vf_clip_param=vf_clip_param, - vf_loss_coeff=vf_loss_coeff, - clip_loss=clip_loss, - ) - - -# This is the Meta-Update computation graph for main (meta-update step) -class MAMLLoss(object): - def __init__( - self, - model, - config, - dist_class, - value_targets, - advantages, - actions, - behaviour_logits, - vf_preds, - cur_kl_coeff, - policy_vars, - obs, - num_tasks, - split, - meta_opt, - inner_adaptation_steps=1, - entropy_coeff=0, - clip_param=0.3, - vf_clip_param=0.1, - vf_loss_coeff=1.0, - use_gae=True, - ): - self.config = config - self.num_tasks = num_tasks - self.inner_adaptation_steps = inner_adaptation_steps - self.clip_param = clip_param - self.dist_class = dist_class - self.cur_kl_coeff = cur_kl_coeff - self.model = model - self.vf_clip_param = vf_clip_param - self.vf_loss_coeff = vf_loss_coeff - self.entropy_coeff = entropy_coeff - - # Split episode tensors into [inner_adaptation_steps+1, num_tasks, -1] - self.obs = self.split_placeholders(obs, split) - self.actions = self.split_placeholders(actions, split) - self.behaviour_logits = 
self.split_placeholders(behaviour_logits, split) - self.advantages = self.split_placeholders(advantages, split) - self.value_targets = self.split_placeholders(value_targets, split) - self.vf_preds = self.split_placeholders(vf_preds, split) - - inner_opt = torch.optim.SGD(model.parameters(), lr=config["inner_lr"]) - surr_losses = [] - val_losses = [] - kl_losses = [] - entropy_losses = [] - meta_losses = [] - kls = [] - - meta_opt.zero_grad() - for i in range(self.num_tasks): - with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=False) as ( - fnet, - diffopt, - ): - inner_kls = [] - for step in range(self.inner_adaptation_steps): - ppo_loss, _, inner_kl_loss, _, _ = self.compute_losses( - fnet, step, i - ) - diffopt.step(ppo_loss) - inner_kls.append(inner_kl_loss) - kls.append(inner_kl_loss.detach()) - - # Meta Update - ppo_loss, s_loss, kl_loss, v_loss, ent = self.compute_losses( - fnet, self.inner_adaptation_steps - 1, i, clip_loss=True - ) - - inner_loss = torch.mean( - torch.stack( - [ - a * b - for a, b in zip( - self.cur_kl_coeff[ - i - * self.inner_adaptation_steps : (i + 1) - * self.inner_adaptation_steps - ], - inner_kls, - ) - ] - ) - ) - meta_loss = (ppo_loss + inner_loss) / self.num_tasks - meta_loss.backward() - - surr_losses.append(s_loss.detach()) - kl_losses.append(kl_loss.detach()) - val_losses.append(v_loss.detach()) - entropy_losses.append(ent.detach()) - meta_losses.append(meta_loss.detach()) - - meta_opt.step() - - # Stats Logging - self.mean_policy_loss = torch.mean(torch.stack(surr_losses)) - self.mean_kl_loss = torch.mean(torch.stack(kl_losses)) - self.mean_vf_loss = torch.mean(torch.stack(val_losses)) - self.mean_entropy = torch.mean(torch.stack(entropy_losses)) - self.mean_inner_kl = kls - self.loss = torch.sum(torch.stack(meta_losses)) - # Hacky, needed to bypass RLlib backend - self.loss.requires_grad = True - - def compute_losses(self, model, inner_adapt_iter, task_iter, clip_loss=False): - obs = self.obs[inner_adapt_iter][task_iter] - obs_dict = {"obs": obs, "obs_flat": obs} - curr_logits, _ = model.forward(obs_dict, None, None) - value_fns = model.value_function() - ppo_loss, surr_loss, kl_loss, val_loss, ent_loss = PPOLoss( - dist_class=self.dist_class, - actions=self.actions[inner_adapt_iter][task_iter], - curr_logits=curr_logits, - behaviour_logits=self.behaviour_logits[inner_adapt_iter][task_iter], - advantages=self.advantages[inner_adapt_iter][task_iter], - value_fn=value_fns, - value_targets=self.value_targets[inner_adapt_iter][task_iter], - vf_preds=self.vf_preds[inner_adapt_iter][task_iter], - cur_kl_coeff=0.0, - entropy_coeff=self.entropy_coeff, - clip_param=self.clip_param, - vf_clip_param=self.vf_clip_param, - vf_loss_coeff=self.vf_loss_coeff, - clip_loss=clip_loss, - ) - return ppo_loss, surr_loss, kl_loss, val_loss, ent_loss - - def split_placeholders(self, placeholder, split): - inner_placeholder_list = torch.split( - placeholder, torch.sum(split, dim=1).tolist(), dim=0 - ) - placeholder_list = [] - for index, split_placeholder in enumerate(inner_placeholder_list): - placeholder_list.append( - torch.split(split_placeholder, split[index].tolist(), dim=0) - ) - return placeholder_list - - -class KLCoeffMixin: - def __init__(self, config): - self.kl_coeff_val = ( - [config["kl_coeff"]] - * config["inner_adaptation_steps"] - * config["num_workers"] - ) - self.kl_target = self.config["kl_target"] - - def update_kls(self, sampled_kls): - for i, kl in enumerate(sampled_kls): - if kl < self.kl_target / 1.5: - self.kl_coeff_val[i] *= 0.5 - 
elif kl > 1.5 * self.kl_target: - self.kl_coeff_val[i] *= 2.0 - return self.kl_coeff_val - - -class MAMLTorchPolicy(ValueNetworkMixin, KLCoeffMixin, TorchPolicyV2): - """PyTorch policy class used with MAML.""" - - def __init__(self, observation_space, action_space, config): - config = dict(ray.rllib.algorithms.maml.maml.MAMLConfig(), **config) - validate_config(config) - - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config["model"]["max_seq_len"], - ) - - KLCoeffMixin.__init__(self, config) - ValueNetworkMixin.__init__(self, config) - - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() - - @override(TorchPolicyV2) - def loss( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - """Constructs the loss function. - - Args: - model: The Model to calculate the loss for. - dist_class: The action distr. class. - train_batch: The training data. - - Returns: - The PPO loss tensor given the input batch. - """ - logits, state = model(train_batch) - self.cur_lr = self.config["lr"] - - if self.config["worker_index"]: - self.loss_obj = WorkerLoss( - model=model, - dist_class=dist_class, - actions=train_batch[SampleBatch.ACTIONS], - curr_logits=logits, - behaviour_logits=train_batch[SampleBatch.ACTION_DIST_INPUTS], - advantages=train_batch[Postprocessing.ADVANTAGES], - value_fn=model.value_function(), - value_targets=train_batch[Postprocessing.VALUE_TARGETS], - vf_preds=train_batch[SampleBatch.VF_PREDS], - cur_kl_coeff=0.0, - entropy_coeff=self.config["entropy_coeff"], - clip_param=self.config["clip_param"], - vf_clip_param=self.config["vf_clip_param"], - vf_loss_coeff=self.config["vf_loss_coeff"], - clip_loss=False, - ) - else: - self.var_list = model.named_parameters() - - # `split` may not exist yet (during test-loss call), use a dummy value. - # Cannot use get here due to train_batch being a TrackingDict. 
- if "split" in train_batch: - split = train_batch["split"] - else: - split_shape = ( - self.config["inner_adaptation_steps"], - self.config["num_workers"], - ) - split_const = int( - train_batch["obs"].shape[0] // (split_shape[0] * split_shape[1]) - ) - split = torch.ones(split_shape, dtype=int) * split_const - self.loss_obj = MAMLLoss( - model=model, - dist_class=dist_class, - value_targets=train_batch[Postprocessing.VALUE_TARGETS], - advantages=train_batch[Postprocessing.ADVANTAGES], - actions=train_batch[SampleBatch.ACTIONS], - behaviour_logits=train_batch[SampleBatch.ACTION_DIST_INPUTS], - vf_preds=train_batch[SampleBatch.VF_PREDS], - cur_kl_coeff=self.kl_coeff_val, - policy_vars=self.var_list, - obs=train_batch[SampleBatch.CUR_OBS], - num_tasks=self.config["num_workers"], - split=split, - config=self.config, - inner_adaptation_steps=self.config["inner_adaptation_steps"], - entropy_coeff=self.config["entropy_coeff"], - clip_param=self.config["clip_param"], - vf_clip_param=self.config["vf_clip_param"], - vf_loss_coeff=self.config["vf_loss_coeff"], - use_gae=self.config["use_gae"], - meta_opt=self.meta_opt, - ) - - return self.loss_obj.loss - - @override(TorchPolicyV2) - def optimizer( - self, - ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]: - """ - Workers use simple SGD for inner adaptation - Meta-Policy uses Adam optimizer for meta-update - """ - if not self.config["worker_index"]: - self.meta_opt = torch.optim.Adam( - self.model.parameters(), lr=self.config["lr"] - ) - return self.meta_opt - return torch.optim.SGD(self.model.parameters(), lr=self.config["inner_lr"]) - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - if self.config["worker_index"]: - return convert_to_numpy({"worker_loss": self.loss_obj.loss}) - else: - return convert_to_numpy( - { - "cur_kl_coeff": self.kl_coeff_val, - "cur_lr": self.cur_lr, - "total_loss": self.loss_obj.loss, - "policy_loss": self.loss_obj.mean_policy_loss, - "vf_loss": self.loss_obj.mean_vf_loss, - "kl_loss": self.loss_obj.mean_kl_loss, - "inner_kl": self.loss_obj.mean_inner_kl, - "entropy": self.loss_obj.mean_entropy, - } - ) - - @override(TorchPolicyV2) - def extra_grad_process( - self, optimizer: "torch.optim.Optimizer", loss: TensorType - ) -> Dict[str, TensorType]: - return apply_grad_clipping(self, optimizer, loss) - - @override(TorchPolicyV2) - def postprocess_trajectory( - self, sample_batch, other_agent_batches=None, episode=None - ): - # Do all post-processing always with no_grad(). - # Not using this here will introduce a memory leak - # in torch (issue #6962). - # TODO: no_grad still necessary? 
- with torch.no_grad(): - return compute_gae_for_sample_batch( - self, sample_batch, other_agent_batches, episode - ) diff --git a/rllib_contrib/maml/tests/test_maml.py b/rllib_contrib/maml/tests/test_maml.py deleted file mode 100644 index 774be4ecde413..0000000000000 --- a/rllib_contrib/maml/tests/test_maml.py +++ /dev/null @@ -1,61 +0,0 @@ -import unittest - -from gymnasium.wrappers import TimeLimit -from rllib_maml.envs.cartpole_mass import CartPoleMassEnv -from rllib_maml.envs.pendulum_mass import PendulumMassEnv -from rllib_maml.maml import MAMLConfig - -import ray -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_train_results, - framework_iterator, -) -from ray.tune.registry import register_env - - -class TestMAML(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - register_env( - "cartpole", - lambda env_cfg: TimeLimit(CartPoleMassEnv(), max_episode_steps=200), - ) - register_env( - "pendulum", - lambda env_cfg: TimeLimit(PendulumMassEnv(), max_episode_steps=200), - ) - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_maml_compilation(self): - """Test whether MAML can be built with all frameworks.""" - config = MAMLConfig().rollouts(num_rollout_workers=1) - - num_iterations = 1 - - # Test for tf framework (torch not implemented yet). - for fw in framework_iterator(config, frameworks=("tf", "torch")): - for env in ["cartpole", "pendulum"]: - if fw == "tf" and env.startswith("cartpole"): - continue - print("env={}".format(env)) - config.environment(env) - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo, include_prev_action_reward=True) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/maml/tuned_examples/__init__.py b/rllib_contrib/maml/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/maml/tuned_examples/ant-rand-goal-maml.yaml b/rllib_contrib/maml/tuned_examples/ant-rand-goal-maml.yaml deleted file mode 100644 index 02bf0c5bc5588..0000000000000 --- a/rllib_contrib/maml/tuned_examples/ant-rand-goal-maml.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -ant-rand-goal-maml: - env: ray.rllib.examples.env.ant_rand_goal.AntRandGoalEnv - run: MAML - stop: - training_iteration: 1000 - config: - rollout_fragment_length: 200 - num_envs_per_worker: 20 - inner_adaptation_steps: 2 - maml_optimizer_steps: 5 - gamma: 0.99 - lambda: 1.0 - lr: 0.001 - vf_loss_coeff: 0.5 - clip_param: 0.3 - kl_target: 0.01 - kl_coeff: 0.0005 - num_workers: 32 - num_gpus: 1 - inner_lr: 0.03 - explore: True - clip_actions: False - model: - fcnet_hiddens: [64, 64] - free_log_std: True diff --git a/rllib_contrib/maml/tuned_examples/cartpole-maml.yaml b/rllib_contrib/maml/tuned_examples/cartpole-maml.yaml deleted file mode 100644 index 55c670f01ef4b..0000000000000 --- a/rllib_contrib/maml/tuned_examples/cartpole-maml.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# Same configs as Pendulum -cartpole-maml: - env: CartPole-v1 - run: MAML - stop: - training_iteration: 100 - config: - # Works with both frameworks, "tf" and "torch". 
- framework: torch - rollout_fragment_length: 200 - num_envs_per_worker: 10 - inner_adaptation_steps: 1 - maml_optimizer_steps: 5 - gamma: 0.99 - lambda: 1.0 - lr: 0.001 - vf_loss_coeff: 0.5 - clip_param: 0.3 - kl_target: 0.01 - kl_coeff: 0.001 - num_workers: 20 - num_gpus: 1 - inner_lr: 0.03 - clip_actions: False - use_meta_env: False - model: - fcnet_hiddens: [64, 64] diff --git a/rllib_contrib/maml/tuned_examples/halfcheetah-rand-direc-maml.yaml b/rllib_contrib/maml/tuned_examples/halfcheetah-rand-direc-maml.yaml deleted file mode 100644 index 91790c2ccdd04..0000000000000 --- a/rllib_contrib/maml/tuned_examples/halfcheetah-rand-direc-maml.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -halfcheetah-rand-direc-maml: - env: ray.rllib.examples.env.halfcheetah_rand_direc.HalfCheetahRandDirecEnv - run: MAML - stop: - training_iteration: 1000 - config: - rollout_fragment_length: 100 - num_envs_per_worker: 20 - inner_adaptation_steps: 1 - maml_optimizer_steps: 5 - gamma: 0.99 - lambda: 1.0 - lr: 0.001 - vf_loss_coeff: 0.5 - clip_param: 0.3 - kl_target: 0.01 - kl_coeff: 0.0005 - num_workers: 31 - num_gpus: 1 - inner_lr: 0.1 - clip_actions: False - model: - fcnet_hiddens: [64, 64] - free_log_std: True diff --git a/rllib_contrib/maml/tuned_examples/pendulum-mass-maml.yaml b/rllib_contrib/maml/tuned_examples/pendulum-mass-maml.yaml deleted file mode 100644 index 67dbce562c855..0000000000000 --- a/rllib_contrib/maml/tuned_examples/pendulum-mass-maml.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -pendulum-mass-maml: - env: ray.rllib.examples.env.pendulum_mass.PendulumMassEnv - run: MAML - stop: - training_iteration: 500 - config: - rollout_fragment_length: 200 - num_envs_per_worker: 10 - inner_adaptation_steps: 1 - maml_optimizer_steps: 5 - gamma: 0.99 - lambda: 1.0 - lr: 0.001 - vf_loss_coeff: 0.5 - clip_param: 0.3 - kl_target: 0.01 - kl_coeff: 0.001 - num_workers: 20 - num_gpus: 1 - inner_lr: 0.03 - explore: True - clip_actions: False - model: - fcnet_hiddens: [64, 64] - free_log_std: True diff --git a/rllib_contrib/mbmpo/BUILD b/rllib_contrib/mbmpo/BUILD deleted file mode 100644 index 830333bae94b6..0000000000000 --- a/rllib_contrib/mbmpo/BUILD +++ /dev/null @@ -1,34 +0,0 @@ -# Examples - -py_test( - name = "example_mbmpo_cartpole_v1_model_based", - main = "mbmpo_cartpole_v1_model_based.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/mbmpo_cartpole_v1_model_based.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -# Working, but takes a long time to learn (>15min). 
-# Removed due to Higher API conflicts with Pytorch-Import tests -# MB-MPO -# py_test( -# name = "learning_tests_pendulum_mbmpo", -# main = "run_regression_tests.py", -# tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], -# size = "large", -# srcs = ["run_regression_tests.py"], -# data = ["tuned_examples/pendulum-mbmpo.yaml"], -# args = ["--dir=mbmpo/tuned_examples/"] -# ) - -# Compilation Tests - -py_test( - name = "test_mbmpo", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_mbmpo.py"] -) diff --git a/rllib_contrib/mbmpo/README.md b/rllib_contrib/mbmpo/README.md deleted file mode 100644 index 22525a89619b2..0000000000000 --- a/rllib_contrib/mbmpo/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# MBMPO (Model-Based Meta-Policy-Optimization) - -[MBMPO](https://arxiv.org/pdf/1809.05214.pdf) is a Dyna-styled model-based RL method that learns based on the predictions of an ensemble of transition-dynamics models. Similar to MAML, MBMPO metalearns an optimal policy by treating each dynamics model as a different task. Similar to the original paper, MBMPO is evaluated on MuJoCo, with the horizon set to 200 instead of the default 1000. - -Additional statistics are logged in MBMPO. Each MBMPO iteration corresponds to multiple MAML iterations, and `MAMLIter_i_DynaTrajInner_j_episode_reward_mean` measures the agent’s returns across the dynamics models at iteration i of MAML and step j of inner adaptation. - -## Installation - -``` -conda create -n rllib-mbmpo python=3.10 -conda activate rllib-mbmpo -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[MBMPO Example]() \ No newline at end of file diff --git a/rllib_contrib/mbmpo/examples/mbmpo_cartpole_v1_model_based.py b/rllib_contrib/mbmpo/examples/mbmpo_cartpole_v1_model_based.py deleted file mode 100644 index ab77042515dba..0000000000000 --- a/rllib_contrib/mbmpo/examples/mbmpo_cartpole_v1_model_based.py +++ /dev/null @@ -1,68 +0,0 @@ -import argparse - -from gymnasium.wrappers import TimeLimit -from rllib_mbmpo.env.mbmpo_env import CartPoleWrapper -from rllib_mbmpo.mbmpo import MBMPO, MBMPOConfig - -import ray -from ray import air, tune -from ray.tune.registry import register_env - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - register_env( - "cartpole-mbmpo", - lambda env_ctx: TimeLimit(CartPoleWrapper(), max_episode_steps=200), - ) - - config = ( - MBMPOConfig() - # .rollouts(num_rollout_workers=7, num_envs_per_worker=20) - .framework("torch") - .environment("cartpole-mbmpo") - .rollouts(num_rollout_workers=4) - # .training(dynamics_model={"ensemble_size": 2}) - # ) - .training( - inner_adaptation_steps=1, - maml_optimizer_steps=8, - gamma=0.99, - lambda_=1.0, - lr=0.001, - clip_param=0.5, - kl_target=0.003, - kl_coeff=0.0000000001, - inner_lr=0.001, - num_maml_steps=15, - model={"fcnet_hiddens": [32, 32], "free_log_std": True}, - ) - ) - if args.run_as_test: - stop = { - "episode_reward_mean": 190, - "training_iteration": 20, - } - else: - stop = {"training_iteration": 1} - - tuner = tune.Tuner( - MBMPO, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop=stop, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = 
tuner.fit() diff --git a/rllib_contrib/mbmpo/pyproject.toml b/rllib_contrib/mbmpo/pyproject.toml deleted file mode 100644 index 13bd725c0912d..0000000000000 --- a/rllib_contrib/mbmpo/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-mbmpo" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gym[accept-rom-license]", "gymnasium[mujoco]==0.26.3", "higher", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/mbmpo/requirements.txt b/rllib_contrib/mbmpo/requirements.txt deleted file mode 100644 index b07006a1b4ec6..0000000000000 --- a/rllib_contrib/mbmpo/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/mbmpo/src/rllib_mbmpo/env/__init__.py b/rllib_contrib/mbmpo/src/rllib_mbmpo/env/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/mbmpo/src/rllib_mbmpo/env/mbmpo_env.py b/rllib_contrib/mbmpo/src/rllib_mbmpo/env/mbmpo_env.py deleted file mode 100644 index 9ed162c3acf36..0000000000000 --- a/rllib_contrib/mbmpo/src/rllib_mbmpo/env/mbmpo_env.py +++ /dev/null @@ -1,104 +0,0 @@ -import numpy as np -from gymnasium.envs.classic_control import CartPoleEnv, PendulumEnv - -# MuJoCo may not be installed. -HalfCheetahEnv = HopperEnv = None - -try: - from gymnasium.envs.mujoco import HalfCheetahEnv, HopperEnv -except Exception: - pass - - -class CartPoleWrapper(CartPoleEnv): - """Wrapper for the CartPole-v1 environment. - - Adds an additional `reward` method for some model-based RL algos (e.g. - MB-MPO). - """ - - def reward(self, obs, action, obs_next): - # obs = batch * [pos, vel, angle, rotation_rate] - x = obs_next[:, 0] - theta = obs_next[:, 2] - - # 1.0 if we are still on, 0.0 if we are terminated due to bounds - # (angular or x-axis) being breached. - rew = 1.0 - ( - (x < -self.x_threshold) - | (x > self.x_threshold) - | (theta < -self.theta_threshold_radians) - | (theta > self.theta_threshold_radians) - ).astype(np.float32) - - return rew - - -class PendulumWrapper(PendulumEnv): - """Wrapper for the Pendulum-v1 environment. - - Adds an additional `reward` method for some model-based RL algos (e.g. - MB-MPO). - """ - - def reward(self, obs, action, obs_next): - # obs = [cos(theta), sin(theta), dtheta/dt] - # To get the angle back from obs: atan2(sin(theta), cos(theta)). - theta = np.arctan2(np.clip(obs[:, 1], -1.0, 1.0), np.clip(obs[:, 0], -1.0, 1.0)) - # Do everything in (B,) space (single theta-, action- and - # reward values). - a = np.clip(action, -self.max_torque, self.max_torque)[0] - costs = ( - self.angle_normalize(theta) ** 2 + 0.1 * obs[:, 2] ** 2 + 0.001 * (a**2) - ) - return -costs - - @staticmethod - def angle_normalize(x): - return ((x + np.pi) % (2 * np.pi)) - np.pi - - -class HalfCheetahWrapper(HalfCheetahEnv or object): - """Wrapper for the MuJoCo HalfCheetah-v2 environment. - - Adds an additional `reward` method for some model-based RL algos (e.g. - MB-MPO). 
- """ - - def reward(self, obs, action, obs_next): - if obs.ndim == 2 and action.ndim == 2: - assert obs.shape == obs_next.shape - forward_vel = obs_next[:, 8] - ctrl_cost = 0.1 * np.sum(np.square(action), axis=1) - reward = forward_vel - ctrl_cost - return np.minimum(np.maximum(-1000.0, reward), 1000.0) - else: - forward_vel = obs_next[8] - ctrl_cost = 0.1 * np.square(action).sum() - reward = forward_vel - ctrl_cost - return np.minimum(np.maximum(-1000.0, reward), 1000.0) - - -class HopperWrapper(HopperEnv or object): - """Wrapper for the MuJoCo Hopper-v2 environment. - - Adds an additional `reward` method for some model-based RL algos (e.g. - MB-MPO). - """ - - def reward(self, obs, action, obs_next): - alive_bonus = 1.0 - assert obs.ndim == 2 and action.ndim == 2 - assert obs.shape == obs_next.shape and action.shape[0] == obs.shape[0] - vel = obs_next[:, 5] - ctrl_cost = 1e-3 * np.sum(np.square(action), axis=1) - reward = vel + alive_bonus - ctrl_cost - return np.minimum(np.maximum(-1000.0, reward), 1000.0) - - -if __name__ == "__main__": - env = PendulumWrapper() - env.reset() - for _ in range(100): - env.step(env.action_space.sample()) - env.render() diff --git a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/__init__.py b/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/__init__.py deleted file mode 100644 index 9f18a9e346f50..0000000000000 --- a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from rllib_mbmpo.mbmpo.mbmpo import MBMPO, MBMPOConfig -from rllib_mbmpo.mbmpo.mbmpo_torch_policy import MBMPOTorchPolicy - -from ray.tune.registry import register_trainable - -__all__ = ["MBMPOConfig", "MBMPO", "MBMPOTorchPolicy"] - -register_trainable("rllib-contrib-mbmpo", MBMPO) diff --git a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/mbmpo.py b/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/mbmpo.py deleted file mode 100644 index a1e9f225e7eee..0000000000000 --- a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/mbmpo.py +++ /dev/null @@ -1,601 +0,0 @@ -import logging -from typing import List, Optional, Type - -import numpy as np -from rllib_mbmpo.mbmpo.model_ensemble import DynamicsEnsembleCustomModel -from rllib_mbmpo.mbmpo.utils import MBMPOExploration, calculate_gae_advantages - -import ray -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.env.env_context import EnvContext -from ray.rllib.env.wrappers.model_vector_env import model_vector_env -from ray.rllib.evaluation.metrics import ( - collect_episodes, - collect_metrics, - get_learner_stats, -) -from ray.rllib.evaluation.worker_set import WorkerSet -from ray.rllib.execution.common import ( - STEPS_SAMPLED_COUNTER, - STEPS_TRAINED_COUNTER, - STEPS_TRAINED_THIS_ITER_COUNTER, - _get_shared_metrics, -) -from ray.rllib.execution.metric_ops import CollectMetrics -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import ( - DEFAULT_POLICY_ID, - SampleBatch, - concat_samples, - convert_ma_batch_to_sample_batch, -) -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO -from ray.rllib.utils.sgd import standardized -from ray.rllib.utils.torch_utils import convert_to_torch_tensor -from ray.rllib.utils.typing import EnvType -from ray.util.iter import LocalIterator, from_actors - -logger = logging.getLogger(__name__) - - -class MBMPOConfig(AlgorithmConfig): - r"""Defines a configuration class 
from which an MBMPO Algorithm can be built. - - Example: - >>> from rllib_mbmpo.mbmpo import MBMPOConfig - >>> config = MBMPOConfig() - >>> config = config.training(lr=0.0003, train_batch_size=512) # doctest: +SKIP - >>> config = config.resources(num_gpus=4) # doctest: +SKIP - >>> config = config.rollouts(num_rollout_workers=64) # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from rllib_mbmpo.mbmpo import MBMPOConfig - >>> from ray import air - >>> from ray import tune - >>> config = MBMPOConfig() - >>> # Print out some default values. - >>> print(config.vtrace) # doctest: +SKIP - >>> # Update the config object. - >>> config = config\ # doctest: +SKIP - ... .training(lr=tune.grid_search([0.0001, 0.0003]), grad_clip=20.0) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "AlphaStar", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a MBMPOConfig instance.""" - super().__init__(algo_class=algo_class or MBMPO) - - # fmt: off - # __sphinx_doc_begin__ - - # MBMPO specific config settings: - # If true, use the Generalized Advantage Estimator (GAE) - # with a value function, see https://arxiv.org/pdf/1506.02438.pdf. - self.use_gae = True - # GAE(lambda) parameter. - self.lambda_ = 1.0 - # Initial coefficient for KL divergence. - self.kl_coeff = 0.0005 - - # Coefficient of the value function loss. - self.vf_loss_coeff = 0.5 - # Coefficient of the entropy regularizer. - self.entropy_coeff = 0.0 - # PPO clip parameter. - self.clip_param = 0.5 - # Clip param for the value function. Note that this is sensitive to the - # scale of the rewards. If your expected V is large, increase this. - self.vf_clip_param = 10.0 - # If specified, clip the global norm of gradients by this amount. - self.grad_clip = None - # Target value for KL divergence. - self.kl_target = 0.01 - # Number of Inner adaptation steps for the MAML algorithm. - self.inner_adaptation_steps = 1 - # Number of MAML steps per meta-update iteration (PPO steps). - self.maml_optimizer_steps = 8 - # Inner adaptation step size. - self.inner_lr = 1e-3 - # Dynamics ensemble hyperparameters. - self.dynamics_model = { - "custom_model": DynamicsEnsembleCustomModel, - # Number of Transition-Dynamics (TD) models in the ensemble. - "ensemble_size": 5, - # Hidden layers for each model in the TD-model ensemble. - "fcnet_hiddens": [512, 512, 512], - # Model learning rate. - "lr": 1e-3, - # Max number of training epochs per MBMPO iter. - "train_epochs": 500, - # Model batch size. - "batch_size": 500, - # Training/validation split. - "valid_split_ratio": 0.2, - # Normalize data (obs, action, and deltas). - "normalize_data": True, - } - # Workers sample from dynamics models, not from actual envs. - self.custom_vector_env = model_vector_env - # How many iterations through MAML per MBMPO iteration. - self.num_maml_steps = 10 - - # Override some of AlgorithmConfig's default values with MBMPO-specific - # values. - self.batch_mode = "complete_episodes" - self.num_rollout_workers = 2 - # Size of batches collected from each worker. 
- self.rollout_fragment_length = 200 - # Do create an actual env on the local worker (worker-idx=0). - self.create_env_on_local_worker = True - # Step size of SGD. - self.lr = 1e-3 - # Exploration for MB-MPO is based on StochasticSampling, but uses 8000 - # random timesteps up-front for worker=0. - self.exploration_config = { - "type": MBMPOExploration, - "random_timesteps": 8000, - } - - # __sphinx_doc_end__ - # fmt: on - - self.vf_share_layers = DEPRECATED_VALUE - self._disable_execution_plan_api = False - - @override(AlgorithmConfig) - def training( - self, - *, - use_gae: Optional[float] = NotProvided, - lambda_: Optional[float] = NotProvided, - kl_coeff: Optional[float] = NotProvided, - vf_loss_coeff: Optional[float] = NotProvided, - entropy_coeff: Optional[float] = NotProvided, - clip_param: Optional[float] = NotProvided, - vf_clip_param: Optional[float] = NotProvided, - grad_clip: Optional[float] = NotProvided, - kl_target: Optional[float] = NotProvided, - inner_adaptation_steps: Optional[int] = NotProvided, - maml_optimizer_steps: Optional[int] = NotProvided, - inner_lr: Optional[float] = NotProvided, - dynamics_model: Optional[dict] = NotProvided, - custom_vector_env: Optional[type] = NotProvided, - num_maml_steps: Optional[int] = NotProvided, - **kwargs, - ) -> "MBMPOConfig": - """Sets the training related configuration. - - Args: - use_gae: If true, use the Generalized Advantage Estimator (GAE) - with a value function, see https://arxiv.org/pdf/1506.02438.pdf. - lambda_: The GAE (lambda) parameter. - kl_coeff: Initial coefficient for KL divergence. - vf_loss_coeff: Coefficient of the value function loss. - entropy_coeff: Coefficient of the entropy regularizer. - clip_param: PPO clip parameter. - vf_clip_param: Clip param for the value function. Note that this is - sensitive to the scale of the rewards. If your expected V is large, - increase this. - grad_clip: If specified, clip the global norm of gradients by this amount. - kl_target: Target value for KL divergence. - inner_adaptation_steps: Number of Inner adaptation steps for the MAML - algorithm. - maml_optimizer_steps: Number of MAML steps per meta-update iteration - (PPO steps). - inner_lr: Inner adaptation step size. - dynamics_model: Dynamics ensemble hyperparameters. - custom_vector_env: Workers sample from dynamics models, not from actual - envs. - num_maml_steps: How many iterations through MAML per MBMPO iteration. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. 
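The `training()` signature documented above carries all of MB-MPO's algorithm-specific knobs. A hedged configuration sketch, assuming the `rllib_mbmpo` package is installed and registering the CartPole wrapper the same way `tests/test_mbmpo.py` below does:

```python
from gymnasium.wrappers import TimeLimit
from rllib_mbmpo.env.mbmpo_env import CartPoleWrapper
from rllib_mbmpo.mbmpo import MBMPOConfig

from ray.tune.registry import register_env

# MB-MPO needs an env exposing the batched reward() hook, so register the
# CartPole wrapper from this package under a custom name.
register_env(
    "cartpole-mbmpo",
    lambda ctx: TimeLimit(CartPoleWrapper(), max_episode_steps=200),
)

config = (
    MBMPOConfig()
    .environment("cartpole-mbmpo")
    .framework("torch")                       # validate() rejects anything else
    .rollouts(num_rollout_workers=2)
    .training(
        inner_adaptation_steps=1,             # inner MAML adaptation steps
        maml_optimizer_steps=8,               # PPO steps per meta-update
        num_maml_steps=10,                    # MAML iterations per MB-MPO iteration
        dynamics_model={"ensemble_size": 2},  # merged into the default dynamics dict
    )
)
algo = config.build()
print(algo.train())
```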
- super().training(**kwargs) - - if use_gae is not NotProvided: - self.use_gae = use_gae - if lambda_ is not NotProvided: - self.lambda_ = lambda_ - if kl_coeff is not NotProvided: - self.kl_coeff = kl_coeff - if vf_loss_coeff is not NotProvided: - self.vf_loss_coeff = vf_loss_coeff - if entropy_coeff is not NotProvided: - self.entropy_coeff = entropy_coeff - if clip_param is not NotProvided: - self.clip_param = clip_param - if vf_clip_param is not NotProvided: - self.vf_clip_param = vf_clip_param - if grad_clip is not NotProvided: - self.grad_clip = grad_clip - if kl_target is not NotProvided: - self.kl_target = kl_target - if inner_adaptation_steps is not NotProvided: - self.inner_adaptation_steps = inner_adaptation_steps - if maml_optimizer_steps is not NotProvided: - self.maml_optimizer_steps = maml_optimizer_steps - if inner_lr is not NotProvided: - self.inner_lr = inner_lr - if dynamics_model is not NotProvided: - self.dynamics_model.update(dynamics_model) - if custom_vector_env is not NotProvided: - self.custom_vector_env = custom_vector_env - if num_maml_steps is not NotProvided: - self.num_maml_steps = num_maml_steps - - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - if self.num_gpus > 1: - raise ValueError("`num_gpus` > 1 not yet supported for MB-MPO!") - if self.framework_str != "torch": - raise ValueError( - "MB-MPO only supported in PyTorch so far! Try setting config. " - "framework('torch')." - ) - if self.inner_adaptation_steps <= 0: - raise ValueError("Inner adaptation steps must be >=1!") - if self.maml_optimizer_steps <= 0: - raise ValueError("PPO steps for meta-update needs to be >=0!") - if self.entropy_coeff < 0: - raise ValueError("`entropy_coeff` must be >=0.0!") - if self.batch_mode != "complete_episodes": - raise ValueError("`batch_mode=truncate_episodes` not supported!") - if self.num_rollout_workers <= 0: - raise ValueError("Must have at least 1 worker/task.") - if self.create_env_on_local_worker is False: - raise ValueError( - "Must have an actual Env created on the local worker process!" - "Try setting `config.environment(" - "create_env_on_local_worker=True)`." - ) - - -# Select Metric Keys for MAML Stats Tracing -METRICS_KEYS = ["episode_reward_mean", "episode_reward_min", "episode_reward_max"] - - -class MetaUpdate: - def __init__(self, workers, num_steps, maml_steps, metric_gen): - """Computes the MetaUpdate step in MAML. - - Adapted for MBMPO for multiple MAML Iterations. - - Args: - workers: Set of Workers - num_steps: Number of meta-update steps per MAML Iteration - maml_steps: MAML Iterations per MBMPO Iteration - metric_gen: Generates metrics dictionary - - Returns: - metrics: MBMPO metrics for logging. - """ - self.workers = workers - self.num_steps = num_steps - self.step_counter = 0 - self.maml_optimizer_steps = maml_steps - self.metric_gen = metric_gen - self.metrics = {} - - def __call__(self, data_tuple): - """Args: - data_tuple: 1st element is samples collected from MAML - Inner adaptation steps and 2nd element is accumulated metrics - """ - # Metaupdate Step. - print("Meta-Update Step") - samples = data_tuple[0] - adapt_metrics_dict = data_tuple[1] - self.postprocess_metrics( - adapt_metrics_dict, prefix="MAMLIter{}".format(self.step_counter) - ) - - # MAML Meta-update. - fetches = None - for i in range(self.maml_optimizer_steps): - fetches = self.workers.local_worker().learn_on_batch(samples) - learner_stats = get_learner_stats(fetches) - - # Update KLs. 
- def update(pi, pi_id): - assert "inner_kl" not in learner_stats, ( - "inner_kl should be nested under policy id key", - learner_stats, - ) - if pi_id in learner_stats: - assert "inner_kl" in learner_stats[pi_id], (learner_stats, pi_id) - pi.update_kls(learner_stats[pi_id]["inner_kl"]) - else: - logger.warning("No data for {}, not updating kl".format(pi_id)) - - self.workers.local_worker().foreach_policy_to_train(update) - - # Modify Reporting Metrics. - metrics = _get_shared_metrics() - metrics.info[LEARNER_INFO] = fetches - metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = samples.count - metrics.counters[STEPS_TRAINED_COUNTER] += samples.count - - if self.step_counter == self.num_steps - 1: - td_metric = self.workers.local_worker().foreach_policy(fit_dynamics)[0] - - # Sync workers with meta policy. - self.workers.sync_weights() - - # Sync TD Models with workers. - sync_ensemble(self.workers) - sync_stats(self.workers) - - metrics.counters[STEPS_SAMPLED_COUNTER] = td_metric[STEPS_SAMPLED_COUNTER] - - # Modify to CollectMetrics. - res = self.metric_gen.__call__(None) - res.update(self.metrics) - self.step_counter = 0 - print("MB-MPO Iteration Completed") - return [res] - else: - print("MAML Iteration {} Completed".format(self.step_counter)) - self.step_counter += 1 - - # Sync workers with meta policy - print("Syncing Weights with Workers") - self.workers.sync_weights() - return [] - - def postprocess_metrics(self, metrics, prefix=""): - """Appends prefix to current metrics - - Args: - metrics: Dictionary of current metrics - prefix: Prefix string to be appended - """ - for key in metrics.keys(): - self.metrics[prefix + "_" + key] = metrics[key] - - -def post_process_metrics(prefix, workers, metrics): - """Update current dataset metrics and filter out specific keys. - - Args: - prefix: Prefix string to be appended - workers: Set of workers - metrics: Current metrics dictionary - """ - res = collect_metrics(workers=workers) - for key in METRICS_KEYS: - metrics[prefix + "_" + key] = res[key] - return metrics - - -def inner_adaptation(workers: WorkerSet, samples: List[SampleBatch]): - """Performs one gradient descend step on each remote worker. - - Args: - workers: The WorkerSet of the Algorithm. - samples (List[SampleBatch]): The list of SampleBatches to perform - a training step on (one for each remote worker). - """ - - for i, e in enumerate(workers.remote_workers()): - e.learn_on_batch.remote(samples[i]) - - -def fit_dynamics(policy, pid): - return policy.dynamics_model.fit() - - -def sync_ensemble(workers: WorkerSet) -> None: - """Syncs dynamics ensemble weights from driver (main) to workers. - - Args: - workers: Set of workers, including driver (main). 
- """ - - def get_ensemble_weights(worker): - policy_map = worker.policy_map - policies = policy_map.keys() - - def policy_ensemble_weights(policy): - model = policy.dynamics_model - return {k: v.cpu().detach().numpy() for k, v in model.state_dict().items()} - - return { - pid: policy_ensemble_weights(policy) - for pid, policy in policy_map.items() - if pid in policies - } - - def set_ensemble_weights(policy, pid, weights): - weights = weights[pid] - weights = convert_to_torch_tensor(weights, device=policy.device) - model = policy.dynamics_model - model.load_state_dict(weights) - - if workers.remote_workers(): - weights = ray.put(get_ensemble_weights(workers.local_worker())) - set_func = ray.put(set_ensemble_weights) - for e in workers.remote_workers(): - e.foreach_policy.remote(set_func, weights=weights) - - -def sync_stats(workers: WorkerSet) -> None: - def get_normalizations(worker): - policy = worker.policy_map[DEFAULT_POLICY_ID] - return policy.dynamics_model.normalizations - - def set_normalizations(policy, pid, normalizations): - policy.dynamics_model.set_norms(normalizations) - - if workers.remote_workers(): - normalization_dict = ray.put(get_normalizations(workers.local_worker())) - set_func = ray.put(set_normalizations) - for e in workers.remote_workers(): - e.foreach_policy.remote(set_func, normalizations=normalization_dict) - - -def post_process_samples(samples, config: AlgorithmConfig): - # Instead of using NN for value function, we use regression - split_lst = [] - for sample in samples: - indexes = np.asarray(sample[SampleBatch.TERMINATEDS]).nonzero()[0] - indexes = indexes + 1 - - reward_list = np.split(sample[SampleBatch.REWARDS], indexes)[:-1] - observation_list = np.split(sample[SampleBatch.OBS], indexes)[:-1] - - paths = [] - for i in range(0, len(reward_list)): - paths.append( - {"rewards": reward_list[i], "observations": observation_list[i]} - ) - - paths = calculate_gae_advantages(paths, config["gamma"], config["lambda"]) - - advantages = np.concatenate([path["advantages"] for path in paths]) - sample["advantages"] = standardized(advantages) - split_lst.append(sample.count) - return samples, split_lst - - -class MBMPO(Algorithm): - """Model-Based Meta Policy Optimization (MB-MPO) Algorithm. - - This file defines the distributed Algorithm class for model-based meta - policy optimization. - See `mbmpo_[tf|torch]_policy.py` for the definition of the policy loss. - - Detailed documentation: - https://docs.ray.io/en/master/rllib-algorithms.html#mbmpo - """ - - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return MBMPOConfig() - - @classmethod - @override(Algorithm) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - from rllib_mbmpo.mbmpo.mbmpo_torch_policy import MBMPOTorchPolicy - - return MBMPOTorchPolicy - - @staticmethod - @override(Algorithm) - def execution_plan( - workers: WorkerSet, config: AlgorithmConfig, **kwargs - ) -> LocalIterator[dict]: - assert ( - len(kwargs) == 0 - ), "MBMPO execution_plan does NOT take any additional parameters" - - # Train TD Models on the driver. - workers.local_worker().foreach_policy(fit_dynamics) - - # Sync driver's policy with workers. - workers.sync_weights() - - # Sync TD Models and normalization stats with workers - sync_ensemble(workers) - sync_stats(workers) - - # Dropping metrics from the first iteration - _ = collect_episodes(workers=workers, timeout_seconds=9999) - - # Metrics Collector. 
- metric_collect = CollectMetrics( - workers, - min_history=0, - timeout_seconds=config.metrics_episode_collection_timeout_s, - ) - - num_inner_steps = config.inner_adaptation_steps - - def inner_adaptation_steps(itr): - buf = [] - split = [] - metrics = {} - for samples in itr: - print("Collecting Samples, Inner Adaptation {}".format(len(split))) - # Processing Samples (Standardize Advantages) - samples = [convert_ma_batch_to_sample_batch(batch) for batch in samples] - samples, split_lst = post_process_samples(samples, config) - - buf.extend(samples) - split.append(split_lst) - - adapt_iter = len(split) - 1 - prefix = "DynaTrajInner_" + str(adapt_iter) - metrics = post_process_metrics(prefix, workers, metrics) - - if len(split) > num_inner_steps: - out = concat_samples(buf) - out["split"] = np.array(split) - buf = [] - split = [] - - yield out, metrics - metrics = {} - else: - inner_adaptation(workers, samples) - - # Iterator for Inner Adaptation Data gathering (from pre->post - # adaptation). - rollouts = from_actors(workers.remote_workers()) - rollouts = rollouts.batch_across_shards() - rollouts = rollouts.transform(inner_adaptation_steps) - - # Meta update step with outer combine loop for multiple MAML - # iterations. - train_op = rollouts.combine( - MetaUpdate( - workers, - config.num_maml_steps, - config.maml_optimizer_steps, - metric_collect, - ) - ) - return train_op - - @staticmethod - @override(Algorithm) - def validate_env(env: EnvType, env_context: EnvContext) -> None: - """Validates the local_worker's env object (after creation). - - Args: - env: The env object to check (for worker=0 only). - env_context: The env context used for the instantiation of - the local worker's env (worker=0). - - Raises: - ValueError: In case something is wrong with the config. - """ - if not hasattr(env, "reward") or not callable(env.reward): - raise ValueError( - f"Env {env} doest not have a `reward()` method, needed for " - "MB-MPO! This `reward()` method should return " - ) diff --git a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/mbmpo_torch_policy.py b/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/mbmpo_torch_policy.py deleted file mode 100644 index 38585c44f03ed..0000000000000 --- a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/mbmpo_torch_policy.py +++ /dev/null @@ -1,89 +0,0 @@ -import logging -from typing import Tuple, Type - -from gymnasium.spaces import Box, Discrete - -from ray.rllib.algorithms.maml.maml_torch_policy import MAMLTorchPolicy -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper -from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import get_device - -torch, nn = try_import_torch() - -logger = logging.getLogger(__name__) - - -class MBMPOTorchPolicy(MAMLTorchPolicy): - def __init__(self, observation_space, action_space, config): - # Validate spaces. - # Only support single Box or single Discrete spaces. - if not isinstance(action_space, (Box, Discrete)): - raise UnsupportedSpaceException( - "Action space ({}) of {} is not supported for " - "MB-MPO. Must be [Box|Discrete].".format(action_space, self) - ) - # If Box, make sure it's a 1D vector space. - elif isinstance(action_space, Box) and len(action_space.shape) > 1: - raise UnsupportedSpaceException( - "Action space ({}) of {} has multiple dimensions " - "{}. 
".format(action_space, self, action_space.shape) - + "Consider reshaping this into a single dimension Box space " - "or using the multi-agent API." - ) - - super().__init__(observation_space, action_space, config) - - def make_model_and_action_dist( - self, - ) -> Tuple[ModelV2, Type[TorchDistributionWrapper]]: - """Constructs the necessary ModelV2 and action dist class for the Policy. - - Args: - obs_space (gym.spaces.Space): The observation space. - action_space (gym.spaces.Space): The action space. - config: The SAC trainer's config dict. - - Returns: - ModelV2: The ModelV2 to be used by the Policy. Note: An additional - target model will be created in this function and assigned to - `policy.target_model`. - """ - # Get the output distribution class for predicting rewards and next-obs. - self.distr_cls_next_obs, num_outputs = ModelCatalog.get_action_dist( - self.observation_space, - self.config, - dist_type="deterministic", - framework="torch", - ) - - # Build one dynamics model if we are a Worker. - # If we are the main MAML learner, build n (num_workers) dynamics Models - # for being able to create checkpoints for the current state of training. - device = get_device(self.config) - - self.dynamics_model = ModelCatalog.get_model_v2( - self.observation_space, - self.action_space, - num_outputs=num_outputs, - model_config=self.config["dynamics_model"], - framework="torch", - name="dynamics_ensemble", - ).to(device) - - action_dist, num_outputs = ModelCatalog.get_action_dist( - self.action_space, self.config, framework="torch" - ) - # Create the pi-model and register it with the Policy. - self.pi = ModelCatalog.get_model_v2( - self.observation_space, - self.action_space, - num_outputs=num_outputs, - model_config=self.config["model"], - framework="torch", - name="policy_model", - ) - - return self.pi, action_dist diff --git a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/model_ensemble.py b/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/model_ensemble.py deleted file mode 100644 index 6a9288281b60d..0000000000000 --- a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/model_ensemble.py +++ /dev/null @@ -1,357 +0,0 @@ -import gymnasium as gym -import numpy as np -from gymnasium.spaces import Box, Discrete - -from ray.rllib.evaluation.rollout_worker import get_global_worker -from ray.rllib.execution.common import STEPS_SAMPLED_COUNTER -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.policy.sample_batch import SampleBatch, convert_ma_batch_to_sample_batch -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import convert_to_torch_tensor -from ray.rllib.utils.typing import SampleBatchType - -torch, nn = try_import_torch() - - -class TDModel(nn.Module): - """Transition Dynamics Model (FC Network with Weight Norm)""" - - def __init__( - self, - input_size, - output_size, - hidden_layers=(512, 512), - hidden_nonlinearity=None, - output_nonlinearity=None, - weight_normalization=False, - use_bias=True, - ): - - super().__init__() - assert len(hidden_layers) >= 1 - - if not hidden_nonlinearity: - hidden_nonlinearity = nn.ReLU - - if weight_normalization: - weight_norm = nn.utils.weight_norm - - self.layers = [] - cur_size = input_size - for h_size in hidden_layers: - layer = nn.Linear(cur_size, h_size, bias=use_bias) - if weight_normalization: - layer = weight_norm(layer) - self.layers.append(layer) - if hidden_nonlinearity: - self.layers.append(hidden_nonlinearity()) - cur_size = h_size - - layer = nn.Linear(cur_size, output_size, bias=use_bias) - 
if weight_normalization: - layer = weight_norm(layer) - self.layers.append(layer) - if output_nonlinearity: - self.layers.append(output_nonlinearity()) - - self.model = nn.Sequential(*self.layers) - - def forward(self, x): - return self.model(x) - - -if torch: - - class TDDataset(torch.utils.data.Dataset): - def __init__(self, dataset: SampleBatchType, norms): - self.count = dataset.count - obs = dataset[SampleBatch.CUR_OBS] - actions = dataset[SampleBatch.ACTIONS] - delta = dataset[SampleBatch.NEXT_OBS] - obs - - if norms: - obs = normalize(obs, norms[SampleBatch.CUR_OBS]) - actions = normalize(actions, norms[SampleBatch.ACTIONS]) - delta = normalize(delta, norms["delta"]) - - self.x = np.concatenate([obs, actions], axis=1) - self.y = delta - - def __len__(self): - return self.count - - def __getitem__(self, index): - return self.x[index], self.y[index] - - -def normalize(data_array, stats): - mean, std = stats - return (data_array - mean) / (std + 1e-10) - - -def denormalize(data_array, stats): - mean, std = stats - return data_array * (std + 1e-10) + mean - - -def mean_std_stats(dataset: SampleBatchType): - norm_dict = {} - obs = dataset[SampleBatch.CUR_OBS] - act = dataset[SampleBatch.ACTIONS] - delta = dataset[SampleBatch.NEXT_OBS] - obs - - norm_dict[SampleBatch.CUR_OBS] = (np.mean(obs, axis=0), np.std(obs, axis=0)) - norm_dict[SampleBatch.ACTIONS] = (np.mean(act, axis=0), np.std(act, axis=0)) - norm_dict["delta"] = (np.mean(delta, axis=0), np.std(delta, axis=0)) - - return norm_dict - - -def process_samples(samples: SampleBatchType): - filter_keys = [SampleBatch.CUR_OBS, SampleBatch.ACTIONS, SampleBatch.NEXT_OBS] - filtered = {} - for key in filter_keys: - filtered[key] = samples[key] - return SampleBatch(filtered) - - -class DynamicsEnsembleCustomModel(TorchModelV2, nn.Module): - """Represents an ensemble of transition dynamics (TD) models.""" - - def __init__(self, obs_space, action_space, num_outputs, model_config, name): - """Initializes a DynamicEnsemble object.""" - nn.Module.__init__(self) - if isinstance(action_space, Discrete): - input_space = gym.spaces.Box( - obs_space.low[0], - obs_space.high[0], - shape=(obs_space.shape[0] + action_space.n,), - ) - elif isinstance(action_space, Box): - input_space = gym.spaces.Box( - obs_space.low[0], - obs_space.high[0], - shape=(obs_space.shape[0] + action_space.shape[0],), - ) - else: - raise NotImplementedError - super(DynamicsEnsembleCustomModel, self).__init__( - input_space, action_space, num_outputs, model_config, name - ) - - # Keep the original Env's observation space for possible clipping. 
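The `normalize()`/`denormalize()` helpers above are exact inverses up to the `1e-10` stabilizer, using the per-dimension statistics produced by `mean_std_stats()`. A small standalone check of that round trip:

```python
import numpy as np

# Mirror normalize()/denormalize() with per-column (mean, std) stats, as
# mean_std_stats() computes them for obs, actions, and next-obs deltas.
data = np.random.randn(100, 3).astype(np.float32)
stats = (data.mean(axis=0), data.std(axis=0))

normed = (data - stats[0]) / (stats[1] + 1e-10)     # normalize(data, stats)
restored = normed * (stats[1] + 1e-10) + stats[0]   # denormalize(normed, stats)

assert np.allclose(data, restored, atol=1e-5)
```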
- self.env_obs_space = obs_space - - self.num_models = model_config["ensemble_size"] - self.max_epochs = model_config["train_epochs"] - self.lr = model_config["lr"] - self.valid_split = model_config["valid_split_ratio"] - self.batch_size = model_config["batch_size"] - self.normalize_data = model_config["normalize_data"] - self.normalizations = {} - self.dynamics_ensemble = [ - TDModel( - input_size=input_space.shape[0], - output_size=obs_space.shape[0], - hidden_layers=model_config["fcnet_hiddens"], - hidden_nonlinearity=nn.ReLU, - output_nonlinearity=None, - weight_normalization=True, - ) - for _ in range(self.num_models) - ] - - for i in range(self.num_models): - self.add_module("TD-model-" + str(i), self.dynamics_ensemble[i]) - self.replay_buffer_max = 10000 - self.replay_buffer = None - self.optimizers = [ - torch.optim.Adam(self.dynamics_ensemble[i].parameters(), lr=self.lr) - for i in range(self.num_models) - ] - # Metric Reporting - self.metrics = {} - self.metrics[STEPS_SAMPLED_COUNTER] = 0 - - # For each worker, choose a random model to choose trajectories from - worker_index = get_global_worker().worker_index - self.sample_index = int((worker_index - 1) / self.num_models) - self.global_itr = 0 - - def forward(self, x): - """Outputs the delta between next and current observation.""" - return self.dynamics_ensemble[self.sample_index](x) - - # Loss functions for each TD model in Ensemble (Standard L2 Loss) - def loss(self, x, y): - xs = torch.chunk(x, self.num_models) - ys = torch.chunk(y, self.num_models) - return [ - torch.mean(torch.pow(self.dynamics_ensemble[i](xs[i]) - ys[i], 2.0)) - for i in range(self.num_models) - ] - - # Fitting Dynamics Ensembles per MBMPO Iter - def fit(self): - # Add env samples to Replay Buffer - local_worker = get_global_worker() - for pid, pol in local_worker.policy_map.items(): - pol.view_requirements[SampleBatch.NEXT_OBS].used_for_training = True - new_samples = local_worker.sample() - new_samples = convert_ma_batch_to_sample_batch(new_samples) - # Initial Exploration of 8000 timesteps - if not self.global_itr: - extra = local_worker.sample() - extra = convert_ma_batch_to_sample_batch(extra) - new_samples.concat(extra) - - # Process Samples - new_samples = process_samples(new_samples) - if isinstance(self.action_space, Discrete): - act = new_samples["actions"] - new_act = np.zeros((act.size, act.max() + 1)) - new_act[np.arange(act.size), act] = 1 - new_samples["actions"] = new_act.astype("float32") - - if not self.replay_buffer: - self.replay_buffer = new_samples - else: - self.replay_buffer = self.replay_buffer.concat(new_samples) - - # Keep Replay Buffer Size Constant - self.replay_buffer = self.replay_buffer.slice( - start=-self.replay_buffer_max, end=None - ) - - if self.normalize_data: - self.normalizations = mean_std_stats(self.replay_buffer) - - # Keep Track of Timesteps from Real Environment Timesteps Sampled - self.metrics[STEPS_SAMPLED_COUNTER] += new_samples.count - - # Create Train and Val Datasets for each TD model - train_loaders = [] - val_loaders = [] - for i in range(self.num_models): - t, v = self.split_train_val(self.replay_buffer) - train_loaders.append( - torch.utils.data.DataLoader( - TDDataset(t, self.normalizations), - batch_size=self.batch_size, - shuffle=True, - ) - ) - val_loaders.append( - torch.utils.data.DataLoader( - TDDataset(v, self.normalizations), batch_size=v.count, shuffle=False - ) - ) - - # List of which models in ensemble to train - indexes = list(range(self.num_models)) - - valid_loss_roll_avg = None - 
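The ensemble `loss()` above splits each training batch into `ensemble_size` contiguous chunks and fits every member on its own chunk, which is what decorrelates the models. A standalone sketch of that chunked L2 loss and one optimizer step per member:

```python
import torch
import torch.nn as nn

num_models = 3
models = [nn.Linear(6, 4) for _ in range(num_models)]
optims = [torch.optim.Adam(m.parameters(), lr=1e-3) for m in models]

x = torch.randn(30, 6)                      # 30 samples -> 3 chunks of 10
y = torch.randn(30, 4)
xs, ys = torch.chunk(x, num_models), torch.chunk(y, num_models)

# One training step per ensemble member, each on its own data chunk.
losses = [torch.mean((models[i](xs[i]) - ys[i]) ** 2) for i in range(num_models)]
for opt, loss in zip(optims, losses):
    opt.zero_grad()
    loss.backward()
    opt.step()
print([float(loss) for loss in losses])
```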
roll_avg_persitency = 0.95 - - def convert_to_str(lst): - return " ".join([str(elem) for elem in lst]) - - device = next(iter(self.dynamics_ensemble[i].parameters()))[0].device - for epoch in range(self.max_epochs): - # Training - for data in zip(*train_loaders): - x = torch.cat([d[0] for d in data], dim=0).to(device) - y = torch.cat([d[1] for d in data], dim=0).to(device) - train_losses = self.loss(x, y) - for ind in indexes: - self.optimizers[ind].zero_grad() - train_losses[ind].backward() - self.optimizers[ind].step() - - for ind in range(self.num_models): - train_losses[ind] = train_losses[ind].detach().cpu().numpy() - - # Validation - val_lists = [] - for data in zip(*val_loaders): - x = torch.cat([d[0] for d in data], dim=0).to(device) - y = torch.cat([d[1] for d in data], dim=0).to(device) - val_losses = self.loss(x, y) - val_lists.append(val_losses) - - for ind in indexes: - self.optimizers[ind].zero_grad() - - for ind in range(self.num_models): - val_losses[ind] = val_losses[ind].detach().cpu().numpy() - - val_lists = np.array(val_lists) - avg_val_losses = np.mean(val_lists, axis=0) - - if valid_loss_roll_avg is None: - # Make sure that training doesnt end first epoch - valid_loss_roll_avg = 1.5 * avg_val_losses - valid_loss_roll_avg_prev = 2.0 * avg_val_losses - - valid_loss_roll_avg = ( - roll_avg_persitency * valid_loss_roll_avg - + (1.0 - roll_avg_persitency) * avg_val_losses - ) - - print( - "Training Dynamics Ensemble - Epoch #%i:" - "Train loss: %s, Valid Loss: %s, Moving Avg Valid Loss: %s" - % ( - epoch, - convert_to_str(train_losses), - convert_to_str(avg_val_losses), - convert_to_str(valid_loss_roll_avg), - ) - ) - for i in range(self.num_models): - if ( - valid_loss_roll_avg_prev[i] < valid_loss_roll_avg[i] - or epoch == self.max_epochs - 1 - ) and i in indexes: - indexes.remove(i) - print("Stopping Training of Model %i" % i) - valid_loss_roll_avg_prev = valid_loss_roll_avg - if len(indexes) == 0: - break - - self.global_itr += 1 - # Returns Metric Dictionary - return self.metrics - - def split_train_val(self, samples: SampleBatchType): - dataset_size = samples.count - indices = np.arange(dataset_size) - np.random.shuffle(indices) - split_idx = int(dataset_size * (1 - self.valid_split)) - idx_train = indices[:split_idx] - idx_test = indices[split_idx:] - - train = {} - val = {} - for key in samples.keys(): - train[key] = samples[key][idx_train, :] - val[key] = samples[key][idx_test, :] - return SampleBatch(train), SampleBatch(val) - - def predict_model_batches(self, obs, actions, device=None): - """Used by worker who gather trajectories via TD models.""" - pre_obs = obs - if self.normalize_data: - obs = normalize(obs, self.normalizations[SampleBatch.CUR_OBS]) - actions = normalize(actions, self.normalizations[SampleBatch.ACTIONS]) - x = np.concatenate([obs, actions], axis=-1) - x = convert_to_torch_tensor(x, device=device) - delta = self.forward(x).detach().cpu().numpy() - if self.normalize_data: - delta = denormalize(delta, self.normalizations["delta"]) - new_obs = pre_obs + delta - clipped_obs = np.clip(new_obs, self.env_obs_space.low, self.env_obs_space.high) - return clipped_obs - - def set_norms(self, normalization_dict): - self.normalizations = normalization_dict diff --git a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/utils.py b/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/utils.py deleted file mode 100644 index b9ddeac3926f5..0000000000000 --- a/rllib_contrib/mbmpo/src/rllib_mbmpo/mbmpo/utils.py +++ /dev/null @@ -1,104 +0,0 @@ -import numpy as np - -from 
ray.rllib.evaluation.postprocessing import discount_cumsum -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.utils.exploration.stochastic_sampling import StochasticSampling -from ray.rllib.utils.framework import try_import_tf, try_import_torch - -tf1, tf, tfv = try_import_tf() -torch, _ = try_import_torch() - - -class LinearFeatureBaseline: - def __init__(self, reg_coeff=1e-5): - self._coeffs = None - self._reg_coeff = reg_coeff - - def get_param_values(self, **tags): - return self._coeffs - - def set_param_values(self, val, **tags): - self._coeffs = val - - def _features(self, path): - o = np.clip(path["observations"], -10, 10) - ll = len(path["rewards"]) - al = np.arange(ll).reshape(-1, 1) / 100.0 - return np.concatenate( - [o, o**2, al, al**2, al**3, np.ones((ll, 1))], axis=1 - ) - - def fit(self, paths): - featmat = np.concatenate([self._features(path) for path in paths]) - returns = np.concatenate([path["returns"] for path in paths]) - reg_coeff = self._reg_coeff - for _ in range(5): - self._coeffs = np.linalg.lstsq( - featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), - featmat.T.dot(returns), - )[0] - if not np.any(np.isnan(self._coeffs)): - break - reg_coeff *= 10 - - def predict(self, path): - if self._coeffs is None: - return np.zeros(len(path["rewards"])) - return self._features(path).dot(self._coeffs) - - -def calculate_gae_advantages(paths, discount, gae_lambda): - baseline = LinearFeatureBaseline() - - for idx, path in enumerate(paths): - path["returns"] = discount_cumsum(path["rewards"], discount) - - baseline.fit(paths) - all_path_baselines = [baseline.predict(path) for path in paths] - - for idx, path in enumerate(paths): - path_baselines = np.append(all_path_baselines[idx], 0) - deltas = path["rewards"] + discount * path_baselines[1:] - path_baselines[:-1] - path["advantages"] = discount_cumsum(deltas, discount * gae_lambda) - return paths - - -class MBMPOExploration(StochasticSampling): - """Like StochasticSampling, but only worker=0 uses Random for n timesteps.""" - - def __init__( - self, - action_space, - *, - framework: str, - model: ModelV2, - random_timesteps: int = 8000, - **kwargs - ): - """Initializes a MBMPOExploration instance. - - Args: - action_space: The gym action space used by the environment. - framework: One of None, "tf", "torch". - model (ModelV2): The ModelV2 used by the owning Policy. - random_timesteps: The number of timesteps for which to act - completely randomly. Only after this number of timesteps, - actual samples will be drawn to get exploration actions. - NOTE: For MB-MPO, only worker=0 will use this setting. All - other workers will not use random actions ever. - """ - super().__init__( - action_space, - model=model, - framework=framework, - random_timesteps=random_timesteps, - **kwargs - ) - - assert ( - self.framework == "torch" - ), "MBMPOExploration currently only supports torch!" - - # Switch off Random sampling for all non-driver workers. 
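`calculate_gae_advantages()` above combines discounted returns, a linear feature baseline, and `discount_cumsum()` over the TD residuals. As a quick arithmetic anchor, this is the recursion `discount_cumsum()` evaluates; the same values show up again in `pg/tests/test_pg.py` further down:

```python
import numpy as np

# G_t = r_t + gamma * G_{t+1}, computed right-to-left.
def discounted_returns(rewards, gamma):
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

print(discounted_returns([1.0, 1.0, 1.0], 0.99))  # -> [2.9701 1.99   1.    ]
```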
- if self.worker_index > 0: - self.random_timesteps = 0 diff --git a/rllib_contrib/mbmpo/tests/test_mbmpo.py b/rllib_contrib/mbmpo/tests/test_mbmpo.py deleted file mode 100644 index 7c34a6c77fb26..0000000000000 --- a/rllib_contrib/mbmpo/tests/test_mbmpo.py +++ /dev/null @@ -1,57 +0,0 @@ -import unittest - -import rllib_mbmpo.mbmpo.mbmpo as mbmpo -from gymnasium.wrappers import TimeLimit -from rllib_mbmpo.env.mbmpo_env import CartPoleWrapper - -import ray -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_train_results, - framework_iterator, -) -from ray.tune.registry import register_env - - -class TestMBMPO(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - register_env( - "cartpole-mbmpo", - lambda env_ctx: TimeLimit(CartPoleWrapper(), max_episode_steps=200), - ) - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_mbmpo_compilation(self): - """Test whether MBMPO can be built with all frameworks.""" - config = ( - mbmpo.MBMPOConfig() - .environment("cartpole-mbmpo") - .rollouts(num_rollout_workers=2) - .training(dynamics_model={"ensemble_size": 2}) - ) - num_iterations = 1 - - # Test for torch framework (tf not implemented yet). - for _ in framework_iterator(config, frameworks="torch"): - algo = config.build() - - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - - check_compute_single_action(algo, include_prev_action_reward=False) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/mbmpo/tuned_examples/__init__.py b/rllib_contrib/mbmpo/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/mbmpo/tuned_examples/cartpole-mbmpo.yaml b/rllib_contrib/mbmpo/tuned_examples/cartpole-mbmpo.yaml deleted file mode 100644 index fa7c1c5bd3218..0000000000000 --- a/rllib_contrib/mbmpo/tuned_examples/cartpole-mbmpo.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-mbmpo: - env: ray.rllib.examples.env.mbmpo_env.CartPoleWrapper - run: MBMPO - stop: - sampler_results/episode_reward_mean: 190 - training_iteration: 20 - config: - # Only supported in torch right now. - framework: torch - num_envs_per_worker: 20 - inner_adaptation_steps: 1 - maml_optimizer_steps: 8 - gamma: 0.99 - lambda: 1.0 - lr: 0.001 - clip_param: 0.5 - kl_target: 0.003 - kl_coeff: 0.0000000001 - num_workers: 10 - num_gpus: 0 - inner_lr: 0.001 - clip_actions: False - num_maml_steps: 15 - model: - fcnet_hiddens: [32, 32] - free_log_std: True diff --git a/rllib_contrib/mbmpo/tuned_examples/halfcheetah-mbmpo.yaml b/rllib_contrib/mbmpo/tuned_examples/halfcheetah-mbmpo.yaml deleted file mode 100644 index 62f5f782c1497..0000000000000 --- a/rllib_contrib/mbmpo/tuned_examples/halfcheetah-mbmpo.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -halfcheetah-mbmpo: - env: ray.rllib.examples.env.mbmpo_env.HalfCheetahWrapper - run: MBMPO - stop: - training_iteration: 500 - config: - # Only supported in torch right now. 
- framework: torch - num_envs_per_worker: 20 - inner_adaptation_steps: 1 - maml_optimizer_steps: 8 - gamma: 0.99 - lambda: 1.0 - lr: 0.001 - clip_param: 0.5 - kl_target: 0.003 - kl_coeff: 0.0000000001 - num_workers: 20 - num_gpus: 1 - inner_lr: 0.001 - clip_actions: False - num_maml_steps: 15 - model: - fcnet_hiddens: [32, 32] - free_log_std: True diff --git a/rllib_contrib/mbmpo/tuned_examples/hopper-mbmpo.yaml b/rllib_contrib/mbmpo/tuned_examples/hopper-mbmpo.yaml deleted file mode 100644 index c7aedccb87391..0000000000000 --- a/rllib_contrib/mbmpo/tuned_examples/hopper-mbmpo.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -hopper-mbmpo: - env: ray.rllib.examples.env.mbmpo_env.HopperWrapper - run: MBMPO - stop: - training_iteration: 500 - config: - # Only supported in torch right now. - framework: torch - num_envs_per_worker: 20 - inner_adaptation_steps: 1 - maml_optimizer_steps: 8 - gamma: 0.99 - lambda: 1.0 - lr: 0.001 - clip_param: 0.5 - kl_target: 0.003 - kl_coeff: 0.0000000001 - num_workers: 20 - num_gpus: 1 - inner_lr: 0.001 - clip_actions: False - num_maml_steps: 15 - model: - fcnet_hiddens: [32, 32] - free_log_std: True diff --git a/rllib_contrib/mbmpo/tuned_examples/pendulum-mbmpo.yaml b/rllib_contrib/mbmpo/tuned_examples/pendulum-mbmpo.yaml deleted file mode 100644 index db45b295a9dc1..0000000000000 --- a/rllib_contrib/mbmpo/tuned_examples/pendulum-mbmpo.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -pendulum-mbmpo: - env: ray.rllib.examples.env.mbmpo_env.PendulumWrapper - run: MBMPO - stop: - sampler_results/episode_reward_mean: -500 - training_iteration: 50 - config: - # Only supported in torch right now. - framework: torch - num_envs_per_worker: 20 - inner_adaptation_steps: 1 - maml_optimizer_steps: 8 - gamma: 0.99 - lambda: 1.0 - lr: 0.001 - clip_param: 0.5 - kl_target: 0.003 - kl_coeff: 0.0000000001 - num_workers: 10 - num_gpus: 0 - inner_lr: 0.001 - clip_actions: False - num_maml_steps: 15 - model: - fcnet_hiddens: [32, 32] - free_log_std: True diff --git a/rllib_contrib/pg/BUILD b/rllib_contrib/pg/BUILD deleted file mode 100644 index 2493054f1d049..0000000000000 --- a/rllib_contrib/pg/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -# Examples - -py_test( - name = "example_pg_cartpole_v1", - main = "pg_cartpole_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/pg_cartpole_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_cartpole_pg", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib"], - size = "large", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/cartpole-pg.yaml"], - args = ["--dir=pg/tuned_examples"] -) - -# Compilation Tests - -py_test( - name = "test_pg", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_pg.py"] -) diff --git a/rllib_contrib/pg/README.md b/rllib_contrib/pg/README.md deleted file mode 100644 index 6c49d5a753334..0000000000000 --- a/rllib_contrib/pg/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# PG (Vanilla Policy Gradient) - -[PG](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) is the most basic reinforcement learning algorithm that learns a policy by taking a gradient of action log probabilities and -weighting them by the return. This algorithm is also known as REINFORCE. 
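The loss the README describes is the plain REINFORCE objective, L = -E[ log pi(a|s) * R ]. A minimal standalone PyTorch sketch of one gradient step on that objective (not tied to the RLlib policy classes in this package):

```python
import torch

# Three already-collected timesteps: policy logits, taken actions, and their
# discounted returns (gamma = 0.99, matching the values used in tests/test_pg.py).
logits = torch.randn(3, 2, requires_grad=True)
actions = torch.tensor([0, 1, 1])
returns = torch.tensor([2.9701, 1.99, 1.0])

log_probs = torch.log_softmax(logits, dim=-1)[torch.arange(3), actions]
loss = -(log_probs * returns).mean()    # L = -E[ log pi(a|s) * R ]
loss.backward()                          # gradients ascend the expected return
print(float(loss))
```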
- - -## Installation - -``` -conda create -n rllib-pg python=3.10 -conda activate rllib-pg -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[PG Example]() \ No newline at end of file diff --git a/rllib_contrib/pg/examples/pg_cartpole_v1.py b/rllib_contrib/pg/examples/pg_cartpole_v1.py deleted file mode 100644 index 1ddbf204667d5..0000000000000 --- a/rllib_contrib/pg/examples/pg_cartpole_v1.py +++ /dev/null @@ -1,47 +0,0 @@ -import argparse - -from rllib_pg.pg import PG, PGConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - PGConfig() - .rollouts(num_rollout_workers=0) - .framework("torch") - .environment("CartPole-v1") - ) - - stop_reward = 150 - - tuner = tune.Tuner( - PG, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 100000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/pg/pyproject.toml b/rllib_contrib/pg/pyproject.toml deleted file mode 100644 index 535be2c8f4254..0000000000000 --- a/rllib_contrib/pg/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-pg" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gym[accept-rom-license]", "gymnasium[accept-rom-license, atari]==0.26.3", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/pg/requirements.txt b/rllib_contrib/pg/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/pg/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/pg/src/rllib_pg/pg/__init__.py b/rllib_contrib/pg/src/rllib_pg/pg/__init__.py deleted file mode 100644 index 8e12f68ae0547..0000000000000 --- a/rllib_contrib/pg/src/rllib_pg/pg/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from rllib_pg.pg.pg import PG, PGConfig -from rllib_pg.pg.pg_tf_policy import PGTF1Policy, PGTF2Policy -from rllib_pg.pg.pg_torch_policy import PGTorchPolicy - -from ray.tune.registry import register_trainable - -__all__ = ["PGConfig", "PG", "PGTF1Policy", "PGTF2Policy", "PGTorchPolicy"] - -register_trainable("rllib-contrib-pg", PG) diff --git a/rllib_contrib/pg/src/rllib_pg/pg/pg.py b/rllib_contrib/pg/src/rllib_pg/pg/pg.py deleted file mode 100644 index f621b7fa4bc60..0000000000000 --- a/rllib_contrib/pg/src/rllib_pg/pg/pg.py +++ /dev/null @@ -1,142 +0,0 @@ -from typing import List, Optional, Type, Union - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, 
NotProvided -from ray.rllib.policy.policy import Policy -from ray.rllib.utils.annotations import override - - -class PGConfig(AlgorithmConfig): - """Defines a configuration class from which a PG Algorithm can be built. - - Example: - >>> from rllib_pg.pg import PGConfig - >>> config = PGConfig().training(lr=0.01).resources(num_gpus=1) - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from rllib_pg.pg import PGConfig - >>> from ray import air - >>> from ray import tune - >>> config = PGConfig() - >>> # Print out some default values. - >>> print(config.lr) # doctest: +SKIP - 0.0004 - >>> # Update the config object. - >>> config = config.training(lr=tune.grid_search([0.001, 0.0001])) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "PG", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a PGConfig instance.""" - super().__init__(algo_class=algo_class or PG) - - # fmt: off - # __sphinx_doc_begin__ - # Override some of AlgorithmConfig's default values with PG-specific values. - self.lr_schedule = None - self.lr = 0.0004 - self.rollout_fragment_length = "auto" - self.train_batch_size = 200 - self._disable_preprocessor_api = True - self.exploration_config = { - # The Exploration class to use. In the simplest case, this is the name - # (str) of any class present in the `rllib.utils.exploration` package. - # You can also provide the python class directly or the full location - # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. - # EpsilonGreedy"). - "type": "StochasticSampling", - # Add constructor kwargs here (if any). - } - # __sphinx_doc_end__ - # fmt: on - - @override(AlgorithmConfig) - def training( - self, - *, - lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, - **kwargs, - ) -> "PGConfig": - """Sets the training related configuration. - - Args: - gamma: Float specifying the discount factor of the Markov Decision process. - lr: The default learning rate. - train_batch_size: Training batch size, if applicable. - model: Arguments passed into the policy model. See models/catalog.py for a - full list of the available model options. - optimizer: Arguments to pass to the policy optimizer. - lr_schedule: Learning rate schedule. In the format of - [[timestep, lr-value], [timestep, lr-value], ...] - Intermediary timesteps will be assigned to interpolated learning rate - values. A schedule should normally start from timestep 0. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - if lr_schedule is not NotProvided: - self.lr_schedule = lr_schedule - - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - # Synchronous sampling, on-policy PG algo -> Check mismatches between - # `rollout_fragment_length` and `train_batch_size` to avoid user confusion. - self.validate_train_batch_size_vs_rollout_fragment_length() - - -class PG(Algorithm): - """Policy Gradient (PG) Trainer. 
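`PGConfig.training()` above documents the `lr_schedule` format: a list of `[timestep, lr]` anchor points with linear interpolation in between, starting at timestep 0. A hedged usage sketch (assuming the `rllib_pg` package from this tree is installed, with its pinned `ray[rllib]==2.5.0`):

```python
from rllib_pg.pg import PGConfig

config = (
    PGConfig()
    .framework("torch")
    .environment("CartPole-v1")
    .rollouts(num_rollout_workers=0)
    .training(
        train_batch_size=200,
        lr=0.0004,
        # Anneal the learning rate linearly from 4e-4 to 5e-5 over 100k timesteps.
        lr_schedule=[[0, 0.0004], [100_000, 0.00005]],
    )
)
algo = config.build()
print(algo.train())
```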
- - Defines the distributed Trainer class for policy gradients. - See `pg_[tf|torch]_policy.py` for the definition of the policy losses for - TensorFlow and PyTorch. - - Detailed documentation: - https://docs.ray.io/en/master/rllib-algorithms.html#pg - - Only overrides the default config- and policy selectors - (`get_default_policy_class` and `get_default_config`). Utilizes - the default `training_step()` method of `Algorithm`. - """ - - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return PGConfig() - - @classmethod - @override(Algorithm) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - from rllib_pg.pg.pg_torch_policy import PGTorchPolicy - - return PGTorchPolicy - elif config.framework_str == "tf": - from rllib_pg.pg.pg_tf_policy import PGTF1Policy - - return PGTF1Policy - else: - from rllib_pg.pg.pg_tf_policy import PGTF2Policy - - return PGTF2Policy diff --git a/rllib_contrib/pg/src/rllib_pg/pg/pg_tf_policy.py b/rllib_contrib/pg/src/rllib_pg/pg/pg_tf_policy.py deleted file mode 100644 index 7a1cfb5434301..0000000000000 --- a/rllib_contrib/pg/src/rllib_pg/pg/pg_tf_policy.py +++ /dev/null @@ -1,159 +0,0 @@ -""" -TensorFlow policy class used for PG. -""" - -import logging -from typing import Dict, List, Optional, Tuple, Type, Union - -from rllib_pg.pg.pg import PGConfig -from rllib_pg.pg.utils import post_process_advantages - -from ray.rllib.evaluation.episode import Episode -from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.models.action_dist import ActionDistribution -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.policy import Policy -from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import LearningRateSchedule -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.typing import AgentID, TensorType, TFPolicyV2Type - -tf1, tf, tfv = try_import_tf() -logger = logging.getLogger(__name__) - - -# We need this builder function because we want to share the same -# custom logics between TF1 dynamic and TF2 eager policies. -def get_pg_tf_policy(name: str, base: TFPolicyV2Type) -> TFPolicyV2Type: - """Construct a PGTFPolicy inheriting either dynamic or eager base policies. - - Args: - base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. - - Returns: - A TF Policy to be used with PGTrainer. - """ - - class PGTFPolicy( - LearningRateSchedule, - base, - ): - def __init__( - self, - observation_space, - action_space, - config: PGConfig, - existing_model=None, - existing_inputs=None, - ): - # First thing first, enable eager execution if necessary. - base.enable_eager_execution_if_necessary() - - # Enforce AlgorithmConfig for PG Policies. - if isinstance(config, dict): - config = PGConfig.from_dict(config) - - # Initialize base class. - base.__init__( - self, - observation_space, - action_space, - config, - existing_inputs=existing_inputs, - existing_model=existing_model, - ) - - LearningRateSchedule.__init__(self, config.lr, config.lr_schedule) - - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. 
- self.maybe_initialize_optimizer_and_loss() - - @override(base) - def loss( - self, - model: ModelV2, - dist_class: Type[ActionDistribution], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - """The basic policy gradients loss function. - - Calculates the vanilla policy gradient loss based on: - L = -E[ log(pi(a|s)) * A] - - Args: - model: The Model to calculate the loss for. - dist_class: The action distr. class. - train_batch: The training data. - - Returns: - Union[TensorType, List[TensorType]]: A single loss tensor or a list - of loss tensors. - """ - # Pass the training data through our model to get distribution parameters. - dist_inputs, _ = model(train_batch) - - # Create an action distribution object. - action_dist = dist_class(dist_inputs, model) - - # Calculate the vanilla PG loss based on: - # L = -E[ log(pi(a|s)) * A] - loss = -tf.reduce_mean( - action_dist.logp(train_batch[SampleBatch.ACTIONS]) - * tf.cast(train_batch[Postprocessing.ADVANTAGES], dtype=tf.float32) - ) - - self.policy_loss = loss - - return loss - - @override(base) - def postprocess_trajectory( - self, - sample_batch: SampleBatch, - other_agent_batches: Optional[ - Dict[AgentID, Tuple["Policy", SampleBatch]] - ] = None, - episode: Optional["Episode"] = None, - ) -> SampleBatch: - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) - return post_process_advantages( - self, sample_batch, other_agent_batches, episode - ) - - @override(base) - def extra_learn_fetches_fn(self) -> Dict[str, TensorType]: - return { - "learner_stats": {"cur_lr": self.cur_lr}, - } - - @override(base) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - """Returns the calculated loss and learning rate in a stats dict. - - Args: - policy: The Policy object. - train_batch: The data used for training. - - Returns: - Dict[str, TensorType]: The stats dict. - """ - - return { - "policy_loss": self.policy_loss, - "cur_lr": self.cur_lr, - } - - PGTFPolicy.__name__ = name - PGTFPolicy.__qualname__ = name - - return PGTFPolicy - - -PGTF1Policy = get_pg_tf_policy("PGTF1Policy", DynamicTFPolicyV2) -PGTF2Policy = get_pg_tf_policy("PGTF2Policy", EagerTFPolicyV2) diff --git a/rllib_contrib/pg/src/rllib_pg/pg/pg_torch_policy.py b/rllib_contrib/pg/src/rllib_pg/pg/pg_torch_policy.py deleted file mode 100644 index a4644467bbc35..0000000000000 --- a/rllib_contrib/pg/src/rllib_pg/pg/pg_torch_policy.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -PyTorch policy class used for PG. 
-""" -import logging -from typing import Dict, List, Optional, Tuple, Type, Union - -from rllib_pg.pg.pg import PGConfig -from rllib_pg.pg.utils import post_process_advantages - -from ray.rllib.evaluation.episode import Episode -from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper -from ray.rllib.policy import Policy -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import LearningRateSchedule -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.typing import AgentID, TensorType - -torch, nn = try_import_torch() - -logger = logging.getLogger(__name__) - - -class PGTorchPolicy(LearningRateSchedule, TorchPolicyV2): - """PyTorch policy class used with PGTrainer.""" - - def __init__(self, observation_space, action_space, config: PGConfig): - - # Enforce AlgorithmConfig for PG Policies. - if isinstance(config, dict): - config = PGConfig.from_dict(config) - - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config.model["max_seq_len"], - ) - - LearningRateSchedule.__init__(self, config.lr, config.lr_schedule) - - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() - - @override(TorchPolicyV2) - def loss( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - """The basic policy gradients loss function. - - Calculates the vanilla policy gradient loss based on: - L = -E[ log(pi(a|s)) * A] - - Args: - model: The Model to calculate the loss for. - dist_class: The action distr. class. - train_batch: The training data. - - Returns: - Union[TensorType, List[TensorType]]: A single loss tensor or a list - of loss tensors. - """ - # Pass the training data through our model to get distribution parameters. - dist_inputs, _ = model(train_batch) - - # Create an action distribution object. - action_dist = dist_class(dist_inputs, model) - - # Calculate the vanilla PG loss based on: - # L = -E[ log(pi(a|s)) * A] - log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS]) - - # Final policy loss. - policy_loss = -torch.mean(log_probs * train_batch[Postprocessing.ADVANTAGES]) - - # Store values for stats function in model (tower), such that for - # multi-GPU, we do not override them during the parallel loss phase. - model.tower_stats["policy_loss"] = policy_loss - - return policy_loss - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - """Returns the calculated loss in a stats dict. - - Args: - policy: The Policy object. - train_batch: The data used for training. - - Returns: - Dict[str, TensorType]: The stats dict. 
- """ - - return convert_to_numpy( - { - "policy_loss": torch.mean( - torch.stack(self.get_tower_stats("policy_loss")) - ), - "cur_lr": self.cur_lr, - } - ) - - @override(TorchPolicyV2) - def postprocess_trajectory( - self, - sample_batch: SampleBatch, - other_agent_batches: Optional[ - Dict[AgentID, Tuple["Policy", SampleBatch]] - ] = None, - episode: Optional["Episode"] = None, - ) -> SampleBatch: - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) - return post_process_advantages(self, sample_batch, other_agent_batches, episode) diff --git a/rllib_contrib/pg/src/rllib_pg/pg/utils.py b/rllib_contrib/pg/src/rllib_pg/pg/utils.py deleted file mode 100644 index be883d85134de..0000000000000 --- a/rllib_contrib/pg/src/rllib_pg/pg/utils.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import List, Optional - -from ray.rllib.evaluation.episode import Episode -from ray.rllib.evaluation.postprocessing import compute_advantages -from ray.rllib.policy import Policy -from ray.rllib.policy.sample_batch import SampleBatch - - -def post_process_advantages( - policy: Policy, - sample_batch: SampleBatch, - other_agent_batches: Optional[List[SampleBatch]] = None, - episode: Optional[Episode] = None, -) -> SampleBatch: - """Adds the "advantages" column to `sample_batch`. - - Args: - policy: The Policy object to do post-processing for. - sample_batch: The actual sample batch to post-process. - other_agent_batches (Optional[List[SampleBatch]]): Optional list of - other agents' SampleBatch objects. - episode: The multi-agent episode object, from which - `sample_batch` was generated. - - Returns: - SampleBatch: The SampleBatch enhanced by the added ADVANTAGES field. - """ - - # Calculates advantage values based on the rewards in the sample batch. - # The value of the last observation is assumed to be 0.0 (no value function - # estimation at the end of the sampled chunk). - return compute_advantages( - rollout=sample_batch, - last_r=0.0, - gamma=policy.config["gamma"], - use_gae=False, - use_critic=False, - ) diff --git a/rllib_contrib/pg/tests/test_pg.py b/rllib_contrib/pg/tests/test_pg.py deleted file mode 100644 index 1cf8cb26656c3..0000000000000 --- a/rllib_contrib/pg/tests/test_pg.py +++ /dev/null @@ -1,242 +0,0 @@ -import unittest - -import numpy as np -import rllib_pg.pg.pg as pg -from gymnasium.spaces import Box, Dict, Discrete, Tuple -from rllib_pg.pg.utils import post_process_advantages - -import ray -from ray import tune -from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.examples.env.random_env import RandomEnv -from ray.rllib.models.tf.tf_action_dist import Categorical -from ray.rllib.models.torch.torch_action_dist import TorchCategorical -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.metrics.learner_info import ( - DEFAULT_POLICY_ID, - LEARNER_INFO, - LEARNER_STATS_KEY, -) -from ray.rllib.utils.numpy import fc -from ray.rllib.utils.test_utils import ( - check, - check_compute_single_action, - check_train_results, - framework_iterator, -) - - -class TestPG(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_pg_compilation(self): - """Test whether PG can be built with all frameworks.""" - config = pg.PGConfig() - - # Test with filter to see whether they work w/o preprocessing. 
- config.rollouts( - num_rollout_workers=1, - observation_filter="MeanStdFilter", - ).training(train_batch_size=500) - num_iterations = 1 - - image_space = Box(-1.0, 1.0, shape=(84, 84, 3)) - simple_space = Box(-1.0, 1.0, shape=(3,)) - - tune.register_env( - "random_dict_env", - lambda _: RandomEnv( - { - "observation_space": Dict( - { - "a": simple_space, - "b": Discrete(2), - "c": image_space, - } - ), - "action_space": Box(-1.0, 1.0, shape=(1,)), - } - ), - ) - tune.register_env( - "random_tuple_env", - lambda _: RandomEnv( - { - "observation_space": Tuple( - [simple_space, Discrete(2), image_space] - ), - "action_space": Box(-1.0, 1.0, shape=(1,)), - } - ), - ) - - for _ in framework_iterator(config, with_eager_tracing=True): - # Test for different env types (discrete w/ and w/o image, + cont). - for env in [ - "random_dict_env", - "random_tuple_env", - "ALE/MsPacman-v5", - "CartPole-v1", - "FrozenLake-v1", - ]: - print(f"env={env}") - config.environment(env) - - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - - check_compute_single_action(algo, include_prev_action_reward=True) - - def test_pg_loss_functions(self): - """Tests the PG loss function math.""" - config = ( - pg.PGConfig() - .rollouts(num_rollout_workers=0) - .training( - gamma=0.99, - model={ - "fcnet_hiddens": [10], - "fcnet_activation": "linear", - }, - ) - ) - - # Fake CartPole episode of n time steps. - train_batch = SampleBatch( - { - SampleBatch.OBS: np.array( - [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]] - ), - SampleBatch.ACTIONS: np.array([0, 1, 1]), - SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]), - SampleBatch.TERMINATEDS: np.array([False, False, True]), - SampleBatch.EPS_ID: np.array([1234, 1234, 1234]), - SampleBatch.AGENT_INDEX: np.array([0, 0, 0]), - } - ) - - for fw, sess in framework_iterator(config, session=True): - dist_cls = Categorical if fw != "torch" else TorchCategorical - algo = config.build(env="CartPole-v1") - policy = algo.get_policy() - vars = policy.model.trainable_variables() - if sess: - vars = policy.get_session().run(vars) - - # Post-process (calculate simple (non-GAE) advantages) and attach - # to train_batch dict. - # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] = - # [2.9701, 1.99, 1.0] - train_batch_ = post_process_advantages(policy, train_batch.copy()) - if fw == "torch": - train_batch_ = policy._lazy_tensor_dict(train_batch_) - - # Check Advantage values. - check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0]) - - # Actual loss results. - if sess: - results = policy.get_session().run( - policy._loss, - feed_dict=policy._get_loss_inputs_dict(train_batch_, shuffle=False), - ) - else: - - results = policy.loss( - policy.model, dist_class=dist_cls, train_batch=train_batch_ - ) - - # Calculate expected results. 
- if fw != "torch": - expected_logits = fc( - fc(train_batch_[SampleBatch.OBS], vars[0], vars[1], framework=fw), - vars[2], - vars[3], - framework=fw, - ) - else: - expected_logits = fc( - fc(train_batch_[SampleBatch.OBS], vars[2], vars[3], framework=fw), - vars[0], - vars[1], - framework=fw, - ) - expected_logp = dist_cls(expected_logits, policy.model).logp( - train_batch_[SampleBatch.ACTIONS] - ) - adv = train_batch_[Postprocessing.ADVANTAGES] - if sess: - expected_logp = sess.run(expected_logp) - elif fw == "torch": - expected_logp = expected_logp.detach().cpu().numpy() - adv = adv.detach().cpu().numpy() - else: - expected_logp = expected_logp.numpy() - expected_loss = -np.mean(expected_logp * adv) - check(results, expected_loss, decimals=4) - - def test_pg_lr(self): - """Test PG with learning rate schedule.""" - config = pg.PGConfig() - config.reporting( - min_sample_timesteps_per_iteration=10, - # Make sure that results contain info on default policy - min_train_timesteps_per_iteration=10, - # 0 metrics reporting delay, this makes sure timestep, - # which lr depends on, is updated after each worker rollout. - min_time_s_per_iteration=0, - ) - config.rollouts( - num_rollout_workers=1, - ) - config.training( - lr=0.2, - lr_schedule=[[0, 0.2], [500, 0.001]], - train_batch_size=50, - ) - - def _step_n_times(algo, n: int): - """Step trainer n times. - - Returns: - learning rate at the end of the execution. - """ - for _ in range(n): - results = algo.train() - return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][ - "cur_lr" - ] - - for _ in framework_iterator(config): - algo = config.build(env="CartPole-v1") - - lr = _step_n_times(algo, 1) # 50 timesteps - # Close to 0.2 - self.assertGreaterEqual(lr, 0.15) - - lr = _step_n_times(algo, 8) # Close to 500 timesteps - # LR Annealed to 0.001 - self.assertLessEqual(float(lr), 0.5) - - lr = _step_n_times(algo, 2) # > 500 timesteps - # LR == 0.001 - self.assertAlmostEqual(lr, 0.001) - - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/pg/tuned_examples/__init__.py b/rllib_contrib/pg/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/pg/tuned_examples/cartpole-pg-fake-gpus.yaml b/rllib_contrib/pg/tuned_examples/cartpole-pg-fake-gpus.yaml deleted file mode 100644 index a7580d8406e30..0000000000000 --- a/rllib_contrib/pg/tuned_examples/cartpole-pg-fake-gpus.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-pg-fake-gpus: - env: CartPole-v1 - run: PG - stop: - sampler_results/episode_reward_mean: 150 - training_iteration: 600 - config: - # Works for both torch and tf. - framework: torch - num_workers: 0 - - model: - fcnet_hiddens: [64] - fcnet_activation: linear - - # Fake 2 GPUs. - num_gpus: 2 - _fake_gpus: true diff --git a/rllib_contrib/pg/tuned_examples/cartpole-pg.yaml b/rllib_contrib/pg/tuned_examples/cartpole-pg.yaml deleted file mode 100644 index 20156a1844f2e..0000000000000 --- a/rllib_contrib/pg/tuned_examples/cartpole-pg.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-pg: - env: CartPole-v1 - run: PG - stop: - sampler_results/episode_reward_mean: 150 - timesteps_total: 100000 - config: - # Works for both torch and tf. 
- framework: torch - num_workers: 0 diff --git a/rllib_contrib/pg/tuned_examples/frozenlake-pg.yaml b/rllib_contrib/pg/tuned_examples/frozenlake-pg.yaml deleted file mode 100644 index fc7f4d3a1e753..0000000000000 --- a/rllib_contrib/pg/tuned_examples/frozenlake-pg.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -frozenlake-pg: - env: FrozenLake-v1 - run: PG - stop: - sampler_results/episode_reward_mean: 0.99 - timesteps_total: 1000000 - config: - # Works for both torch and tf. - framework: tf - - # Sparse reward environment (short horizon). - env_config: - map_name: 4x4 - #- "SFFFFFFF" - #- "FFFFFFFF" - #- "FFFFFFFF" - #- "FFFFFFFF" - #- "FFFFFFFF" - #- "FFFFFFFF" - #- "FFFFFFFF" - #- "FFFFFFFG" - is_slippery: false - #max_episode_steps: 16 diff --git a/rllib_contrib/qmix/BUILD b/rllib_contrib/qmix/BUILD deleted file mode 100644 index 4b39d6ba461f2..0000000000000 --- a/rllib_contrib/qmix/BUILD +++ /dev/null @@ -1,51 +0,0 @@ -# Examples - -py_test( - name = "example_qmix_two_step_game", - main = "qmix_two_step_game.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/qmix_two_step_game.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_two_step_game_qmix", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], - size = "large", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/two-step-game-qmix.yaml"], - args = ["--dir=qmix/tuned_examples/"] -) - -py_test( - name = "learning_tests_two_step_game_qmix_vdn_mixer", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], - size = "large", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/two-step-game-qmix-vdn-mixer.yaml"], - args = ["--dir=qmix/tuned_examples/"] -) - -py_test( - name = "learning_tests_two_step_game_qmix_no_mixer", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "torch_only"], - size = "large", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/two-step-game-qmix-no-mixer.yaml"], - args = ["--dir=qmix/tuned_examples/"] -) - -# Compilation Tests - -py_test( - name = "test_qmix", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_qmix.py"] -) diff --git a/rllib_contrib/qmix/README.md b/rllib_contrib/qmix/README.md deleted file mode 100644 index 6456f0d5f0e20..0000000000000 --- a/rllib_contrib/qmix/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# QMIX (Monotonic Value Function Factorisation for Multi-Agent RL) - -[QMIX](https://arxiv.org/abs/1803.11485) Q-Mix is a specialized multi-agent algorithm. Code here is adapted from https://github.com/oxwhirl/pymarl_alpha to integrate with RLlib multi-agent APIs. To use Q-Mix, you must specify an agent grouping in the environment (see the two-step game example). Currently, all agents in the group must be homogeneous. The algorithm can be scaled by increasing the number of workers or using Ape-X. 
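The "monotonic value function factorisation" in the algorithm's name is what `mixers.py` further down implements: a hypernetwork maps the global state to mixing weights that are forced non-negative with `torch.abs`, so the joint value `Q_tot` can only increase when any individual agent's Q-value increases, and the greedy joint action therefore decomposes into per-agent argmaxes. A standalone sketch of that property (all shapes and names here are illustrative, not taken from the removed code):

```
import torch
import torch.nn as nn

n_agents, state_dim, embed_dim = 2, 3, 8

# Hypernetwork: the mixing weights are a function of the global state and are
# made non-negative, analogous to hyper_w_1 + torch.abs in the QMixer below.
hyper_w1 = nn.Linear(state_dim, n_agents * embed_dim)
state = torch.randn(1, state_dim)
w1 = torch.abs(hyper_w1(state)).view(n_agents, embed_dim)  # w1 >= 0 elementwise

q_a = torch.tensor([[1.0, 0.0]])  # per-agent Q-values
q_b = torch.tensor([[2.0, 0.0]])  # same, but agent 0's Q-value is larger

# Non-negative weights mean raising one agent's Q-value can never lower the mix.
assert torch.all(q_b @ w1 >= q_a @ w1)
```

The full `QMixer` adds an ELU hidden layer and a state-dependent bias `V(s)`, both of which preserve this monotonicity.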
- - -## Installation - -``` -conda create -n rllib-qmix python=3.10 -conda activate rllib-qmix -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[QMIX Example]() \ No newline at end of file diff --git a/rllib_contrib/qmix/examples/qmix_two_step_game.py b/rllib_contrib/qmix/examples/qmix_two_step_game.py deleted file mode 100644 index 249b883e433c3..0000000000000 --- a/rllib_contrib/qmix/examples/qmix_two_step_game.py +++ /dev/null @@ -1,116 +0,0 @@ -# The two-step game from QMIX: https://arxiv.org/pdf/1803.11485.pdf - -import argparse -import logging - -from gymnasium.spaces import Dict, MultiDiscrete, Tuple -from rllib_qmix.qmix import QMix, QMixConfig - -import ray -from ray import air, tune -from ray.rllib.env.multi_agent_env import ENV_STATE -from ray.rllib.examples.env.two_step_game import TwoStepGame -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune import register_env - -logger = logging.getLogger(__name__) - -parser = argparse.ArgumentParser() -parser.add_argument( - "--mixer", - type=str, - default="qmix", - choices=["qmix", "vdn", "none"], - help="The mixer model to use.", -) -parser.add_argument( - "--run-as-test", - action="store_true", -) - -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=8.0, help="Reward at which we stop training." -) - - -if __name__ == "__main__": - args = parser.parse_args() - - ray.init() - - grouping = { - "group_1": [0, 1], - } - obs_space = Tuple( - [ - Dict( - { - "obs": MultiDiscrete([2, 2, 2, 3]), - ENV_STATE: MultiDiscrete([2, 2, 2]), - } - ), - Dict( - { - "obs": MultiDiscrete([2, 2, 2, 3]), - ENV_STATE: MultiDiscrete([2, 2, 2]), - } - ), - ] - ) - act_space = Tuple( - [ - TwoStepGame.action_space, - TwoStepGame.action_space, - ] - ) - register_env( - "grouped_twostep", - lambda config: TwoStepGame(config).with_agent_groups( - grouping, obs_space=obs_space, act_space=act_space - ), - ) - - config = ( - QMixConfig() - .environment(TwoStepGame) - .framework("torch") - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- .resources() - ) - - ( - config.framework("torch") - .training(mixer=args.mixer, train_batch_size=32) - .rollouts(num_rollout_workers=0, rollout_fragment_length=4) - .exploration( - exploration_config={ - "final_epsilon": 0.0, - } - ) - .environment( - env="grouped_twostep", - env_config={ - "separate_state_space": True, - "one_hot_state_encoding": True, - }, - ) - ) - - stop = { - "episode_reward_mean": args.stop_reward, - "timesteps_total": args.stop_timesteps, - } - - results = tune.Tuner( - QMix, - run_config=air.RunConfig(stop=stop), - param_space=config, - ).fit() - - if args.run_as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() diff --git a/rllib_contrib/qmix/pyproject.toml b/rllib_contrib/qmix/pyproject.toml deleted file mode 100644 index 00ef4d68f57fa..0000000000000 --- a/rllib_contrib/qmix/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-qmix" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium==0.26.3", "ray[rllib]==2.5.1"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "torch==1.12.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "numpy<2"] diff --git a/rllib_contrib/qmix/requirements.txt b/rllib_contrib/qmix/requirements.txt deleted file mode 100644 index b07006a1b4ec6..0000000000000 --- a/rllib_contrib/qmix/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/qmix/src/rllib_qmix/qmix/__init__.py b/rllib_contrib/qmix/src/rllib_qmix/qmix/__init__.py deleted file mode 100644 index 66eff9c905192..0000000000000 --- a/rllib_contrib/qmix/src/rllib_qmix/qmix/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from rllib_qmix.qmix.qmix import QMix, QMixConfig -from rllib_qmix.qmix.qmix_policy import QMixTorchPolicy - -from ray.tune.registry import register_trainable - -__all__ = ["QMixConfig", "QMix", "QMixTorchPolicy"] - -register_trainable("rllib-contrib-qmix", QMix) diff --git a/rllib_contrib/qmix/src/rllib_qmix/qmix/mixers.py b/rllib_contrib/qmix/src/rllib_qmix/qmix/mixers.py deleted file mode 100644 index 4bc6328b800ae..0000000000000 --- a/rllib_contrib/qmix/src/rllib_qmix/qmix/mixers.py +++ /dev/null @@ -1,62 +0,0 @@ -import numpy as np - -from ray.rllib.utils.framework import try_import_torch - -torch, nn = try_import_torch() - - -class VDNMixer(nn.Module): - def __init__(self): - super(VDNMixer, self).__init__() - - def forward(self, agent_qs, batch): - return torch.sum(agent_qs, dim=2, keepdim=True) - - -class QMixer(nn.Module): - def __init__(self, n_agents, state_shape, mixing_embed_dim): - super(QMixer, self).__init__() - - self.n_agents = n_agents - self.embed_dim = mixing_embed_dim - self.state_dim = int(np.prod(state_shape)) - - self.hyper_w_1 = nn.Linear(self.state_dim, self.embed_dim * self.n_agents) - self.hyper_w_final = nn.Linear(self.state_dim, self.embed_dim) - - # State dependent bias for hidden layer - self.hyper_b_1 = nn.Linear(self.state_dim, self.embed_dim) - - # V(s) instead of a bias for the last layers - self.V = nn.Sequential( - nn.Linear(self.state_dim, self.embed_dim), - nn.ReLU(), - nn.Linear(self.embed_dim, 1), - ) - - def forward(self, agent_qs, states): - """Forward pass for the mixer. 
-
-        Args:
-            agent_qs: Tensor of shape [B, T, n_agents] (chosen-action Q-values)
-            states: Tensor of shape [B, T, state_dim]
-        """
-        bs = agent_qs.size(0)
-        states = states.reshape(-1, self.state_dim)
-        agent_qs = agent_qs.view(-1, 1, self.n_agents)
-        # First layer
-        w1 = torch.abs(self.hyper_w_1(states))
-        b1 = self.hyper_b_1(states)
-        w1 = w1.view(-1, self.n_agents, self.embed_dim)
-        b1 = b1.view(-1, 1, self.embed_dim)
-        hidden = nn.functional.elu(torch.bmm(agent_qs, w1) + b1)
-        # Second layer
-        w_final = torch.abs(self.hyper_w_final(states))
-        w_final = w_final.view(-1, self.embed_dim, 1)
-        # State-dependent bias
-        v = self.V(states).view(-1, 1, 1)
-        # Compute final output
-        y = torch.bmm(hidden, w_final) + v
-        # Reshape and return
-        q_tot = y.view(bs, -1, 1)
-        return q_tot
diff --git a/rllib_contrib/qmix/src/rllib_qmix/qmix/model.py b/rllib_contrib/qmix/src/rllib_qmix/qmix/model.py
deleted file mode 100644
index c39080db8af98..0000000000000
--- a/rllib_contrib/qmix/src/rllib_qmix/qmix/model.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from ray.rllib.models.modelv2 import ModelV2
-from ray.rllib.models.preprocessors import get_preprocessor
-from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
-from ray.rllib.utils.annotations import override
-from ray.rllib.utils.framework import try_import_torch
-
-torch, nn = try_import_torch()
-
-
-class RNNModel(TorchModelV2, nn.Module):
-    """The default RNN model for QMIX."""
-
-    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
-        TorchModelV2.__init__(
-            self, obs_space, action_space, num_outputs, model_config, name
-        )
-        nn.Module.__init__(self)
-        self.obs_size = _get_size(obs_space)
-        self.rnn_hidden_dim = model_config["lstm_cell_size"]
-        self.fc1 = nn.Linear(self.obs_size, self.rnn_hidden_dim)
-        self.rnn = nn.GRUCell(self.rnn_hidden_dim, self.rnn_hidden_dim)
-        self.fc2 = nn.Linear(self.rnn_hidden_dim, num_outputs)
-        self.n_agents = model_config["n_agents"]
-
-    @override(ModelV2)
-    def get_initial_state(self):
-        # Place hidden states on same device as model.
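        # (`weight.new(...)` in the return statement below allocates the initial
        #  hidden state with the same dtype and on the same device as the model's
        #  parameters, so the zeroed GRU state starts out wherever the model lives.)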
- return [ - self.fc1.weight.new(self.n_agents, self.rnn_hidden_dim).zero_().squeeze(0) - ] - - @override(ModelV2) - def forward(self, input_dict, hidden_state, seq_lens): - x = nn.functional.relu(self.fc1(input_dict["obs_flat"].float())) - h_in = hidden_state[0].reshape(-1, self.rnn_hidden_dim) - h = self.rnn(x, h_in) - q = self.fc2(h) - return q, [h] - - -def _get_size(obs_space): - return get_preprocessor(obs_space)(obs_space).size diff --git a/rllib_contrib/qmix/src/rllib_qmix/qmix/qmix.py b/rllib_contrib/qmix/src/rllib_qmix/qmix/qmix.py deleted file mode 100644 index 05af2302a45d9..0000000000000 --- a/rllib_contrib/qmix/src/rllib_qmix/qmix/qmix.py +++ /dev/null @@ -1,324 +0,0 @@ -from typing import Optional, Type - -from rllib_qmix.qmix.qmix_policy import QMixTorchPolicy - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.simple_q.simple_q import SimpleQ, SimpleQConfig -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample -from ray.rllib.execution.train_ops import multi_gpu_train_one_step, train_one_step -from ray.rllib.policy.policy import Policy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning -from ray.rllib.utils.metrics import ( - LAST_TARGET_UPDATE_TS, - NUM_AGENT_STEPS_SAMPLED, - NUM_ENV_STEPS_SAMPLED, - NUM_TARGET_UPDATES, - SAMPLE_TIMER, - SYNCH_WORKER_WEIGHTS_TIMER, -) -from ray.rllib.utils.replay_buffers.utils import ( - sample_min_n_steps_from_buffer, - update_priorities_in_replay_buffer, -) -from ray.rllib.utils.typing import ResultDict - - -class QMixConfig(SimpleQConfig): - """Defines a configuration class from which QMix can be built. - - Example: - >>> from ray.rllib.examples.env.two_step_game import TwoStepGame - >>> from rllib_qmix.qmix import QMixConfig - >>> config = QMixConfig() # doctest: +SKIP - >>> config = config.training(gamma=0.9, lr=0.01, kl_coeff=0.3) # doctest: +SKIP - >>> config = config.resources(num_gpus=0) # doctest: +SKIP - >>> config = config.rollouts(num_rollout_workers=4) # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build an Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env=TwoStepGame) # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.examples.env.two_step_game import TwoStepGame - >>> from rllib_qmix.qmix import QMixConfig - >>> from ray import air - >>> from ray import tune - >>> config = QMixConfig() - >>> # Print out some default values. - >>> print(config.optim_alpha) # doctest: +SKIP - >>> # Update the config object. - >>> config.training( # doctest: +SKIP - ... lr=tune.grid_search([0.001, 0.0001]), optim_alpha=0.97 - ... ) - >>> # Set the config object's env. - >>> config.environment(env=TwoStepGame) # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "QMix", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... 
).fit() - """ - - def __init__(self): - """Initializes a PPOConfig instance.""" - super().__init__(algo_class=QMix) - - # fmt: off - # __sphinx_doc_begin__ - # QMix specific settings: - self.mixer = "qmix" - self.mixing_embed_dim = 32 - self.double_q = True - self.optim_alpha = 0.99 - self.optim_eps = 0.00001 - - self.grad_clip = 10.0 - # Note: Only when using _enable_learner_api=True can the clipping mode be - # configured by the user. On the old API stack, RLlib will always clip by - # global_norm, no matter the value of `grad_clip_by`. - self.grad_clip_by = "global_norm" - - # QMix-torch overrides the TorchPolicy's learn_on_batch w/o specifying a - # alternative `learn_on_loaded_batch` alternative for the GPU. - # TODO: This hack will be resolved once we move all algorithms to the new - # RLModule/Learner APIs. - self.simple_optimizer = True - - # Override some of AlgorithmConfig's default values with QMix-specific values. - # .training() - self.lr = 0.0005 - self.train_batch_size = 32 - self.target_network_update_freq = 500 - self.num_steps_sampled_before_learning_starts = 1000 - self.replay_buffer_config = { - "type": "ReplayBuffer", - # Specify prioritized replay by supplying a buffer type that supports - # prioritization, for example: MultiAgentPrioritizedReplayBuffer. - "prioritized_replay": DEPRECATED_VALUE, - # Size of the replay buffer in batches (not timesteps!). - "capacity": 1000, - # Choosing `fragments` here makes it so that the buffer stores entire - # batches, instead of sequences, episodes or timesteps. - "storage_unit": "fragments", - # Whether to compute priorities on workers. - "worker_side_prioritization": False, - } - self.model = { - "lstm_cell_size": 64, - "max_seq_len": 999999, - } - - # .framework() - self.framework_str = "torch" - - # .rollouts() - self.rollout_fragment_length = 4 - self.batch_mode = "complete_episodes" - - # .reporting() - self.min_time_s_per_iteration = 1 - self.min_sample_timesteps_per_iteration = 1000 - - # .exploration() - self.exploration_config = { - # The Exploration class to use. - "type": "EpsilonGreedy", - # Config for the Exploration class' constructor: - "initial_epsilon": 1.0, - "final_epsilon": 0.01, - # Timesteps over which to anneal epsilon. - "epsilon_timesteps": 40000, - - # For soft_q, use: - # "exploration_config" = { - # "type": "SoftQ" - # "temperature": [float, e.g. 1.0] - # } - } - - # .evaluation() - # Evaluate with epsilon=0 every `evaluation_interval` training iterations. - # The evaluation stats will be reported under the "evaluation" metric key. - self.evaluation( - evaluation_config=AlgorithmConfig.overrides(explore=False) - ) - # __sphinx_doc_end__ - # fmt: on - - self.worker_side_prioritization = DEPRECATED_VALUE - - @override(SimpleQConfig) - def training( - self, - *, - mixer: Optional[str] = NotProvided, - mixing_embed_dim: Optional[int] = NotProvided, - double_q: Optional[bool] = NotProvided, - target_network_update_freq: Optional[int] = NotProvided, - replay_buffer_config: Optional[dict] = NotProvided, - optim_alpha: Optional[float] = NotProvided, - optim_eps: Optional[float] = NotProvided, - grad_clip: Optional[float] = NotProvided, - # Deprecated args. - grad_norm_clipping=DEPRECATED_VALUE, - **kwargs, - ) -> "QMixConfig": - """Sets the training related configuration. - - Args: - mixer: Mixing network. Either "qmix", "vdn", or None. - mixing_embed_dim: Size of the mixing network embedding. - double_q: Whether to use Double_Q learning. 
- target_network_update_freq: Update the target network every - `target_network_update_freq` sample steps. - replay_buffer_config: - optim_alpha: RMSProp alpha. - optim_eps: RMSProp epsilon. - grad_clip: If not None, clip gradients during optimization at - this value. - grad_norm_clipping: Depcrecated in favor of grad_clip - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if grad_norm_clipping != DEPRECATED_VALUE: - deprecation_warning( - old="grad_norm_clipping", - new="grad_clip", - help="Parameter `grad_norm_clipping` has been " - "deprecated in favor of grad_clip in QMix. " - "This is now the same parameter as in other " - "algorithms. `grad_clip` will be overwritten by " - "`grad_norm_clipping={}`".format(grad_norm_clipping), - error=True, - ) - grad_clip = grad_norm_clipping - - if mixer is not NotProvided: - self.mixer = mixer - if mixing_embed_dim is not NotProvided: - self.mixing_embed_dim = mixing_embed_dim - if double_q is not NotProvided: - self.double_q = double_q - if target_network_update_freq is not NotProvided: - self.target_network_update_freq = target_network_update_freq - if replay_buffer_config is not NotProvided: - self.replay_buffer_config = replay_buffer_config - if optim_alpha is not NotProvided: - self.optim_alpha = optim_alpha - if optim_eps is not NotProvided: - self.optim_eps = optim_eps - if grad_clip is not NotProvided: - self.grad_clip = grad_clip - - return self - - @override(SimpleQConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - if self.framework_str != "torch": - raise ValueError( - "Only `config.framework('torch')` supported so far for QMix!" - ) - - -class QMix(SimpleQ): - @classmethod - @override(SimpleQ) - def get_default_config(cls) -> AlgorithmConfig: - return QMixConfig() - - @classmethod - @override(SimpleQ) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - return QMixTorchPolicy - - @override(SimpleQ) - def training_step(self) -> ResultDict: - """QMIX training iteration function. - - - Sample n MultiAgentBatches from n workers synchronously. - - Store new samples in the replay buffer. - - Sample one training MultiAgentBatch from the replay buffer. - - Learn on the training batch. - - Update the target network every `target_network_update_freq` sample steps. - - Return all collected training metrics for the iteration. - - Returns: - The results dict from executing the training iteration. - """ - # Sample n batches from n workers. - with self._timers[SAMPLE_TIMER]: - new_sample_batches = synchronous_parallel_sample( - worker_set=self.workers, concat=False - ) - - for batch in new_sample_batches: - # Update counters. - self._counters[NUM_ENV_STEPS_SAMPLED] += batch.env_steps() - self._counters[NUM_AGENT_STEPS_SAMPLED] += batch.agent_steps() - # Store new samples in the replay buffer. - self.local_replay_buffer.add(batch) - - # Update target network every `target_network_update_freq` sample steps. - cur_ts = self._counters[ - NUM_AGENT_STEPS_SAMPLED - if self.config.count_steps_by == "agent_steps" - else NUM_ENV_STEPS_SAMPLED - ] - - train_results = {} - - if cur_ts > self.config.num_steps_sampled_before_learning_starts: - # Sample n batches from replay buffer until the total number of timesteps - # reaches `train_batch_size`. 
- train_batch = sample_min_n_steps_from_buffer( - replay_buffer=self.local_replay_buffer, - min_steps=self.config.train_batch_size, - count_by_agent_steps=self.config.count_steps_by == "agent_steps", - ) - - # Learn on the training batch. - # Use simple optimizer (only for multi-agent or tf-eager; all other - # cases should use the multi-GPU optimizer, even if only using 1 GPU) - if self.config.get("simple_optimizer") is True: - train_results = train_one_step(self, train_batch) - else: - train_results = multi_gpu_train_one_step(self, train_batch) - - # Update target network every `target_network_update_freq` sample steps. - last_update = self._counters[LAST_TARGET_UPDATE_TS] - if cur_ts - last_update >= self.config.target_network_update_freq: - to_update = self.workers.local_worker().get_policies_to_train() - self.workers.local_worker().foreach_policy_to_train( - lambda p, pid: pid in to_update and p.update_target() - ) - self._counters[NUM_TARGET_UPDATES] += 1 - self._counters[LAST_TARGET_UPDATE_TS] = cur_ts - - update_priorities_in_replay_buffer( - self.local_replay_buffer, self.config, train_batch, train_results - ) - - # Update weights and global_vars - after learning on the local worker - - # on all remote workers. - global_vars = { - "timestep": self._counters[NUM_ENV_STEPS_SAMPLED], - } - # Update remote workers' weights and global vars after learning on local - # worker. - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - self.workers.sync_weights(global_vars=global_vars) - - # Return all collected metrics for the iteration. - return train_results diff --git a/rllib_contrib/qmix/src/rllib_qmix/qmix/qmix_policy.py b/rllib_contrib/qmix/src/rllib_qmix/qmix/qmix_policy.py deleted file mode 100644 index d8731e84571be..0000000000000 --- a/rllib_contrib/qmix/src/rllib_qmix/qmix/qmix_policy.py +++ /dev/null @@ -1,660 +0,0 @@ -import logging -from typing import Dict, List, Optional, Tuple - -import gymnasium as gym -import numpy as np -import tree # pip install dm_tree -from rllib_qmix.qmix.mixers import QMixer, VDNMixer -from rllib_qmix.qmix.model import RNNModel, _get_size - -from ray.rllib.env.multi_agent_env import ENV_STATE -from ray.rllib.env.wrappers.group_agents_wrapper import GROUP_REWARDS -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.models.modelv2 import _unpack_obs -from ray.rllib.models.torch.torch_action_dist import TorchCategorical -from ray.rllib.policy.rnn_sequencing import chop_into_sequences -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy import TorchPolicy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY -from ray.rllib.utils.torch_utils import apply_grad_clipping -from ray.rllib.utils.typing import TensorType - -# Torch must be installed. 
-torch, nn = try_import_torch(error=False) - -logger = logging.getLogger(__name__) - - -class QMixLoss(nn.Module): - def __init__( - self, - model, - target_model, - mixer, - target_mixer, - n_agents, - n_actions, - double_q=True, - gamma=0.99, - ): - nn.Module.__init__(self) - self.model = model - self.target_model = target_model - self.mixer = mixer - self.target_mixer = target_mixer - self.n_agents = n_agents - self.n_actions = n_actions - self.double_q = double_q - self.gamma = gamma - - def forward( - self, - rewards, - actions, - terminated, - mask, - obs, - next_obs, - action_mask, - next_action_mask, - state=None, - next_state=None, - ): - """Forward pass of the loss. - - Args: - rewards: Tensor of shape [B, T, n_agents] - actions: Tensor of shape [B, T, n_agents] - terminated: Tensor of shape [B, T, n_agents] - mask: Tensor of shape [B, T, n_agents] - obs: Tensor of shape [B, T, n_agents, obs_size] - next_obs: Tensor of shape [B, T, n_agents, obs_size] - action_mask: Tensor of shape [B, T, n_agents, n_actions] - next_action_mask: Tensor of shape [B, T, n_agents, n_actions] - state: Tensor of shape [B, T, state_dim] (optional) - next_state: Tensor of shape [B, T, state_dim] (optional) - """ - - # Assert either none or both of state and next_state are given - if state is None and next_state is None: - state = obs # default to state being all agents' observations - next_state = next_obs - elif (state is None) != (next_state is None): - raise ValueError( - "Expected either neither or both of `state` and " - "`next_state` to be given. Got: " - "\n`state` = {}\n`next_state` = {}".format(state, next_state) - ) - - # Calculate estimated Q-Values - mac_out = _unroll_mac(self.model, obs) - - # Pick the Q-Values for the actions taken -> [B * n_agents, T] - chosen_action_qvals = torch.gather( - mac_out, dim=3, index=actions.unsqueeze(3) - ).squeeze(3) - - # Calculate the Q-Values necessary for the target - target_mac_out = _unroll_mac(self.target_model, next_obs) - - # Mask out unavailable actions for the t+1 step - ignore_action_tp1 = (next_action_mask == 0) & (mask == 1).unsqueeze(-1) - target_mac_out[ignore_action_tp1] = -np.inf - - # Max over target Q-Values - if self.double_q: - # Double Q learning computes the target Q values by selecting the - # t+1 timestep action according to the "policy" neural network and - # then estimating the Q-value of that action with the "target" - # neural network - - # Compute the t+1 Q-values to be used in action selection - # using next_obs - mac_out_tp1 = _unroll_mac(self.model, next_obs) - - # mask out unallowed actions - mac_out_tp1[ignore_action_tp1] = -np.inf - - # obtain best actions at t+1 according to policy NN - cur_max_actions = mac_out_tp1.argmax(dim=3, keepdim=True) - - # use the target network to estimate the Q-values of policy - # network's selected actions - target_max_qvals = torch.gather(target_mac_out, 3, cur_max_actions).squeeze( - 3 - ) - else: - target_max_qvals = target_mac_out.max(dim=3)[0] - - assert ( - target_max_qvals.min().item() != -np.inf - ), "target_max_qvals contains a masked action; \ - there may be a state with no valid actions." 
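The branch above is the double-Q estimator: the online ("policy") network picks the t+1 action and the target network evaluates it, which dampens the overestimation bias that a plain max over the target network's own values would introduce. A tiny standalone illustration with made-up numbers (not taken from any test in this PR):

```
import torch

online_q_tp1 = torch.tensor([[1.0, 3.0, 2.0]])  # online network's t+1 Q-values
target_q_tp1 = torch.tensor([[0.5, 1.5, 4.0]])  # target network's t+1 Q-values

# Double Q: select with the online net, evaluate with the target net.
best_actions = online_q_tp1.argmax(dim=1, keepdim=True)            # action 1
double_q_target = target_q_tp1.gather(1, best_actions).squeeze(1)  # 1.5

# Plain Q-learning would instead trust the target net's own maximum.
plain_target = target_q_tp1.max(dim=1)[0]                          # 4.0
print(double_q_target.item(), plain_target.item())
```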
- - # Mix - if self.mixer is not None: - chosen_action_qvals = self.mixer(chosen_action_qvals, state) - target_max_qvals = self.target_mixer(target_max_qvals, next_state) - - # Calculate 1-step Q-Learning targets - targets = rewards + self.gamma * (1 - terminated) * target_max_qvals - - # Td-error - td_error = chosen_action_qvals - targets.detach() - - mask = mask.expand_as(td_error) - - # 0-out the targets that came from padded data - masked_td_error = td_error * mask - - # Normal L2 loss, take mean over actual data - loss = (masked_td_error**2).sum() / mask.sum() - return loss, mask, masked_td_error, chosen_action_qvals, targets - - -class QMixTorchPolicy(TorchPolicy): - """QMix impl. Assumes homogeneous agents for now. - - You must use MultiAgentEnv.with_agent_groups() to group agents - together for QMix. This creates the proper Tuple obs/action spaces and - populates the '_group_rewards' info field. - - Action masking: to specify an action mask for individual agents, use a - dict space with an action_mask key, e.g. {"obs": ob, "action_mask": mask}. - The mask space must be `Box(0, 1, (n_actions,))`. - """ - - def __init__(self, obs_space, action_space, config): - # We want to error out on instantiation and not on import, because tune - # imports all RLlib algorithms when registering them - # TODO (Artur): Find a way to only import algorithms when needed - if not torch: - raise ImportError("Could not import PyTorch, which QMix requires.") - - _validate(obs_space, action_space) - self.framework = "torch" - - self.n_agents = len(obs_space.original_space.spaces) - config["model"]["n_agents"] = self.n_agents - self.n_actions = action_space.spaces[0].n - self.h_size = config["model"]["lstm_cell_size"] - self.has_env_global_state = False - self.has_action_mask = False - - agent_obs_space = obs_space.original_space.spaces[0] - if isinstance(agent_obs_space, gym.spaces.Dict): - space_keys = set(agent_obs_space.spaces.keys()) - if "obs" not in space_keys: - raise ValueError("Dict obs space must have subspace labeled `obs`") - self.obs_size = _get_size(agent_obs_space.spaces["obs"]) - if "action_mask" in space_keys: - mask_shape = tuple(agent_obs_space.spaces["action_mask"].shape) - if mask_shape != (self.n_actions,): - raise ValueError( - "Action mask shape must be {}, got {}".format( - (self.n_actions,), mask_shape - ) - ) - self.has_action_mask = True - if ENV_STATE in space_keys: - self.env_global_state_shape = _get_size( - agent_obs_space.spaces[ENV_STATE] - ) - self.has_env_global_state = True - else: - self.env_global_state_shape = (self.obs_size, self.n_agents) - # The real agent obs space is nested inside the dict - config["model"]["full_obs_space"] = agent_obs_space - agent_obs_space = agent_obs_space.spaces["obs"] - else: - self.obs_size = _get_size(agent_obs_space) - self.env_global_state_shape = (self.obs_size, self.n_agents) - - self.model = ModelCatalog.get_model_v2( - agent_obs_space, - action_space.spaces[0], - self.n_actions, - config["model"], - framework="torch", - name="model", - default_model=RNNModel, - ) - - super().__init__(obs_space, action_space, config, model=self.model) - - self.target_model = ModelCatalog.get_model_v2( - agent_obs_space, - action_space.spaces[0], - self.n_actions, - config["model"], - framework="torch", - name="target_model", - default_model=RNNModel, - ).to(self.device) - - self.exploration = self._create_exploration() - - # Setup the mixer network. 
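        # The branch below wires up one of three options: "qmix" builds the
        # state-conditioned hypernetwork mixer from mixers.py, "vdn" simply sums
        # the agents' Q-values, and None skips mixing entirely (independent
        # Q-learning per agent).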
- if config["mixer"] is None: - self.mixer = None - self.target_mixer = None - elif config["mixer"] == "qmix": - self.mixer = QMixer( - self.n_agents, self.env_global_state_shape, config["mixing_embed_dim"] - ).to(self.device) - self.target_mixer = QMixer( - self.n_agents, self.env_global_state_shape, config["mixing_embed_dim"] - ).to(self.device) - elif config["mixer"] == "vdn": - self.mixer = VDNMixer().to(self.device) - self.target_mixer = VDNMixer().to(self.device) - else: - raise ValueError("Unknown mixer type {}".format(config["mixer"])) - - self.cur_epsilon = 1.0 - self.update_target() # initial sync - - # Setup optimizer - self.params = list(self.model.parameters()) - if self.mixer: - self.params += list(self.mixer.parameters()) - self.loss = QMixLoss( - self.model, - self.target_model, - self.mixer, - self.target_mixer, - self.n_agents, - self.n_actions, - self.config["double_q"], - self.config["gamma"], - ) - from torch.optim import RMSprop - - self.rmsprop_optimizer = RMSprop( - params=self.params, - lr=config["lr"], - alpha=config["optim_alpha"], - eps=config["optim_eps"], - ) - - @override(TorchPolicy) - def compute_actions_from_input_dict( - self, - input_dict: Dict[str, TensorType], - explore: bool = None, - timestep: Optional[int] = None, - **kwargs, - ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: - - obs_batch = input_dict[SampleBatch.OBS] - state_batches = [] - i = 0 - while f"state_in_{i}" in input_dict: - state_batches.append(input_dict[f"state_in_{i}"]) - i += 1 - - explore = explore if explore is not None else self.config["explore"] - obs_batch, action_mask, _ = self._unpack_observation(obs_batch) - # We need to ensure we do not use the env global state - # to compute actions - - # Compute actions - with torch.no_grad(): - q_values, hiddens = _mac( - self.model, - torch.as_tensor(obs_batch, dtype=torch.float, device=self.device), - [ - torch.as_tensor(np.array(s), dtype=torch.float, device=self.device) - for s in state_batches - ], - ) - avail = torch.as_tensor(action_mask, dtype=torch.float, device=self.device) - masked_q_values = q_values.clone() - masked_q_values[avail == 0.0] = -float("inf") - masked_q_values_folded = torch.reshape( - masked_q_values, [-1] + list(masked_q_values.shape)[2:] - ) - actions, _ = self.exploration.get_exploration_action( - action_distribution=TorchCategorical(masked_q_values_folded), - timestep=timestep, - explore=explore, - ) - actions = ( - torch.reshape(actions, list(masked_q_values.shape)[:-1]).cpu().numpy() - ) - hiddens = [s.cpu().numpy() for s in hiddens] - - return tuple(actions.transpose([1, 0])), hiddens, {} - - @override(TorchPolicy) - def compute_actions(self, *args, **kwargs): - return self.compute_actions_from_input_dict(*args, **kwargs) - - @override(TorchPolicy) - def compute_log_likelihoods( - self, - actions, - obs_batch, - state_batches=None, - prev_action_batch=None, - prev_reward_batch=None, - **kwargs, - ): - obs_batch, action_mask, _ = self._unpack_observation(obs_batch) - return np.zeros(obs_batch.size()[0]) - - @override(TorchPolicy) - def learn_on_batch(self, samples): - obs_batch, action_mask, env_global_state = self._unpack_observation( - samples[SampleBatch.CUR_OBS] - ) - ( - next_obs_batch, - next_action_mask, - next_env_global_state, - ) = self._unpack_observation(samples[SampleBatch.NEXT_OBS]) - group_rewards = self._get_group_rewards(samples[SampleBatch.INFOS]) - - input_list = [ - group_rewards, - action_mask, - next_action_mask, - samples[SampleBatch.ACTIONS], - 
samples[SampleBatch.TERMINATEDS], - obs_batch, - next_obs_batch, - ] - if self.has_env_global_state: - input_list.extend([env_global_state, next_env_global_state]) - - output_list, _, seq_lens = chop_into_sequences( - episode_ids=samples[SampleBatch.EPS_ID], - unroll_ids=samples[SampleBatch.UNROLL_ID], - agent_indices=samples[SampleBatch.AGENT_INDEX], - feature_columns=input_list, - state_columns=[], # RNN states not used here - max_seq_len=self.config["model"]["max_seq_len"], - dynamic_max=True, - ) - # These will be padded to shape [B * T, ...] - if self.has_env_global_state: - ( - rew, - action_mask, - next_action_mask, - act, - terminateds, - obs, - next_obs, - env_global_state, - next_env_global_state, - ) = output_list - else: - ( - rew, - action_mask, - next_action_mask, - act, - terminateds, - obs, - next_obs, - ) = output_list - B, T = len(seq_lens), max(seq_lens) - - def to_batches(arr, dtype): - new_shape = [B, T] + list(arr.shape[1:]) - return torch.as_tensor( - np.reshape(arr, new_shape), dtype=dtype, device=self.device - ) - - rewards = to_batches(rew, torch.float) - actions = to_batches(act, torch.long) - obs = to_batches(obs, torch.float).reshape([B, T, self.n_agents, self.obs_size]) - action_mask = to_batches(action_mask, torch.float) - next_obs = to_batches(next_obs, torch.float).reshape( - [B, T, self.n_agents, self.obs_size] - ) - next_action_mask = to_batches(next_action_mask, torch.float) - if self.has_env_global_state: - env_global_state = to_batches(env_global_state, torch.float) - next_env_global_state = to_batches(next_env_global_state, torch.float) - - # TODO(ekl) this treats group termination as individual termination - terminated = ( - to_batches(terminateds, torch.float) - .unsqueeze(2) - .expand(B, T, self.n_agents) - ) - - # Create mask for where index is < unpadded sequence length - filled = np.reshape( - np.tile(np.arange(T, dtype=np.float32), B), [B, T] - ) < np.expand_dims(seq_lens, 1) - mask = ( - torch.as_tensor(filled, dtype=torch.float, device=self.device) - .unsqueeze(2) - .expand(B, T, self.n_agents) - ) - - # Compute loss - loss_out, mask, masked_td_error, chosen_action_qvals, targets = self.loss( - rewards, - actions, - terminated, - mask, - obs, - next_obs, - action_mask, - next_action_mask, - env_global_state, - next_env_global_state, - ) - - # Optimise - self.rmsprop_optimizer.zero_grad() - loss_out.backward() - grad_norm_info = apply_grad_clipping(self, self.rmsprop_optimizer, loss_out) - self.rmsprop_optimizer.step() - - mask_elems = mask.sum().item() - stats = { - "loss": loss_out.item(), - "td_error_abs": masked_td_error.abs().sum().item() / mask_elems, - "q_taken_mean": (chosen_action_qvals * mask).sum().item() / mask_elems, - "target_mean": (targets * mask).sum().item() / mask_elems, - } - stats.update(grad_norm_info) - - return {LEARNER_STATS_KEY: stats} - - @override(TorchPolicy) - def get_initial_state(self): # initial RNN state - return [ - s.expand([self.n_agents, -1]).cpu().numpy() - for s in self.model.get_initial_state() - ] - - @override(TorchPolicy) - def get_weights(self): - return { - "model": self._cpu_dict(self.model.state_dict()), - "target_model": self._cpu_dict(self.target_model.state_dict()), - "mixer": self._cpu_dict(self.mixer.state_dict()) if self.mixer else None, - "target_mixer": self._cpu_dict(self.target_mixer.state_dict()) - if self.mixer - else None, - } - - @override(TorchPolicy) - def set_weights(self, weights): - self.model.load_state_dict(self._device_dict(weights["model"])) - 
self.target_model.load_state_dict(self._device_dict(weights["target_model"])) - if weights["mixer"] is not None: - self.mixer.load_state_dict(self._device_dict(weights["mixer"])) - self.target_mixer.load_state_dict( - self._device_dict(weights["target_mixer"]) - ) - - @override(TorchPolicy) - def get_state(self): - state = self.get_weights() - state["cur_epsilon"] = self.cur_epsilon - return state - - @override(TorchPolicy) - def set_state(self, state): - self.set_weights(state) - self.set_epsilon(state["cur_epsilon"]) - - def update_target(self): - self.target_model.load_state_dict(self.model.state_dict()) - if self.mixer is not None: - self.target_mixer.load_state_dict(self.mixer.state_dict()) - logger.debug("Updated target networks") - - def set_epsilon(self, epsilon): - self.cur_epsilon = epsilon - - def _get_group_rewards(self, info_batch): - group_rewards = np.array( - [info.get(GROUP_REWARDS, [0.0] * self.n_agents) for info in info_batch] - ) - return group_rewards - - def _device_dict(self, state_dict): - return { - k: torch.as_tensor(v, device=self.device) for k, v in state_dict.items() - } - - @staticmethod - def _cpu_dict(state_dict): - return {k: v.cpu().detach().numpy() for k, v in state_dict.items()} - - def _unpack_observation(self, obs_batch): - """Unpacks the observation, action mask, and state (if present) - from agent grouping. - - Returns: - obs (np.ndarray): obs tensor of shape [B, n_agents, obs_size] - mask (np.ndarray): action mask, if any - state (np.ndarray or None): state tensor of shape [B, state_size] - or None if it is not in the batch - """ - - unpacked = _unpack_obs( - np.array(obs_batch, dtype=np.float32), - self.observation_space.original_space, - tensorlib=np, - ) - - if isinstance(unpacked[0], dict): - assert "obs" in unpacked[0] - unpacked_obs = [np.concatenate(tree.flatten(u["obs"]), 1) for u in unpacked] - else: - unpacked_obs = unpacked - - obs = np.concatenate(unpacked_obs, axis=1).reshape( - [len(obs_batch), self.n_agents, self.obs_size] - ) - - if self.has_action_mask: - action_mask = np.concatenate( - [o["action_mask"] for o in unpacked], axis=1 - ).reshape([len(obs_batch), self.n_agents, self.n_actions]) - else: - action_mask = np.ones( - [len(obs_batch), self.n_agents, self.n_actions], dtype=np.float32 - ) - - if self.has_env_global_state: - state = np.concatenate(tree.flatten(unpacked[0][ENV_STATE]), 1) - else: - state = None - return obs, action_mask, state - - -def _validate(obs_space, action_space): - if not hasattr(obs_space, "original_space") or not isinstance( - obs_space.original_space, gym.spaces.Tuple - ): - raise ValueError( - "Obs space must be a Tuple, got {}. Use ".format(obs_space) - + "MultiAgentEnv.with_agent_groups() to group related " - "agents for QMix." - ) - if not isinstance(action_space, gym.spaces.Tuple): - raise ValueError( - "Action space must be a Tuple, got {}. ".format(action_space) - + "Use MultiAgentEnv.with_agent_groups() to group related " - "agents for QMix." 
- ) - if not isinstance(action_space.spaces[0], gym.spaces.Discrete): - raise ValueError( - "QMix requires a discrete action space, got {}".format( - action_space.spaces[0] - ) - ) - if len({str(x) for x in obs_space.original_space.spaces}) > 1: - raise ValueError( - "Implementation limitation: observations of grouped agents " - "must be homogeneous, got {}".format(obs_space.original_space.spaces) - ) - if len({str(x) for x in action_space.spaces}) > 1: - raise ValueError( - "Implementation limitation: action space of grouped agents " - "must be homogeneous, got {}".format(action_space.spaces) - ) - - -def _mac(model, obs, h): - """Forward pass of the multi-agent controller. - - Args: - model: TorchModelV2 class - obs: Tensor of shape [B, n_agents, obs_size] - h: List of tensors of shape [B, n_agents, h_size] - - Returns: - q_vals: Tensor of shape [B, n_agents, n_actions] - h: Tensor of shape [B, n_agents, h_size] - """ - B, n_agents = obs.size(0), obs.size(1) - if not isinstance(obs, dict): - obs = {"obs": obs} - obs_agents_as_batches = {k: _drop_agent_dim(v) for k, v in obs.items()} - h_flat = [s.reshape([B * n_agents, -1]) for s in h] - q_flat, h_flat = model(obs_agents_as_batches, h_flat, None) - return q_flat.reshape([B, n_agents, -1]), [ - s.reshape([B, n_agents, -1]) for s in h_flat - ] - - -def _unroll_mac(model, obs_tensor): - """Computes the estimated Q values for an entire trajectory batch""" - B = obs_tensor.size(0) - T = obs_tensor.size(1) - n_agents = obs_tensor.size(2) - - mac_out = [] - h = [s.expand([B, n_agents, -1]) for s in model.get_initial_state()] - for t in range(T): - q, h = _mac(model, obs_tensor[:, t], h) - mac_out.append(q) - mac_out = torch.stack(mac_out, dim=1) # Concat over time - - return mac_out - - -def _drop_agent_dim(T): - shape = list(T.shape) - B, n_agents = shape[0], shape[1] - return T.reshape([B * n_agents] + shape[2:]) - - -def _add_agent_dim(T, n_agents): - shape = list(T.shape) - B = shape[0] // n_agents - assert shape[0] % n_agents == 0 - return T.reshape([B, n_agents] + shape[1:]) diff --git a/rllib_contrib/qmix/tests/test_qmix.py b/rllib_contrib/qmix/tests/test_qmix.py deleted file mode 100644 index c6479daa89b24..0000000000000 --- a/rllib_contrib/qmix/tests/test_qmix.py +++ /dev/null @@ -1,125 +0,0 @@ -import unittest - -import numpy as np -from gymnasium.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple -from rllib_qmix.qmix import QMixConfig - -import ray -from ray.rllib.env.multi_agent_env import MultiAgentEnv -from ray.tune import register_env - - -class AvailActionsTestEnv(MultiAgentEnv): - num_actions = 10 - action_space = Discrete(num_actions) - observation_space = Dict( - { - "obs": Dict( - { - "test": Dict({"a": Discrete(2), "b": MultiDiscrete([2, 3, 4])}), - "state": MultiDiscrete([2, 2, 2]), - } - ), - "action_mask": Box(0, 1, (num_actions,)), - } - ) - - def __init__(self, env_config): - super().__init__() - self.state = None - self.avail = env_config.get("avail_actions", [3]) - self.action_mask = np.array([0] * 10) - for a in self.avail: - self.action_mask[a] = 1 - - def reset(self, *, seed=None, options=None): - self.state = 0 - return { - "agent_1": { - "obs": self.observation_space["obs"].sample(), - "action_mask": self.action_mask, - }, - "agent_2": { - "obs": self.observation_space["obs"].sample(), - "action_mask": self.action_mask, - }, - }, {} - - def step(self, action_dict): - if self.state > 0: - assert ( - action_dict["agent_1"] in self.avail - and action_dict["agent_2"] in self.avail - ), "Failed to obey 
available actions mask!" - self.state += 1 - rewards = {"agent_1": 1, "agent_2": 0.5} - obs = { - "agent_1": { - "obs": self.observation_space["obs"].sample(), - "action_mask": self.action_mask, - }, - "agent_2": { - "obs": self.observation_space["obs"].sample(), - "action_mask": self.action_mask, - }, - } - terminateds = {"__all__": False} - truncateds = {"__all__": self.state >= 20} - return obs, rewards, terminateds, truncateds, {} - - -class TestQMix(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_avail_actions_qmix(self): - grouping = { - "group_1": ["agent_1", "agent_2"], - } - obs_space = Tuple( - [ - AvailActionsTestEnv.observation_space, - AvailActionsTestEnv.observation_space, - ] - ) - act_space = Tuple( - [AvailActionsTestEnv.action_space, AvailActionsTestEnv.action_space] - ) - register_env( - "action_mask_test", - lambda config: AvailActionsTestEnv(config).with_agent_groups( - grouping, obs_space=obs_space, act_space=act_space - ), - ) - - config = ( - QMixConfig() - .framework(framework="torch") - .environment( - env="action_mask_test", - env_config={"avail_actions": [3, 4, 8]}, - ) - .rollouts(num_envs_per_worker=5) - ) # Test with vectorization on. - - algo = config.build() - - for _ in range(4): - algo.train() # OK if it doesn't trip the action assertion error - - assert algo.train()["episode_reward_mean"] == 30.0 - algo.stop() - ray.shutdown() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/qmix/tuned_examples/__init__.py b/rllib_contrib/qmix/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/qmix/tuned_examples/two-step-game-qmix-no-mixer.yaml b/rllib_contrib/qmix/tuned_examples/two-step-game-qmix-no-mixer.yaml deleted file mode 100644 index bb16308a2d2a5..0000000000000 --- a/rllib_contrib/qmix/tuned_examples/two-step-game-qmix-no-mixer.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -two-step-game-qmix-without-mixer: - env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents - run: QMIX - stop: - sampler_results/episode_reward_mean: 6.5 - timesteps_total: 70000 - config: - # QMIX only supports torch for now. - framework: torch - - env_config: - env_config: - separate_state_space: true - one_hot_state_encoding: true - - exploration_config: - final_epsilon: 0.0 - - rollout_fragment_length: 4 - train_batch_size: 32 - num_workers: 4 - mixer: null diff --git a/rllib_contrib/qmix/tuned_examples/two-step-game-qmix-vdn-mixer.yaml b/rllib_contrib/qmix/tuned_examples/two-step-game-qmix-vdn-mixer.yaml deleted file mode 100644 index 63247f4cd9b86..0000000000000 --- a/rllib_contrib/qmix/tuned_examples/two-step-game-qmix-vdn-mixer.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -two-step-game-qmix-with-vdn-mixer: - env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents - run: QMIX - stop: - sampler_results/episode_reward_mean: 6.5 - timesteps_total: 70000 - config: - # QMIX only supports torch for now. 
- framework: torch - - env_config: - env_config: - separate_state_space: true - one_hot_state_encoding: true - - exploration_config: - final_epsilon: 0.0 - - rollout_fragment_length: 8 - train_batch_size: 32 - num_workers: 4 - mixer: vdn diff --git a/rllib_contrib/qmix/tuned_examples/two-step-game-qmix.yaml b/rllib_contrib/qmix/tuned_examples/two-step-game-qmix.yaml deleted file mode 100644 index 1d003db3b64c1..0000000000000 --- a/rllib_contrib/qmix/tuned_examples/two-step-game-qmix.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -two-step-game-qmix-with-qmix-mixer: - env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents - run: QMIX - stop: - sampler_results/episode_reward_mean: 7.5 - timesteps_total: 70000 - config: - # QMIX only supports torch for now. - framework: torch - - env_config: - env_config: - separate_state_space: true - one_hot_state_encoding: true - - exploration_config: - final_epsilon: 0.0 - - rollout_fragment_length: 4 - train_batch_size: 32 - num_workers: 4 - mixer: qmix diff --git a/rllib_contrib/r2d2/BUILD b/rllib_contrib/r2d2/BUILD deleted file mode 100644 index 7c49e1433c370..0000000000000 --- a/rllib_contrib/r2d2/BUILD +++ /dev/null @@ -1,41 +0,0 @@ -# Examples - -py_test( - name = "example_r2d2_stateless_cartpole", - main = "r2d2_stateless_cartpole.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/r2d2_stateless_cartpole.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_stateless_cartpole_r2d2", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib"], - size = "enormous", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/stateless-cartpole-r2d2.yaml"], - args = ["--dir=r2d2/tuned_examples/"] -) - -py_test( - name = "learning_tests_stateless_cartpole_r2d2_fake_gpus", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "no_tf_eager_tracing"], - size = "enormous", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/stateless-cartpole-r2d2-fake-gpus.yaml"], - args = ["--dir=r2d2/tuned_examples/"] -) - -# Compilation Tests - -py_test( - name = "test_r2d2", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_r2d2.py"] -) diff --git a/rllib_contrib/r2d2/README.md b/rllib_contrib/r2d2/README.md deleted file mode 100644 index d2bdc44d93d38..0000000000000 --- a/rllib_contrib/r2d2/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# R2D2 (Recurrent Experience Replay In Distributed Reinforcement Learning) - -[R2D2](https://openreview.net/pdf?id=r1lyTjAqYX) is a version of DQN that is adapted to enable -RNN based policies. 
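
A rough usage sketch, condensed from the `examples/r2d2_stateless_cartpole.py` script that is removed further down in this diff (it assumes the `rllib_r2d2` package from this directory and its pinned `ray[rllib]==2.5.0` dependency are installed):

```python
# Minimal R2D2 training sketch, adapted from the removed
# examples/r2d2_stateless_cartpole.py. Assumes `rllib_r2d2` (this package)
# and ray[rllib]==2.5.0 are installed.
from rllib_r2d2.r2d2.r2d2 import R2D2, R2D2Config

import ray
from ray import air, tune
from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
from ray.tune.registry import register_env

ray.init()
register_env("stateless_cartpole", lambda env_cfg: StatelessCartPole(env_cfg))

config = (
    R2D2Config()
    .environment(env="stateless_cartpole")
    .rollouts(num_rollout_workers=0)
    .training(
        # R2D2 requires a recurrent model: wrap a tiny MLP with an LSTM.
        model={
            "use_lstm": True,
            "lstm_cell_size": 64,
            "max_seq_len": 20,
            "fcnet_hiddens": [64],
            "fcnet_activation": "linear",
        },
        dueling=False,
        lr=5e-4,
        zero_init_states=True,
    )
)

tune.Tuner(
    R2D2,
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop={"timesteps_total": 1_000_000}),
).fit()
```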
- -## Installation - -``` -conda create -n rllib-r2d2 python=3.10 -conda activate rllib-r2d2 -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[R2D2 Example]() \ No newline at end of file diff --git a/rllib_contrib/r2d2/examples/r2d2_stateless_cartpole.py b/rllib_contrib/r2d2/examples/r2d2_stateless_cartpole.py deleted file mode 100644 index e2b7b8fed26d8..0000000000000 --- a/rllib_contrib/r2d2/examples/r2d2_stateless_cartpole.py +++ /dev/null @@ -1,73 +0,0 @@ -import argparse - -from rllib_r2d2.r2d2.r2d2 import R2D2, R2D2Config - -import ray -from ray import air, tune -from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import register_env - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - ray.init() - - register_env("stateless_cartpole", lambda env_cfg: StatelessCartPole(env_cfg)) - - config = ( - R2D2Config() - .environment(env="stateless_cartpole") - .rollouts(num_rollout_workers=0) - .training( - model={ - # Wrap with an LSTM and use a very simple base-model. - "use_lstm": True, - "max_seq_len": 20, - "fcnet_hiddens": [64], - "lstm_cell_size": 64, - "fcnet_activation": "linear", - }, - dueling=False, - lr=5e-4, - zero_init_states=True, - replay_buffer_config={ - "type": "MultiAgentReplayBuffer", - "storage_unit": "sequences", - "replay_burn_in": 20, - "zero_init_states": True, - }, - num_steps_sampled_before_learning_starts=0, - ) - .exploration(exploration_config={"epsilon_timesteps": 50000}) - ) - - stop_reward = 150 - - stop = { - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 1000000, - } - - tuner = tune.Tuner( - R2D2, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop=stop, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - if args.run_as_test: - check_learning_achieved(results, stop_reward) - - ray.shutdown() diff --git a/rllib_contrib/r2d2/pyproject.toml b/rllib_contrib/r2d2/pyproject.toml deleted file mode 100644 index ccd4119da7d8d..0000000000000 --- a/rllib_contrib/r2d2/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-r2d2" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium==0.26.3", "ray[rllib]==2.5.0"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/r2d2/requirements.txt b/rllib_contrib/r2d2/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/r2d2/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/__init__.py b/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/__init__.py deleted file mode 100644 index 993a9d58bf212..0000000000000 --- a/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/__init__.py 
+++ /dev/null @@ -1,12 +0,0 @@ -# Copyright 2023-onwards Anyscale, Inc. The use of this library is subject to the -# included LICENSE file. -from rllib_r2d2.r2d2.r2d2 import R2D2, R2D2Config - -from ray.tune.registry import register_trainable - -__all__ = [ - "R2D2", - "R2D2Config", -] - -register_trainable("rllib-contrib-r2d2", R2D2) diff --git a/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/r2d2.py b/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/r2d2.py deleted file mode 100644 index 946b12b9b6822..0000000000000 --- a/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/r2d2.py +++ /dev/null @@ -1,223 +0,0 @@ -import logging -from typing import Optional, Type - -from rllib_r2d2.r2d2.r2d2_tf_policy import R2D2TFPolicy -from rllib_r2d2.r2d2.r2d2_torch_policy import R2D2TorchPolicy - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.dqn import DQN, DQNConfig -from ray.rllib.policy.policy import Policy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE - -logger = logging.getLogger(__name__) - - -class R2D2Config(DQNConfig): - r"""Defines a configuration class from which a R2D2 Algorithm can be built. - - Example: - >>> from rllib_r2d2.r2d2.r2d2 import R2D2Config - >>> config = R2D2Config() - >>> print(config.h_function_epsilon) # doctest: +SKIP - >>> replay_config = config.replay_buffer_config.update( - >>> { - >>> "capacity": 1000000, - >>> "replay_burn_in": 20, - >>> } - >>> ) - >>> config.training(replay_buffer_config=replay_config)\ # doctest: +SKIP - >>> .resources(num_gpus=1)\ - >>> .rollouts(num_rollout_workers=30)\ - >>> .environment("CartPole-v1") - >>> algo = R2D2(config=config) # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from rllib_r2d2.r2d2.r2d2 import R2D2Config - >>> from ray import air - >>> from ray import tune - >>> config = R2D2Config() - >>> config.training(train_batch_size=tune.grid_search([256, 64]) - >>> config.environment(env="CartPole-v1") - >>> tune.Tuner( # doctest: +SKIP - ... "R2D2", - ... run_config=air.RunConfig(stop={"episode_reward_mean":200}), - ... param_space=config.to_dict() - ... 
).fit() - - Example: - >>> from rllib_r2d2.r2d2.r2d2 import R2D2Config - >>> config = R2D2Config() - >>> print(config.exploration_config) # doctest: +SKIP - >>> explore_config = config.exploration_config.update( - >>> { - >>> "initial_epsilon": 1.0, - >>> "final_epsilon": 0.1, - >>> "epsilone_timesteps": 200000, - >>> } - >>> ) - >>> config.training(lr_schedule=[[1, 1e-3, [500, 5e-3]])\ - >>> .exploration(exploration_config=explore_config) - - Example: - >>> from rllib_r2d2.r2d2.r2d2 import R2D2Config - >>> config = R2D2Config() - >>> print(config.exploration_config) # doctest: +SKIP - >>> explore_config = config.exploration_config.update( - >>> { - >>> "type": "SoftQ", - >>> "temperature": [1.0], - >>> } - >>> ) - >>> config.training(lr_schedule=[[1, 1e-3, [500, 5e-3]])\ - >>> .exploration(exploration_config=explore_config) - """ - - def __init__(self, algo_class=None): - """Initializes a ApexConfig instance.""" - super().__init__(algo_class=algo_class or R2D2) - - # fmt: off - # __sphinx_doc_begin__ - # R2D2-specific settings: - self.zero_init_states = True - self.use_h_function = True - self.h_function_epsilon = 1e-3 - - # R2D2 settings overriding DQN ones: - # .training() - self.adam_epsilon = 1e-3 - self.lr = 1e-4 - self.gamma = 0.997 - self.train_batch_size = 1000 - self.target_network_update_freq = 1000 - self.training_intensity = 150 - # R2D2 is using a buffer that stores sequences. - self.replay_buffer_config = { - "type": "MultiAgentReplayBuffer", - # Specify prioritized replay by supplying a buffer type that supports - # prioritization, for example: MultiAgentPrioritizedReplayBuffer. - "prioritized_replay": DEPRECATED_VALUE, - # Size of the replay buffer (in sequences, not timesteps). - "capacity": 100000, - # This algorithm learns on sequences. We therefore require the replay buffer - # to slice sampled batches into sequences before replay. How sequences - # are sliced depends on the parameters `replay_sequence_length`, - # `replay_burn_in`, and `replay_zero_init_states`. - "storage_unit": "sequences", - # Set automatically: The number - # of contiguous environment steps to - # replay at once. Will be calculated via - # model->max_seq_len + burn_in. - # Do not set this to any valid value! - "replay_sequence_length": -1, - # If > 0, use the `replay_burn_in` first steps of each replay-sampled - # sequence (starting either from all 0.0-values if `zero_init_state=True` or - # from the already stored values) to calculate an even more accurate - # initial states for the actual sequence (starting after this burn-in - # window). In the burn-in case, the actual length of the sequence - # used for loss calculation is `n - replay_burn_in` time steps - # (n=LSTM’s/attention net’s max_seq_len). - "replay_burn_in": 0, - } - - # .rollouts() - self.num_rollout_workers = 2 - self.batch_mode = "complete_episodes" - - # fmt: on - # __sphinx_doc_end__ - - self.burn_in = DEPRECATED_VALUE - - def training( - self, - *, - zero_init_states: Optional[bool] = NotProvided, - use_h_function: Optional[bool] = NotProvided, - h_function_epsilon: Optional[float] = NotProvided, - **kwargs, - ) -> "R2D2Config": - """Sets the training related configuration. - - Args: - zero_init_states: If True, assume a zero-initialized state input (no - matter where in the episode the sequence is located). 
- If False, store the initial states along with each SampleBatch, use - it (as initial state when running through the network for training), - and update that initial state during training (from the internal - state outputs of the immediately preceding sequence). - use_h_function: Whether to use the h-function from the paper [1] to scale - target values in the R2D2-loss function: - h(x) = sign(x)(􏰅|x| + 1 − 1) + εx - h_function_epsilon: The epsilon parameter from the R2D2 loss function (only - used if `use_h_function`=True. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if zero_init_states is not NotProvided: - self.zero_init_states = zero_init_states - if use_h_function is not NotProvided: - self.use_h_function = use_h_function - if h_function_epsilon is not NotProvided: - self.h_function_epsilon = h_function_epsilon - - return self - - @override(DQNConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - if ( - not self.in_evaluation - and self.replay_buffer_config.get("replay_sequence_length", -1) != -1 - ): - raise ValueError( - "`replay_sequence_length` is calculated automatically to be " - "model->max_seq_len + burn_in!" - ) - # Add the `burn_in` to the Model's max_seq_len. - # Set the replay sequence length to the max_seq_len of the model. - self.replay_buffer_config["replay_sequence_length"] = ( - self.replay_buffer_config["replay_burn_in"] + self.model["max_seq_len"] - ) - - if self.batch_mode != "complete_episodes": - raise ValueError("`batch_mode` must be 'complete_episodes'!") - - -class R2D2(DQN): - """Recurrent Experience Replay in Distrib. Reinforcement Learning (R2D2). - - Algorithm defining the distributed R2D2 algorithm. - See `r2d2_[tf|torch]_policy.py` for the definition of the policies. 
- - [1] Recurrent Experience Replay in Distributed Reinforcement Learning - - S Kapturowski, G Ostrovski, J Quan, R Munos, W Dabney - 2019, DeepMind - - - Detailed documentation: - https://docs.ray.io/en/master/rllib-algorithms.html#\ - recurrent-replay-distributed-dqn-r2d2 - """ - - @classmethod - @override(DQN) - def get_default_config(cls) -> AlgorithmConfig: - return R2D2Config() - - @classmethod - @override(DQN) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - return R2D2TorchPolicy - else: - return R2D2TFPolicy diff --git a/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/r2d2_tf_policy.py b/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/r2d2_tf_policy.py deleted file mode 100644 index ccefe09c29e58..0000000000000 --- a/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/r2d2_tf_policy.py +++ /dev/null @@ -1,354 +0,0 @@ -"""TensorFlow policy class used for R2D2.""" - -from typing import Dict, List, Optional, Tuple - -import gymnasium as gym - -import ray -from ray.rllib.algorithms.dqn.dqn_tf_policy import ( - PRIO_WEIGHTS, - build_q_model, - clip_gradients, - compute_q_values, - postprocess_nstep_and_prio, -) -from ray.rllib.models.action_dist import ActionDistribution -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_action_dist import Categorical -from ray.rllib.models.torch.torch_action_dist import TorchCategorical -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import LearningRateSchedule, TargetNetworkMixin -from ray.rllib.policy.tf_policy_template import build_tf_policy -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import huber_loss -from ray.rllib.utils.typing import AlgorithmConfigDict, ModelInputDict, TensorType - -tf1, tf, tfv = try_import_tf() - - -def build_r2d2_model( - policy: Policy, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> Tuple[ModelV2, ActionDistribution]: - """Build q_model and target_model for DQN - - Args: - policy: The policy, which will use the model for optimization. - obs_space (gym.spaces.Space): The policy's observation space. - action_space (gym.spaces.Space): The policy's action space. - config (AlgorithmConfigDict): - - Returns: - q_model - Note: The target q model will not be returned, just assigned to - `policy.target_model`. - """ - - # Create the policy's models. - model = build_q_model(policy, obs_space, action_space, config) - - # Assert correct model type by checking the init state to be present. - # For attention nets: These don't necessarily publish their init state via - # Model.get_initial_state, but may only use the trajectory view API - # (view_requirements). - assert ( - model.get_initial_state() != [] - or model.view_requirements.get("state_in_0") is not None - ), ( - "R2D2 requires its model to be a recurrent one! Try using " - "`model.use_lstm` or `model.use_attention` in your config " - "to auto-wrap your model with an LSTM- or attention net." - ) - - return model - - -def r2d2_loss(policy: Policy, model, _, train_batch: SampleBatch) -> TensorType: - """Constructs the loss for R2D2TFPolicy. - - Args: - policy: The Policy to calculate the loss for. - model (ModelV2): The Model to calculate the loss for. - train_batch: The training data. - - Returns: - TensorType: A single loss tensor. - """ - config = policy.config - - # Construct internal state inputs. 
- i = 0 - state_batches = [] - while "state_in_{}".format(i) in train_batch: - state_batches.append(train_batch["state_in_{}".format(i)]) - i += 1 - assert state_batches - - # Q-network evaluation (at t). - q, _, _, _ = compute_q_values( - policy, - model, - train_batch, - state_batches=state_batches, - seq_lens=train_batch.get(SampleBatch.SEQ_LENS), - explore=False, - is_training=True, - ) - - # Target Q-network evaluation (at t+1). - q_target, _, _, _ = compute_q_values( - policy, - policy.target_model, - train_batch, - state_batches=state_batches, - seq_lens=train_batch.get(SampleBatch.SEQ_LENS), - explore=False, - is_training=True, - ) - - if not hasattr(policy, "target_q_func_vars"): - policy.target_q_func_vars = policy.target_model.variables() - - actions = tf.cast(train_batch[SampleBatch.ACTIONS], tf.int64) - dones = tf.cast(train_batch[SampleBatch.TERMINATEDS], tf.float32) - rewards = train_batch[SampleBatch.REWARDS] - weights = tf.cast(train_batch[PRIO_WEIGHTS], tf.float32) - - B = tf.shape(state_batches[0])[0] - T = tf.shape(q)[0] // B - - # Q scores for actions which we know were selected in the given state. - one_hot_selection = tf.one_hot(actions, policy.action_space.n) - q_selected = tf.reduce_sum( - tf.where(q > tf.float32.min, q, tf.zeros_like(q)) * one_hot_selection, axis=1 - ) - - if config["double_q"]: - best_actions = tf.argmax(q, axis=1) - else: - best_actions = tf.argmax(q_target, axis=1) - - best_actions_one_hot = tf.one_hot(best_actions, policy.action_space.n) - q_target_best = tf.reduce_sum( - tf.where(q_target > tf.float32.min, q_target, tf.zeros_like(q_target)) - * best_actions_one_hot, - axis=1, - ) - - if config["num_atoms"] > 1: - raise ValueError("Distributional R2D2 not supported yet!") - else: - q_target_best_masked_tp1 = (1.0 - dones) * tf.concat( - [q_target_best[1:], tf.constant([0.0])], axis=0 - ) - - if config["use_h_function"]: - h_inv = h_inverse(q_target_best_masked_tp1, config["h_function_epsilon"]) - target = h_function( - rewards + config["gamma"] ** config["n_step"] * h_inv, - config["h_function_epsilon"], - ) - else: - target = ( - rewards + config["gamma"] ** config["n_step"] * q_target_best_masked_tp1 - ) - - # Seq-mask all loss-related terms. - seq_mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], T)[:, :-1] - # Mask away also the burn-in sequence at the beginning. - burn_in = policy.config["replay_buffer_config"]["replay_burn_in"] - # Making sure, this works for both static graph and eager. - if burn_in > 0: - seq_mask = tf.cond( - pred=tf.convert_to_tensor(burn_in, tf.int32) < T, - true_fn=lambda: tf.concat( - [tf.fill([B, burn_in], False), seq_mask[:, burn_in:]], 1 - ), - false_fn=lambda: seq_mask, - ) - - def reduce_mean_valid(t): - return tf.reduce_mean(tf.boolean_mask(t, seq_mask)) - - # Make sure to use the correct time indices: - # Q(t) - [gamma * r + Q^(t+1)] - q_selected = tf.reshape(q_selected, [B, T])[:, :-1] - td_error = q_selected - tf.stop_gradient(tf.reshape(target, [B, T])[:, :-1]) - td_error = td_error * tf.cast(seq_mask, tf.float32) - weights = tf.reshape(weights, [B, T])[:, :-1] - policy._total_loss = reduce_mean_valid(weights * huber_loss(td_error)) - # Store the TD-error per time chunk (b/c we need only one mean - # prioritized replay weight per stored sequence). 
- policy._td_error = tf.reduce_mean(td_error, axis=-1) - policy._loss_stats = { - "mean_q": reduce_mean_valid(q_selected), - "min_q": tf.reduce_min(q_selected), - "max_q": tf.reduce_max(q_selected), - "mean_td_error": reduce_mean_valid(td_error), - } - - return policy._total_loss - - -def h_function(x, epsilon=1.0): - """h-function to normalize target Qs, described in the paper [1]. - - h(x) = sign(x) * [sqrt(abs(x) + 1) - 1] + epsilon * x - - Used in [1] in combination with h_inverse: - targets = h(r + gamma * h_inverse(Q^)) - """ - return tf.sign(x) * (tf.sqrt(tf.abs(x) + 1.0) - 1.0) + epsilon * x - - -def h_inverse(x, epsilon=1.0): - """Inverse if the above h-function, described in the paper [1]. - - If x > 0.0: - h-1(x) = [2eps * x + (2eps + 1) - sqrt(4eps x + (2eps + 1)^2)] / - (2 * eps^2) - - If x < 0.0: - h-1(x) = [2eps * x + (2eps + 1) + sqrt(-4eps x + (2eps + 1)^2)] / - (2 * eps^2) - """ - two_epsilon = epsilon * 2 - if_x_pos = ( - two_epsilon * x - + (two_epsilon + 1.0) - - tf.sqrt(4.0 * epsilon * x + (two_epsilon + 1.0) ** 2) - ) / (2.0 * epsilon**2) - if_x_neg = ( - two_epsilon * x - - (two_epsilon + 1.0) - + tf.sqrt(-4.0 * epsilon * x + (two_epsilon + 1.0) ** 2) - ) / (2.0 * epsilon**2) - return tf.where(x < 0.0, if_x_neg, if_x_pos) - - -class ComputeTDErrorMixin: - """Assign the `compute_td_error` method to the R2D2TFPolicy - - This allows us to prioritize on the worker side. - """ - - def __init__(self): - def compute_td_error( - obs_t, act_t, rew_t, obs_tp1, terminateds_mask, importance_weights - ): - input_dict = self._lazy_tensor_dict({SampleBatch.CUR_OBS: obs_t}) - input_dict[SampleBatch.ACTIONS] = act_t - input_dict[SampleBatch.REWARDS] = rew_t - input_dict[SampleBatch.NEXT_OBS] = obs_tp1 - input_dict[SampleBatch.TERMINATEDS] = terminateds_mask - input_dict[PRIO_WEIGHTS] = importance_weights - - # Do forward pass on loss to update td error attribute - r2d2_loss(self, self.model, None, input_dict) - - return self._td_error - - self.compute_td_error = compute_td_error - - -def get_distribution_inputs_and_class( - policy: Policy, - model: ModelV2, - *, - input_dict: ModelInputDict, - state_batches: Optional[List[TensorType]] = None, - seq_lens: Optional[TensorType] = None, - explore: bool = True, - is_training: bool = False, - **kwargs -) -> Tuple[TensorType, type, List[TensorType]]: - - if policy.config["framework"] == "torch": - from ray.rllib.algorithms.r2d2.r2d2_torch_policy import ( - compute_q_values as torch_compute_q_values, - ) - - func = torch_compute_q_values - else: - func = compute_q_values - - q_vals, logits, probs_or_logits, state_out = func( - policy, model, input_dict, state_batches, seq_lens, explore, is_training - ) - - policy.q_values = q_vals - if not hasattr(policy, "q_func_vars"): - policy.q_func_vars = model.variables() - - action_dist_class = ( - TorchCategorical if policy.config["framework"] == "torch" else Categorical - ) - - return policy.q_values, action_dist_class, state_out - - -def adam_optimizer( - policy: Policy, config: AlgorithmConfigDict -) -> "tf.keras.optimizers.Optimizer": - return tf1.train.AdamOptimizer( - learning_rate=policy.cur_lr, epsilon=config["adam_epsilon"] - ) - - -def build_q_stats(policy: Policy, batch) -> Dict[str, TensorType]: - return dict( - { - "cur_lr": policy.cur_lr, - }, - **policy._loss_stats - ) - - -def setup_early_mixins( - policy: Policy, obs_space, action_space, config: AlgorithmConfigDict -) -> None: - LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) - - -def before_loss_init( - 
policy: Policy, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> None: - ComputeTDErrorMixin.__init__(policy) - - -def setup_late_mixins( - policy: Policy, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> None: - TargetNetworkMixin.__init__(policy) - - -R2D2TFPolicy = build_tf_policy( - name="R2D2TFPolicy", - loss_fn=r2d2_loss, - get_default_config=lambda: ray.rllib.algorithms.r2d2.r2d2.R2D2Config(), - postprocess_fn=postprocess_nstep_and_prio, - stats_fn=build_q_stats, - make_model=build_r2d2_model, - action_distribution_fn=get_distribution_inputs_and_class, - optimizer_fn=adam_optimizer, - extra_action_out_fn=lambda policy: {"q_values": policy.q_values}, - compute_gradients_fn=clip_gradients, - extra_learn_fetches_fn=lambda policy: {"td_error": policy._td_error}, - before_init=setup_early_mixins, - before_loss_init=before_loss_init, - after_init=setup_late_mixins, - mixins=[ - TargetNetworkMixin, - ComputeTDErrorMixin, - LearningRateSchedule, - ], -) diff --git a/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/r2d2_torch_policy.py b/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/r2d2_torch_policy.py deleted file mode 100644 index 54abe6bfbfbaf..0000000000000 --- a/rllib_contrib/r2d2/src/rllib_r2d2/r2d2/r2d2_torch_policy.py +++ /dev/null @@ -1,331 +0,0 @@ -"""PyTorch policy class used for R2D2.""" - -from typing import Dict, Tuple - -import gymnasium as gym - -import ray -from ray.rllib.algorithms.dqn.dqn_tf_policy import ( - PRIO_WEIGHTS, - postprocess_nstep_and_prio, -) -from ray.rllib.algorithms.dqn.dqn_torch_policy import ( - adam_optimizer, - build_q_model_and_distribution, - compute_q_values, -) -from ray.rllib.algorithms.r2d2.r2d2_tf_policy import get_distribution_inputs_and_class -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.policy_template import build_policy_class -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import LearningRateSchedule, TargetNetworkMixin -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import ( - FLOAT_MIN, - apply_grad_clipping, - concat_multi_gpu_td_errors, - huber_loss, - sequence_mask, -) -from ray.rllib.utils.typing import AlgorithmConfigDict, TensorType - -torch, nn = try_import_torch() -F = None -if nn: - F = nn.functional - - -def build_r2d2_model_and_distribution( - policy: Policy, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> Tuple[ModelV2, TorchDistributionWrapper]: - """Build q_model and target_model for DQN - - Args: - policy: The policy, which will use the model for optimization. - obs_space (gym.spaces.Space): The policy's observation space. - action_space (gym.spaces.Space): The policy's action space. - config (AlgorithmConfigDict): - - Returns: - (q_model, TorchCategorical) - Note: The target q model will not be returned, just assigned to - `policy.target_model`. - """ - - # Create the policy's models and action dist class. - model, distribution_cls = build_q_model_and_distribution( - policy, obs_space, action_space, config - ) - - # Assert correct model type by checking the init state to be present. - # For attention nets: These don't necessarily publish their init state via - # Model.get_initial_state, but may only use the trajectory view API - # (view_requirements). 
- assert ( - model.get_initial_state() != [] - or model.view_requirements.get("state_in_0") is not None - ), ( - "R2D2 requires its model to be a recurrent one! Try using " - "`model.use_lstm` or `model.use_attention` in your config " - "to auto-wrap your model with an LSTM- or attention net." - ) - - return model, distribution_cls - - -def r2d2_loss(policy: Policy, model, _, train_batch: SampleBatch) -> TensorType: - """Constructs the loss for R2D2TorchPolicy. - - Args: - policy: The Policy to calculate the loss for. - model (ModelV2): The Model to calculate the loss for. - train_batch: The training data. - - Returns: - TensorType: A single loss tensor. - """ - target_model = policy.target_models[model] - config = policy.config - - # Construct internal state inputs. - i = 0 - state_batches = [] - while "state_in_{}".format(i) in train_batch: - state_batches.append(train_batch["state_in_{}".format(i)]) - i += 1 - assert state_batches - - # Q-network evaluation (at t). - q, _, _, _ = compute_q_values( - policy, - model, - train_batch, - state_batches=state_batches, - seq_lens=train_batch.get(SampleBatch.SEQ_LENS), - explore=False, - is_training=True, - ) - - # Target Q-network evaluation (at t+1). - q_target, _, _, _ = compute_q_values( - policy, - target_model, - train_batch, - state_batches=state_batches, - seq_lens=train_batch.get(SampleBatch.SEQ_LENS), - explore=False, - is_training=True, - ) - - actions = train_batch[SampleBatch.ACTIONS].long() - dones = train_batch[SampleBatch.TERMINATEDS].float() - rewards = train_batch[SampleBatch.REWARDS] - weights = train_batch[PRIO_WEIGHTS] - - B = state_batches[0].shape[0] - T = q.shape[0] // B - - # Q scores for actions which we know were selected in the given state. - one_hot_selection = F.one_hot(actions, policy.action_space.n) - q_selected = torch.sum( - torch.where(q > FLOAT_MIN, q, torch.tensor(0.0, device=q.device)) - * one_hot_selection, - 1, - ) - - if config["double_q"]: - best_actions = torch.argmax(q, dim=1) - else: - best_actions = torch.argmax(q_target, dim=1) - - best_actions_one_hot = F.one_hot(best_actions, policy.action_space.n) - q_target_best = torch.sum( - torch.where( - q_target > FLOAT_MIN, q_target, torch.tensor(0.0, device=q_target.device) - ) - * best_actions_one_hot, - dim=1, - ) - - if config["num_atoms"] > 1: - raise ValueError("Distributional R2D2 not supported yet!") - else: - q_target_best_masked_tp1 = (1.0 - dones) * torch.cat( - [q_target_best[1:], torch.tensor([0.0], device=q_target_best.device)] - ) - - if config["use_h_function"]: - h_inv = h_inverse(q_target_best_masked_tp1, config["h_function_epsilon"]) - target = h_function( - rewards + config["gamma"] ** config["n_step"] * h_inv, - config["h_function_epsilon"], - ) - else: - target = ( - rewards + config["gamma"] ** config["n_step"] * q_target_best_masked_tp1 - ) - - # Seq-mask all loss-related terms. - seq_mask = sequence_mask(train_batch[SampleBatch.SEQ_LENS], T)[:, :-1] - # Mask away also the burn-in sequence at the beginning. 
- burn_in = policy.config["replay_buffer_config"]["replay_burn_in"] - if burn_in > 0 and burn_in < T: - seq_mask[:, :burn_in] = False - - num_valid = torch.sum(seq_mask) - - def reduce_mean_valid(t): - return torch.sum(t[seq_mask]) / num_valid - - # Make sure use the correct time indices: - # Q(t) - [gamma * r + Q^(t+1)] - q_selected = q_selected.reshape([B, T])[:, :-1] - td_error = q_selected - target.reshape([B, T])[:, :-1].detach() - td_error = td_error * seq_mask - weights = weights.reshape([B, T])[:, :-1] - total_loss = reduce_mean_valid(weights * huber_loss(td_error)) - - # Store values for stats function in model (tower), such that for - # multi-GPU, we do not override them during the parallel loss phase. - model.tower_stats["total_loss"] = total_loss - model.tower_stats["mean_q"] = reduce_mean_valid(q_selected) - model.tower_stats["min_q"] = torch.min(q_selected) - model.tower_stats["max_q"] = torch.max(q_selected) - model.tower_stats["mean_td_error"] = reduce_mean_valid(td_error) - # Store per time chunk (b/c we need only one mean - # prioritized replay weight per stored sequence). - model.tower_stats["td_error"] = torch.mean(td_error, dim=-1) - - return total_loss - - -def h_function(x, epsilon=1.0): - """h-function to normalize target Qs, described in the paper [1]. - - h(x) = sign(x) * [sqrt(abs(x) + 1) - 1] + epsilon * x - - Used in [1] in combination with h_inverse: - targets = h(r + gamma * h_inverse(Q^)) - """ - return torch.sign(x) * (torch.sqrt(torch.abs(x) + 1.0) - 1.0) + epsilon * x - - -def h_inverse(x, epsilon=1.0): - """Inverse if the above h-function, described in the paper [1]. - - If x > 0.0: - h-1(x) = [2eps * x + (2eps + 1) - sqrt(4eps x + (2eps + 1)^2)] / - (2 * eps^2) - - If x < 0.0: - h-1(x) = [2eps * x + (2eps + 1) + sqrt(-4eps x + (2eps + 1)^2)] / - (2 * eps^2) - """ - two_epsilon = epsilon * 2 - if_x_pos = ( - two_epsilon * x - + (two_epsilon + 1.0) - - torch.sqrt(4.0 * epsilon * x + (two_epsilon + 1.0) ** 2) - ) / (2.0 * epsilon**2) - if_x_neg = ( - two_epsilon * x - - (two_epsilon + 1.0) - + torch.sqrt(-4.0 * epsilon * x + (two_epsilon + 1.0) ** 2) - ) / (2.0 * epsilon**2) - return torch.where(x < 0.0, if_x_neg, if_x_pos) - - -class ComputeTDErrorMixin: - """Assign the `compute_td_error` method to the R2D2TorchPolicy - - This allows us to prioritize on the worker side. 
- """ - - def __init__(self): - def compute_td_error( - obs_t, act_t, rew_t, obs_tp1, terminateds_mask, importance_weights - ): - input_dict = self._lazy_tensor_dict({SampleBatch.CUR_OBS: obs_t}) - input_dict[SampleBatch.ACTIONS] = act_t - input_dict[SampleBatch.REWARDS] = rew_t - input_dict[SampleBatch.NEXT_OBS] = obs_tp1 - input_dict[SampleBatch.TERMINATEDS] = terminateds_mask - input_dict[PRIO_WEIGHTS] = importance_weights - - # Do forward pass on loss to update td error attribute - r2d2_loss(self, self.model, None, input_dict) - - return self.model.tower_stats["td_error"] - - self.compute_td_error = compute_td_error - - -def build_q_stats(policy: Policy, batch: SampleBatch) -> Dict[str, TensorType]: - - return { - "cur_lr": policy.cur_lr, - "total_loss": torch.mean(torch.stack(policy.get_tower_stats("total_loss"))), - "mean_q": torch.mean(torch.stack(policy.get_tower_stats("mean_q"))), - "min_q": torch.mean(torch.stack(policy.get_tower_stats("min_q"))), - "max_q": torch.mean(torch.stack(policy.get_tower_stats("max_q"))), - "mean_td_error": torch.mean( - torch.stack(policy.get_tower_stats("mean_td_error")) - ), - } - - -def setup_early_mixins( - policy: Policy, obs_space, action_space, config: AlgorithmConfigDict -) -> None: - LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) - - -def before_loss_init( - policy: Policy, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> None: - ComputeTDErrorMixin.__init__(policy) - TargetNetworkMixin.__init__(policy) - - -def grad_process_and_td_error_fn( - policy: Policy, optimizer: "torch.optim.Optimizer", loss: TensorType -) -> Dict[str, TensorType]: - # Clip grads if configured. - return apply_grad_clipping(policy, optimizer, loss) - - -def extra_action_out_fn( - policy: Policy, input_dict, state_batches, model, action_dist -) -> Dict[str, TensorType]: - return {"q_values": policy.q_values} - - -R2D2TorchPolicy = build_policy_class( - name="R2D2TorchPolicy", - framework="torch", - loss_fn=r2d2_loss, - get_default_config=lambda: ray.rllib.algorithms.r2d2.r2d2.R2D2Config(), - make_model_and_action_dist=build_r2d2_model_and_distribution, - action_distribution_fn=get_distribution_inputs_and_class, - stats_fn=build_q_stats, - postprocess_fn=postprocess_nstep_and_prio, - optimizer_fn=adam_optimizer, - extra_grad_process_fn=grad_process_and_td_error_fn, - extra_learn_fetches_fn=concat_multi_gpu_td_errors, - extra_action_out_fn=extra_action_out_fn, - before_init=setup_early_mixins, - before_loss_init=before_loss_init, - mixins=[ - TargetNetworkMixin, - ComputeTDErrorMixin, - LearningRateSchedule, - ], -) diff --git a/rllib_contrib/r2d2/tests/test_r2d2.py b/rllib_contrib/r2d2/tests/test_r2d2.py deleted file mode 100644 index 1a3b2ef51ee10..0000000000000 --- a/rllib_contrib/r2d2/tests/test_r2d2.py +++ /dev/null @@ -1,99 +0,0 @@ -import unittest - -from rllib_r2d2.r2d2.r2d2 import R2D2Config - -import ray -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_train_results, - framework_iterator, -) - -tf1, tf, tfv = try_import_tf() -torch, nn = try_import_torch() - - -def check_batch_sizes(train_results): - """Check if batch sizes are according to what we expect from config.""" - info = train_results["info"] - learner_info = info[LEARNER_INFO] - - for pid, policy_stats in learner_info.items(): - if pid == "batch_count": - 
continue - # Expect td-errors to be per batch-item. - configured_b = train_results["config"]["train_batch_size"] - actual_b = policy_stats["td_error"].shape[0] - if (configured_b - actual_b) / actual_b > 0.1: - # Since R2D2 learns on sequences of a fixed length but with variable - # amount of timesteps that are padded, the batch size is almost never the - # `train_batch_size`, which is specified in timesteps, but close to it. - assert 0.8 < ( - abs( - configured_b - / ( - train_results["config"]["model"]["max_seq_len"] - + train_results["config"]["replay_buffer_config"][ - "replay_burn_in" - ] - ) - / actual_b - ) - < 1.2 - ) - - -class TestR2D2(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_r2d2_compilation(self): - """Test whether R2D2 can be built on all frameworks.""" - config = ( - R2D2Config() - .environment("CartPole-v1") - .rollouts(num_rollout_workers=0) - .training( - model={ - # Wrap with an LSTM and use a very simple base-model. - "use_lstm": True, - "max_seq_len": 20, - "fcnet_hiddens": [32], - "lstm_cell_size": 64, - }, - dueling=False, - lr=5e-4, - zero_init_states=True, - replay_buffer_config={"replay_burn_in": 20}, - num_steps_sampled_before_learning_starts=0, - ) - .exploration(exploration_config={"epsilon_timesteps": 100000}) - ) - - num_iterations = 1 - - # Test building an R2D2 agent in all frameworks. - for _ in framework_iterator(config, with_eager_tracing=True): - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - check_batch_sizes(results) - print(results) - - check_compute_single_action(algo, include_state=True) - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/r2d2/tuned_examples/__init__.py b/rllib_contrib/r2d2/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/r2d2/tuned_examples/stateless-cartpole-r2d2-fake-gpus.yaml b/rllib_contrib/r2d2/tuned_examples/stateless-cartpole-r2d2-fake-gpus.yaml deleted file mode 100644 index 4beb25992cfe5..0000000000000 --- a/rllib_contrib/r2d2/tuned_examples/stateless-cartpole-r2d2-fake-gpus.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -stateless-cartpole-r2d2: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: R2D2 - stop: - sampler_results/episode_reward_mean: 150 - timesteps_total: 1000000 - config: - # Works for both torch and tf. - framework: torch - num_workers: 0 - # R2D2 settings. - replay_buffer_config: - type: MultiAgentReplayBuffer - storage_unit: sequences - replay_burn_in: 20 - zero_init_states: true - #dueling: false - lr: 0.0005 - # Give some more time to explore. - exploration_config: - epsilon_timesteps: 50000 - # Wrap with an LSTM and use a very simple base-model. - model: - fcnet_hiddens: [64] - fcnet_activation: linear - use_lstm: true - lstm_cell_size: 64 - max_seq_len: 20 - - # Fake 2 GPUs. 
- num_gpus: 2 - _fake_gpus: true diff --git a/rllib_contrib/r2d2/tuned_examples/stateless-cartpole-r2d2.yaml b/rllib_contrib/r2d2/tuned_examples/stateless-cartpole-r2d2.yaml deleted file mode 100644 index 9b01706fc9601..0000000000000 --- a/rllib_contrib/r2d2/tuned_examples/stateless-cartpole-r2d2.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -stateless-cartpole-r2d2: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: R2D2 - stop: - sampler_results/episode_reward_mean: 150 - timesteps_total: 1000000 - config: - # Works for both torch and tf. - framework: torch - num_workers: 0 - # R2D2 settings. - replay_buffer_config: - type: MultiAgentReplayBuffer - storage_unit: sequences - replay_burn_in: 20 - zero_init_states: true - #dueling: false - lr: 0.0005 - # Give some more time to explore. - exploration_config: - epsilon_timesteps: 50000 - # Wrap with an LSTM and use a very simple base-model. - model: - fcnet_hiddens: [64] - fcnet_activation: linear - use_lstm: true - lstm_cell_size: 64 - max_seq_len: 20 diff --git a/rllib_contrib/simple_q/BUILD b/rllib_contrib/simple_q/BUILD deleted file mode 100644 index cb9520c0b782d..0000000000000 --- a/rllib_contrib/simple_q/BUILD +++ /dev/null @@ -1,49 +0,0 @@ -# Examples - -py_test( - name = "example_simple_q_cartpole_v1", - main = "simple_q_cartpole_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/simple_q_cartpole_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -py_test( - name = "learning_tests_cartpole_simpleq", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib"], - size = "medium", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/cartpole-simpleq.yaml"], - args = ["--dir=simple_q/tuned_examples"] -) - -py_test( - name = "learning_tests_cartpole_simpleq_fake_gpus", - main = "run_regression_tests.py", - tags = ["team:rllib", "learning_tests", "rllib_contrib", "no_tf_eager_tracing"], - size = "medium", - srcs = ["run_regression_tests.py"], - data = ["tuned_examples/cartpole-simpleq-fake-gpus.yaml"], - args = ["--dir=simple_q/tuned_examples"] -) - - -# Compilation Tests - -py_test( - name = "test_simple_q", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_simple_q.py"] -) - -py_test( - name = "test_repro_simple_q", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_repro_simple_q.py"] -) diff --git a/rllib_contrib/simple_q/README.md b/rllib_contrib/simple_q/README.md deleted file mode 100644 index dd857aaa0d064..0000000000000 --- a/rllib_contrib/simple_q/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Simple Q (DQN) - -[Simple Q](https://arxiv.org/abs/1602.01783) is an implementation of the DQN algorithm without any -optimizations. 
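
A rough usage sketch, condensed from the `examples/simple_q_cartpole_v1.py` script that is removed further down in this diff (it assumes the `rllib_simple_q` package from this directory and its pinned `ray[rllib]==2.5.1` dependency are installed):

```python
# Minimal SimpleQ training sketch, adapted from the removed
# examples/simple_q_cartpole_v1.py. Assumes `rllib_simple_q` (this package)
# and ray[rllib]==2.5.1 are installed.
from rllib_simple_q.simple_q import SimpleQ, SimpleQConfig

import ray
from ray import air, tune

ray.init()

# SimpleQ keeps the plain DQN defaults (epsilon-greedy exploration,
# uniform replay buffer, target network updates every 500 steps).
config = SimpleQConfig().framework("torch").environment("CartPole-v1")

tune.Tuner(
    SimpleQ,
    param_space=config.to_dict(),
    run_config=air.RunConfig(
        stop={
            "sampler_results/episode_reward_mean": 150,
            "timesteps_total": 50000,
        },
    ),
).fit()
```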
- - -## Installation - -``` -conda create -n rllib-simpleq python=3.10 -conda activate rllib-simpleq -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[SimpleQ Example]() \ No newline at end of file diff --git a/rllib_contrib/simple_q/examples/simple_q_cartpole_v1.py b/rllib_contrib/simple_q/examples/simple_q_cartpole_v1.py deleted file mode 100644 index 1e3b0c59aac47..0000000000000 --- a/rllib_contrib/simple_q/examples/simple_q_cartpole_v1.py +++ /dev/null @@ -1,42 +0,0 @@ -import argparse - -from rllib_simple_q.simple_q import SimpleQ, SimpleQConfig - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = SimpleQConfig().framework("torch").environment("CartPole-v1") - - stop_reward = 150 - - tuner = tune.Tuner( - SimpleQ, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 50000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/simple_q/pyproject.toml b/rllib_contrib/simple_q/pyproject.toml deleted file mode 100644 index 99ec2fd422f45..0000000000000 --- a/rllib_contrib/simple_q/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-simpleq" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium==0.26.3", "ray[rllib]==2.5.1"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/simple_q/requirements.txt b/rllib_contrib/simple_q/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/simple_q/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/__init__.py b/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/__init__.py deleted file mode 100644 index 56cc422832b04..0000000000000 --- a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -from rllib_simple_q.simple_q.simple_q import SimpleQ, SimpleQConfig -from rllib_simple_q.simple_q.simple_q_tf_policy import ( - SimpleQTF1Policy, - SimpleQTF2Policy, -) -from rllib_simple_q.simple_q.simple_q_torch_policy import SimpleQTorchPolicy - -from ray.tune.registry import register_trainable - -__all__ = [ - "SimpleQ", - "SimpleQConfig", - "SimpleQTF1Policy", - "SimpleQTF2Policy", - "SimpleQTorchPolicy", -] - -register_trainable("rllib-contrib-simple-q", SimpleQ) diff --git a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/simple_q.py b/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/simple_q.py deleted file mode 100644 index 
6b1e9b401ceb6..0000000000000 --- a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/simple_q.py +++ /dev/null @@ -1,388 +0,0 @@ -""" -Simple Q-Learning -================= - -This module provides a basic implementation of the DQN algorithm without any -optimizations. - -This file defines the distributed Algorithm class for the Simple Q algorithm. -See `simple_q_[tf|torch]_policy.py` for the definition of the policy loss. -""" - -import logging -from typing import List, Optional, Type, Union - -from rllib_simple_q.simple_q.simple_q_tf_policy import ( - SimpleQTF1Policy, - SimpleQTF2Policy, -) -from rllib_simple_q.simple_q.simple_q_torch_policy import SimpleQTorchPolicy - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample -from ray.rllib.execution.train_ops import multi_gpu_train_one_step, train_one_step -from ray.rllib.policy.policy import Policy -from ray.rllib.utils import deep_update -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE -from ray.rllib.utils.metrics import ( - LAST_TARGET_UPDATE_TS, - NUM_AGENT_STEPS_SAMPLED, - NUM_ENV_STEPS_SAMPLED, - NUM_TARGET_UPDATES, - SAMPLE_TIMER, - SYNCH_WORKER_WEIGHTS_TIMER, - TARGET_NET_UPDATE_TIMER, -) -from ray.rllib.utils.replay_buffers.utils import ( - update_priorities_in_replay_buffer, - validate_buffer_config, -) -from ray.rllib.utils.typing import ResultDict - -logger = logging.getLogger(__name__) - - -class SimpleQConfig(AlgorithmConfig): - """Defines a configuration class from which a SimpleQ Algorithm can be built. - - Example: - >>> from rllib_simple_q.simple_q import SimpleQConfig - >>> config = SimpleQConfig() - >>> print(config.replay_buffer_config) # doctest: +SKIP - >>> replay_config = config.replay_buffer_config.update( - >>> { - >>> "capacity": 40000, - >>> } - >>> ) - >>> config.training(replay_buffer_config=replay_config)\ - ... .resources(num_gpus=1)\ - ... .rollouts(num_rollout_workers=3) - - Example: - >>> from rllib_simple_q.simple_q import SimpleQConfig - >>> from ray import air - >>> from ray import tune - >>> config = SimpleQConfig() - >>> config.training(adam_epsilon=tune.grid_search([1e-8, 5e-8, 1e-7]) - >>> config.environment(env="CartPole-v1") - >>> tune.Tuner( # doctest: +SKIP - ... "SimpleQ", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict() - ... 
).fit() - - Example: - >>> from rllib_simple_q.simple_q import SimpleQConfig - >>> config = SimpleQConfig() - >>> print(config.exploration_config) # doctest: +SKIP - >>> explore_config = config.exploration_config.update( - >>> { - >>> "initial_epsilon": 1.5, - >>> "final_epsilon": 0.01, - >>> "epsilon_timesteps": 5000, - >>> }) - >>> config = SimpleQConfig().rollouts(rollout_fragment_length=32)\ - >>> .exploration(exploration_config=explore_config)\ - - Example: - >>> from rllib_simple_q.simple_q import SimpleQConfig - >>> config = SimpleQConfig() - >>> print(config.exploration_config) # doctest: +SKIP - >>> explore_config = config.exploration_config.update( - >>> { - >>> "type": "softq", - >>> "temperature": [1.0], - >>> }) - >>> config = SimpleQConfig().training(lr_schedule=[[1, 1e-3], [500, 5e-3]])\ - >>> .exploration(exploration_config=explore_config) - """ - - def __init__(self, algo_class=None): - """Initializes a SimpleQConfig instance.""" - super().__init__(algo_class=algo_class or SimpleQ) - - # Simple Q specific - # fmt: off - # __sphinx_doc_begin__ - self.target_network_update_freq = 500 - self.replay_buffer_config = { - "type": "MultiAgentReplayBuffer", - "capacity": 50000, - # The number of contiguous environment steps to replay at once. This - # may be set to greater than 1 to support recurrent models. - "replay_sequence_length": 1, - } - self.num_steps_sampled_before_learning_starts = 1000 - self.store_buffer_in_checkpoints = False - self.lr_schedule = None - self.adam_epsilon = 1e-8 - - self.grad_clip = 40.0 - # Note: Only when using _enable_learner_api=True can the clipping mode be - # configured by the user. On the old API stack, RLlib will always clip by - # global_norm, no matter the value of `grad_clip_by`. - self.grad_clip_by = "global_norm" - - self.tau = 1.0 - # __sphinx_doc_end__ - # fmt: on - - # Overrides of AlgorithmConfig defaults - # `rollouts()` - self.rollout_fragment_length = 4 - - # `training()` - self.lr = 5e-4 - self.train_batch_size = 32 - - # `exploration()` - self.exploration_config = { - "type": "EpsilonGreedy", - "initial_epsilon": 1.0, - "final_epsilon": 0.02, - "epsilon_timesteps": 10000, - } - - # `evaluation()` - self.evaluation(evaluation_config=AlgorithmConfig.overrides(explore=False)) - - # `reporting()` - self.min_time_s_per_iteration = None - self.min_sample_timesteps_per_iteration = 1000 - - # Deprecated. - self.buffer_size = DEPRECATED_VALUE - self.prioritized_replay = DEPRECATED_VALUE - self.learning_starts = DEPRECATED_VALUE - self.replay_batch_size = DEPRECATED_VALUE - # Can not use DEPRECATED_VALUE here because -1 is a common config value - self.replay_sequence_length = None - self.prioritized_replay_alpha = DEPRECATED_VALUE - self.prioritized_replay_beta = DEPRECATED_VALUE - self.prioritized_replay_eps = DEPRECATED_VALUE - - @override(AlgorithmConfig) - def training( - self, - *, - target_network_update_freq: Optional[int] = NotProvided, - replay_buffer_config: Optional[dict] = NotProvided, - store_buffer_in_checkpoints: Optional[bool] = NotProvided, - lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, - adam_epsilon: Optional[float] = NotProvided, - grad_clip: Optional[int] = NotProvided, - num_steps_sampled_before_learning_starts: Optional[int] = NotProvided, - tau: Optional[float] = NotProvided, - **kwargs, - ) -> "SimpleQConfig": - """Sets the training related configuration. - - Args: - target_network_update_freq: Update the target network every - `target_network_update_freq` sample steps. 
- replay_buffer_config: Replay buffer config. - Examples: - { - "_enable_replay_buffer_api": True, - "type": "MultiAgentReplayBuffer", - "capacity": 50000, - "replay_sequence_length": 1, - } - - OR - - { - "_enable_replay_buffer_api": True, - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 50000, - "prioritized_replay_alpha": 0.6, - "prioritized_replay_beta": 0.4, - "prioritized_replay_eps": 1e-6, - "replay_sequence_length": 1, - } - - Where - - prioritized_replay_alpha: Alpha parameter controls the degree of - prioritization in the buffer. In other words, when a buffer sample has - a higher temporal-difference error, with how much more probability - should it drawn to use to update the parametrized Q-network. 0.0 - corresponds to uniform probability. Setting much above 1.0 may quickly - result as the sampling distribution could become heavily “pointy” with - low entropy. - prioritized_replay_beta: Beta parameter controls the degree of - importance sampling which suppresses the influence of gradient updates - from samples that have higher probability of being sampled via alpha - parameter and the temporal-difference error. - prioritized_replay_eps: Epsilon parameter sets the baseline probability - for sampling so that when the temporal-difference error of a sample is - zero, there is still a chance of drawing the sample. - store_buffer_in_checkpoints: Set this to True, if you want the contents of - your buffer(s) to be stored in any saved checkpoints as well. - Warnings will be created if: - - This is True AND restoring from a checkpoint that contains no buffer - data. - - This is False AND restoring from a checkpoint that does contain - buffer data. - lr_schedule: Learning rate schedule. In the format of [[timestep, value], - [timestep, value], ...]. A schedule should normally start from - timestep 0. - adam_epsilon: Adam optimizer's epsilon hyper parameter. - grad_clip: If not None, clip gradients during optimization at this value. - num_steps_sampled_before_learning_starts: Number of timesteps to collect - from rollout workers before we start sampling from replay buffers for - learning. Whether we count this in agent steps or environment steps - depends on config.multi_agent(count_steps_by=..). - tau: Update the target by \tau * policy + (1-\tau) * target_policy. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if target_network_update_freq is not NotProvided: - self.target_network_update_freq = target_network_update_freq - if replay_buffer_config is not NotProvided: - # Override entire `replay_buffer_config` if `type` key changes. - # Update, if `type` key remains the same or is not specified. 
- new_replay_buffer_config = deep_update( - {"replay_buffer_config": self.replay_buffer_config}, - {"replay_buffer_config": replay_buffer_config}, - False, - ["replay_buffer_config"], - ["replay_buffer_config"], - ) - self.replay_buffer_config = new_replay_buffer_config["replay_buffer_config"] - if store_buffer_in_checkpoints is not NotProvided: - self.store_buffer_in_checkpoints = store_buffer_in_checkpoints - if lr_schedule is not NotProvided: - self.lr_schedule = lr_schedule - if adam_epsilon is not NotProvided: - self.adam_epsilon = adam_epsilon - if grad_clip is not NotProvided: - self.grad_clip = grad_clip - if num_steps_sampled_before_learning_starts is not NotProvided: - self.num_steps_sampled_before_learning_starts = ( - num_steps_sampled_before_learning_starts - ) - if tau is not NotProvided: - self.tau = tau - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call super's validation method. - super().validate() - - if self.exploration_config["type"] == "ParameterNoise": - if self.batch_mode != "complete_episodes": - raise ValueError( - "ParameterNoise Exploration requires `batch_mode` to be " - "'complete_episodes'. Try setting `config.rollouts(" - "batch_mode='complete_episodes')`." - ) - - if not self.in_evaluation: - validate_buffer_config(self) - - -class SimpleQ(Algorithm): - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return SimpleQConfig() - - @classmethod - @override(Algorithm) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - return SimpleQTorchPolicy - elif config["framework"] == "tf": - return SimpleQTF1Policy - else: - return SimpleQTF2Policy - - @override(Algorithm) - def training_step(self) -> ResultDict: - """Simple Q training iteration function. - - Simple Q consists of the following steps: - - Sample n MultiAgentBatches from n workers synchronously. - - Store new samples in the replay buffer. - - Sample one training MultiAgentBatch from the replay buffer. - - Learn on the training batch. - - Update the target network every `target_network_update_freq` sample steps. - - Return all collected training metrics for the iteration. - - Returns: - The results dict from executing the training iteration. - """ - batch_size = self.config.train_batch_size - local_worker = self.workers.local_worker() - - # Sample n MultiAgentBatches from n workers. - with self._timers[SAMPLE_TIMER]: - new_sample_batches = synchronous_parallel_sample( - worker_set=self.workers, concat=False - ) - - for batch in new_sample_batches: - # Update sampling step counters. - self._counters[NUM_ENV_STEPS_SAMPLED] += batch.env_steps() - self._counters[NUM_AGENT_STEPS_SAMPLED] += batch.agent_steps() - # Store new samples in the replay buffer - self.local_replay_buffer.add(batch) - - global_vars = { - "timestep": self._counters[NUM_ENV_STEPS_SAMPLED], - } - # Update target network every `target_network_update_freq` sample steps. - cur_ts = self._counters[ - NUM_AGENT_STEPS_SAMPLED - if self.config.count_steps_by == "agent_steps" - else NUM_ENV_STEPS_SAMPLED - ] - - if cur_ts > self.config.num_steps_sampled_before_learning_starts: - # Use deprecated replay() to support old replay buffers for now - train_batch = self.local_replay_buffer.sample(batch_size) - - # Learn on the training batch. 
- # Use simple optimizer (only for multi-agent or tf-eager; all other - # cases should use the multi-GPU optimizer, even if only using 1 GPU) - if self.config.get("simple_optimizer") is True: - train_results = train_one_step(self, train_batch) - else: - train_results = multi_gpu_train_one_step(self, train_batch) - - # Update replay buffer priorities. - update_priorities_in_replay_buffer( - self.local_replay_buffer, - self.config, - train_batch, - train_results, - ) - - last_update = self._counters[LAST_TARGET_UPDATE_TS] - if cur_ts - last_update >= self.config.target_network_update_freq: - with self._timers[TARGET_NET_UPDATE_TIMER]: - to_update = local_worker.get_policies_to_train() - local_worker.foreach_policy_to_train( - lambda p, pid: pid in to_update and p.update_target() - ) - self._counters[NUM_TARGET_UPDATES] += 1 - self._counters[LAST_TARGET_UPDATE_TS] = cur_ts - - # Update weights and global_vars - after learning on the local worker - - # on all remote workers (only those policies that were actually trained). - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - self.workers.sync_weights( - policies=list(train_results.keys()), - global_vars=global_vars, - ) - else: - train_results = {} - - # Return all collected metrics for the iteration. - return train_results diff --git a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/simple_q_tf_policy.py b/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/simple_q_tf_policy.py deleted file mode 100644 index fc0d80033bd36..0000000000000 --- a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/simple_q_tf_policy.py +++ /dev/null @@ -1,229 +0,0 @@ -"""TensorFlow policy class used for Simple Q-Learning""" - -import logging -from typing import Dict, List, Tuple, Type, Union - -from rllib_simple_q.simple_q.utils import make_q_models - -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_action_dist import Categorical, TFActionDistribution -from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import ( - LearningRateSchedule, - TargetNetworkMixin, - compute_gradients, -) -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import huber_loss -from ray.rllib.utils.typing import ( - LocalOptimizer, - ModelGradients, - TensorStructType, - TensorType, -) - -tf1, tf, tfv = try_import_tf() -logger = logging.getLogger(__name__) - - -# We need this builder function because we want to share the same -# custom logics between TF1 dynamic and TF2 eager policies. -def get_simple_q_tf_policy( - name: str, base: Type[Union[DynamicTFPolicyV2, EagerTFPolicyV2]] -) -> Type: - """Construct a SimpleQTFPolicy inheriting either dynamic or eager base policies. - - Args: - base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. - - Returns: - A TF Policy to be used with MAMLTrainer. - """ - - class SimpleQTFPolicy(LearningRateSchedule, TargetNetworkMixin, base): - def __init__( - self, - obs_space, - action_space, - config, - existing_model=None, - existing_inputs=None, - ): - # First thing first, enable eager execution if necessary. - base.enable_eager_execution_if_necessary() - # Initialize base class. 
- base.__init__( - self, - obs_space, - action_space, - config, - existing_inputs=existing_inputs, - existing_model=existing_model, - ) - - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. - self.maybe_initialize_optimizer_and_loss() - - TargetNetworkMixin.__init__(self) - - @override(base) - def make_model(self) -> ModelV2: - """Builds Q-model and target Q-model for Simple Q learning.""" - model, self.target_model = make_q_models(self) - return model - - @override(base) - def action_distribution_fn( - self, - model: ModelV2, - *, - obs_batch: TensorType, - state_batches: TensorType, - **kwargs, - ) -> Tuple[TensorType, type, List[TensorType]]: - # Compute the Q-values for each possible action, using our Q-value network. - q_vals = self._compute_q_values(self.model, obs_batch, is_training=False) - return q_vals, Categorical, state_batches - - def xyz_compute_actions( - self, - *, - input_dict, - explore=True, - timestep=None, - episodes=None, - is_training=False, - **kwargs, - ) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorStructType]]: - if timestep is None: - timestep = self.global_timestep - # Compute the Q-values for each possible action, using our Q-value network. - q_vals = self._compute_q_values( - self.model, input_dict[SampleBatch.OBS], is_training=is_training - ) - # Use a Categorical distribution for the exploration component. - # This way, it may either sample storchastically (e.g. when using SoftQ) - # or deterministically/greedily (e.g. when using EpsilonGreedy). - distribution = Categorical(q_vals, self.model) - # Call the exploration component's `get_exploration_action` method to - # explore, if necessary. - actions, logp = self.exploration.get_exploration_action( - action_distribution=distribution, timestep=timestep, explore=explore - ) - # Return (exploration) actions, state_outs (empty list), and extra outs. - return ( - actions, - [], - { - "q_values": q_vals, - SampleBatch.ACTION_LOGP: logp, - SampleBatch.ACTION_PROB: tf.exp(logp), - SampleBatch.ACTION_DIST_INPUTS: q_vals, - }, - ) - - @override(base) - def loss( - self, - model: Union[ModelV2, "tf.keras.Model"], - dist_class: Type[TFActionDistribution], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - # q network evaluation - q_t = self._compute_q_values(self.model, train_batch[SampleBatch.CUR_OBS]) - - # target q network evalution - q_tp1 = self._compute_q_values( - self.target_model, - train_batch[SampleBatch.NEXT_OBS], - ) - if not hasattr(self, "q_func_vars"): - self.q_func_vars = model.variables() - self.target_q_func_vars = self.target_model.variables() - - # q scores for actions which we know were selected in the given state. 
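            # Tiny worked example (illustrative only): with 2 actions,
            # actions=[0, 1] and q_t=[[1.0, 2.0], [3.0, 4.0]] give
            # one_hot_selection=[[1, 0], [0, 1]], so the reduce_sum below yields
            # q_t_selected=[1.0, 4.0], i.e. the Q-value of the taken action per row.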
- one_hot_selection = tf.one_hot( - tf.cast(train_batch[SampleBatch.ACTIONS], tf.int32), self.action_space.n - ) - q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1) - - # compute estimate of best possible value starting from state at t + 1 - dones = tf.cast(train_batch[SampleBatch.TERMINATEDS], tf.float32) - q_tp1_best_one_hot_selection = tf.one_hot( - tf.argmax(q_tp1, 1), self.action_space.n - ) - q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1) - q_tp1_best_masked = (1.0 - dones) * q_tp1_best - - # compute RHS of bellman equation - q_t_selected_target = ( - tf.cast(train_batch[SampleBatch.REWARDS], tf.float32) - + self.config["gamma"] * q_tp1_best_masked - ) - - # compute the error (potentially clipped) - td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - loss = tf.reduce_mean(huber_loss(td_error)) - - # save TD error as an attribute for outside access - self.td_error = td_error - - return loss - - @override(base) - def compute_gradients_fn( - self, optimizer: LocalOptimizer, loss: TensorType - ) -> ModelGradients: - return compute_gradients(self, optimizer, loss) - - @override(base) - def extra_learn_fetches_fn(self) -> Dict[str, TensorType]: - return { - "td_error": self.td_error, - "learner_stats": {"cur_lr": self.cur_lr}, - } - - @override(base) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - """Returns the learning rate in a stats dict. - - Args: - policy: The Policy object. - train_batch: The data used for training. - - Returns: - Dict[str, TensorType]: The stats dict. - """ - - return { - "cur_lr": self.cur_lr, - } - - def _compute_q_values( - self, model: ModelV2, obs_batch: TensorType, is_training=None - ) -> TensorType: - _is_training = ( - is_training - if is_training is not None - else self._get_is_training_placeholder() - ) - model_out, _ = model( - SampleBatch(obs=obs_batch, _is_training=_is_training), [], None - ) - - return model_out - - SimpleQTFPolicy.__name__ = name - SimpleQTFPolicy.__qualname__ = name - - return SimpleQTFPolicy - - -SimpleQTF1Policy = get_simple_q_tf_policy("SimpleQTF1Policy", DynamicTFPolicyV2) -SimpleQTF2Policy = get_simple_q_tf_policy("SimpleQTF2Policy", EagerTFPolicyV2) diff --git a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/simple_q_torch_policy.py b/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/simple_q_torch_policy.py deleted file mode 100644 index 07140b6b9213d..0000000000000 --- a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/simple_q_torch_policy.py +++ /dev/null @@ -1,182 +0,0 @@ -"""PyTorch policy class used for Simple Q-Learning""" - -import logging -from typing import Any, Dict, List, Tuple, Type, Union - -from rllib_simple_q.simple_q.utils import make_q_models - -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_action_dist import ( - TorchCategorical, - TorchDistributionWrapper, -) -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import LearningRateSchedule, TargetNetworkMixin -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.torch_utils import concat_multi_gpu_td_errors, huber_loss -from ray.rllib.utils.typing import TensorStructType, TensorType - -torch, nn = try_import_torch() -F = None -if nn: - F = nn.functional -logger = 
logging.getLogger(__name__) - - -class SimpleQTorchPolicy( - LearningRateSchedule, - TargetNetworkMixin, - TorchPolicyV2, -): - """PyTorch policy class used with SimpleQTrainer.""" - - def __init__(self, observation_space, action_space, config): - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config["model"]["max_seq_len"], - ) - - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() - - TargetNetworkMixin.__init__(self) - - @override(TorchPolicyV2) - def make_model(self) -> ModelV2: - """Builds q_model and target_model for Simple Q learning.""" - model, self.target_model = make_q_models(self) - return model - - @override(TorchPolicyV2) - def compute_actions( - self, - *, - input_dict, - explore=True, - timestep=None, - episodes=None, - is_training=False, - **kwargs - ) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorStructType]]: - if timestep is None: - timestep = self.global_timestep - # Compute the Q-values for each possible action, using our Q-value network. - q_vals = self._compute_q_values( - self.model, input_dict[SampleBatch.OBS], is_training=is_training - ) - # Use a Categorical distribution for the exploration component. - # This way, it may either sample storchastically (e.g. when using SoftQ) - # or deterministically/greedily (e.g. when using EpsilonGreedy). - distribution = TorchCategorical(q_vals, self.model) - # Call the exploration component's `get_exploration_action` method to - # explore, if necessary. - actions, logp = self.exploration.get_exploration_action( - action_distribution=distribution, timestep=timestep, explore=explore - ) - # Return (exploration) actions, state_outs (empty list), and extra outs. - return ( - actions, - [], - { - "q_values": q_vals, - SampleBatch.ACTION_LOGP: logp, - SampleBatch.ACTION_PROB: torch.exp(logp), - SampleBatch.ACTION_DIST_INPUTS: q_vals, - }, - ) - - @override(TorchPolicyV2) - def loss( - self, - model: ModelV2, - dist_class: Type[TorchDistributionWrapper], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - """Compute loss for SimpleQ. - - Args: - model: The Model to calculate the loss for. - dist_class: The action distr. class. - train_batch: The training data. - - Returns: - The SimpleQ loss tensor given the input batch. - """ - target_model = self.target_models[model] - - # q network evaluation - q_t = self._compute_q_values( - model, train_batch[SampleBatch.CUR_OBS], is_training=True - ) - - # target q network evalution - q_tp1 = self._compute_q_values( - target_model, - train_batch[SampleBatch.NEXT_OBS], - is_training=True, - ) - - # q scores for actions which we know were selected in the given state. - one_hot_selection = F.one_hot( - train_batch[SampleBatch.ACTIONS].long(), self.action_space.n - ) - q_t_selected = torch.sum(q_t * one_hot_selection, 1) - - # compute estimate of best possible value starting from state at t + 1 - dones = train_batch[SampleBatch.TERMINATEDS].float() - q_tp1_best_one_hot_selection = F.one_hot( - torch.argmax(q_tp1, 1), self.action_space.n - ) - q_tp1_best = torch.sum(q_tp1 * q_tp1_best_one_hot_selection, 1) - q_tp1_best_masked = (1.0 - dones) * q_tp1_best - - # compute RHS of bellman equation - q_t_selected_target = ( - train_batch[SampleBatch.REWARDS] + self.config["gamma"] * q_tp1_best_masked - ) - - # Compute the error (Square/Huber). 
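        # In equation form, per batch item (sketch of the two lines below):
        #   td_error = Q(s, a) - (r + gamma * (1 - done) * max_a' Q_target(s', a'))
        # with the final loss being the mean Huber loss over these errors.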
- td_error = q_t_selected - q_t_selected_target.detach() - loss = torch.mean(huber_loss(td_error)) - - # Store values for stats function in model (tower), such that for - # multi-GPU, we do not override them during the parallel loss phase. - model.tower_stats["loss"] = loss - # TD-error tensor in final stats - # will be concatenated and retrieved for each individual batch item. - model.tower_stats["td_error"] = td_error - - return loss - - @override(TorchPolicyV2) - def extra_compute_grad_fetches(self) -> Dict[str, Any]: - fetches = convert_to_numpy(concat_multi_gpu_td_errors(self)) - # Auto-add empty learner stats dict if needed. - return dict({LEARNER_STATS_KEY: {}}, **fetches) - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return convert_to_numpy( - { - "loss": torch.mean(torch.stack(self.get_tower_stats("loss"))), - "cur_lr": self.cur_lr, - } - ) - - def _compute_q_values( - self, model: ModelV2, obs_batch: TensorType, is_training=None - ) -> TensorType: - _is_training = is_training if is_training is not None else False - input_dict = SampleBatch(obs=obs_batch, _is_training=_is_training) - # Make sure, everything is PyTorch tensors. - model_out, _ = model(input_dict, [], None) - return model_out diff --git a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/utils.py b/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/utils.py deleted file mode 100644 index 234fd2790cf57..0000000000000 --- a/rllib_contrib/simple_q/src/rllib_simple_q/simple_q/utils.py +++ /dev/null @@ -1,34 +0,0 @@ -import gymnasium as gym - -from ray.rllib.models.catalog import ModelCatalog -from ray.rllib.utils.error import UnsupportedSpaceException - -Q_SCOPE = "q_func" -Q_TARGET_SCOPE = "target_q_func" - - -def make_q_models(policy): - if not isinstance(policy.action_space, gym.spaces.Discrete): - raise UnsupportedSpaceException( - f"Action space {policy.action_space} is not supported for DQN." 
- ) - - model = ModelCatalog.get_model_v2( - obs_space=policy.observation_space, - action_space=policy.action_space, - num_outputs=policy.action_space.n, - model_config=policy.config["model"], - framework=policy.config["framework"], - name=Q_SCOPE, - ) - - target_model = ModelCatalog.get_model_v2( - obs_space=policy.observation_space, - action_space=policy.action_space, - num_outputs=policy.action_space.n, - model_config=policy.config["model"], - framework=policy.config["framework"], - name=Q_TARGET_SCOPE, - ) - - return model, target_model diff --git a/rllib_contrib/simple_q/tests/test_repro_simple_q.py b/rllib_contrib/simple_q/tests/test_repro_simple_q.py deleted file mode 100644 index 65e224ac4ffac..0000000000000 --- a/rllib_contrib/simple_q/tests/test_repro_simple_q.py +++ /dev/null @@ -1,41 +0,0 @@ -import unittest - -import rllib_simple_q.simple_q.simple_q as simple_q - -import ray -from ray.rllib.examples.env.deterministic_envs import create_cartpole_deterministic -from ray.rllib.utils.test_utils import check_reproducibilty -from ray.tune import register_env - - -class TestReproSimpleQ(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_reproducibility_dqn_cartpole(self): - """Tests whether the algorithm is reproducible within 3 iterations - on discrete env cartpole.""" - - register_env("DeterministicCartPole-v1", create_cartpole_deterministic) - config = simple_q.SimpleQConfig().environment( - env="DeterministicCartPole-v1", env_config={"seed": 42} - ) - check_reproducibilty( - algo_class=simple_q.SimpleQ, - algo_config=config, - fw_kwargs={"frameworks": ("tf", "torch")}, - training_iteration=3, - ) - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/simple_q/tests/test_simple_q.py b/rllib_contrib/simple_q/tests/test_simple_q.py deleted file mode 100644 index e40e48feff108..0000000000000 --- a/rllib_contrib/simple_q/tests/test_simple_q.py +++ /dev/null @@ -1,205 +0,0 @@ -import unittest - -import numpy as np -import rllib_simple_q.simple_q.simple_q as simple_q -from rllib_simple_q.simple_q.simple_q_tf_policy import SimpleQTF2Policy -from rllib_simple_q.simple_q.simple_q_torch_policy import SimpleQTorchPolicy - -import ray -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.metrics.learner_info import ( - DEFAULT_POLICY_ID, - LEARNER_INFO, - LEARNER_STATS_KEY, -) -from ray.rllib.utils.numpy import fc, huber_loss, one_hot -from ray.rllib.utils.test_utils import ( - check, - check_compute_single_action, - check_train_results, - framework_iterator, -) - -tf1, tf, tfv = try_import_tf() - - -class TestSimpleQ(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_simple_q_compilation(self): - """Test whether SimpleQ can be built on all frameworks.""" - # Run locally and with compression - config = ( - simple_q.SimpleQConfig() - .rollouts(num_rollout_workers=0, compress_observations=True) - .training(num_steps_sampled_before_learning_starts=0) - ) - - num_iterations = 2 - - for _ in framework_iterator(config, with_eager_tracing=True): - algo = config.build(env="CartPole-v1") - rw = algo.workers.local_worker() - for i in range(num_iterations): - sb = rw.sample() - assert sb.count == config.rollout_fragment_length - results = algo.train() - 
check_train_results(results) - print(results) - - check_compute_single_action(algo) - - def test_simple_q_loss_function(self): - """Tests the Simple-Q loss function results on all frameworks.""" - config = simple_q.SimpleQConfig().rollouts(num_rollout_workers=0) - # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)). - config.training( - model={ - "fcnet_hiddens": [10], - "fcnet_activation": "linear", - }, - num_steps_sampled_before_learning_starts=0, - ).environment("CartPole-v1") - - for fw in framework_iterator(config): - # Generate Algorithm and get its default Policy object. - trainer = config.build() - policy = trainer.get_policy() - # Batch of size=2. - input_ = SampleBatch( - { - SampleBatch.CUR_OBS: np.random.random(size=(2, 4)), - SampleBatch.ACTIONS: np.array([0, 1]), - SampleBatch.REWARDS: np.array([0.4, -1.23]), - SampleBatch.TERMINATEDS: np.array([False, False]), - SampleBatch.NEXT_OBS: np.random.random(size=(2, 4)), - SampleBatch.EPS_ID: np.array([1234, 1234]), - SampleBatch.AGENT_INDEX: np.array([0, 0]), - SampleBatch.ACTION_LOGP: np.array([-0.1, -0.1]), - SampleBatch.ACTION_DIST_INPUTS: np.array( - [[0.1, 0.2], [-0.1, -0.2]] - ), - SampleBatch.ACTION_PROB: np.array([0.1, 0.2]), - "q_values": np.array([[0.1, 0.2], [0.2, 0.1]]), - } - ) - # Get model vars for computing expected model outs (q-vals). - # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias - vars = policy.get_weights() - if isinstance(vars, dict): - vars = list(vars.values()) - - vars_t = policy.target_model.variables() - if fw == "tf": - vars_t = policy.get_session().run(vars_t) - - # Q(s,a) outputs. - q_t = np.sum( - one_hot(input_[SampleBatch.ACTIONS], 2) - * fc( - fc( - input_[SampleBatch.CUR_OBS], - vars[0 if fw != "torch" else 2], - vars[1 if fw != "torch" else 3], - framework=fw, - ), - vars[2 if fw != "torch" else 0], - vars[3 if fw != "torch" else 1], - framework=fw, - ), - 1, - ) - # max[a'](Qtarget(s',a')) outputs. - q_target_tp1 = np.max( - fc( - fc( - input_[SampleBatch.NEXT_OBS], - vars_t[0 if fw != "torch" else 2], - vars_t[1 if fw != "torch" else 3], - framework=fw, - ), - vars_t[2 if fw != "torch" else 0], - vars_t[3 if fw != "torch" else 1], - framework=fw, - ), - 1, - ) - # TD-errors (Bellman equation). - td_error = q_t - config.gamma * input_[SampleBatch.REWARDS] + q_target_tp1 - # Huber/Square loss on TD-error. - expected_loss = huber_loss(td_error).mean() - - if fw == "torch": - input_ = policy._lazy_tensor_dict(input_) - # Get actual out and compare. - if fw == "tf": - out = policy.get_session().run( - policy._loss, - feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False), - ) - else: - out = (SimpleQTorchPolicy if fw == "torch" else SimpleQTF2Policy).loss( - policy, policy.model, None, input_ - ) - check(out, expected_loss, decimals=1) - - def test_simple_q_lr_schedule(self): - """Test PG with learning rate schedule.""" - config = simple_q.SimpleQConfig() - config.reporting( - min_sample_timesteps_per_iteration=10, - # Make sure that results contain info on default policy - min_train_timesteps_per_iteration=10, - # 0 metrics reporting delay, this makes sure timestep, - # which lr depends on, is updated after each worker rollout. - min_time_s_per_iteration=0, - ) - config.rollouts( - num_rollout_workers=1, - rollout_fragment_length=50, - ) - config.training(lr=0.2, lr_schedule=[[0, 0.2], [500, 0.001]]) - - def _step_n_times(algo, n: int): - """Step trainer n times. - - Returns: - learning rate at the end of the execution. 
- """ - for _ in range(n): - results = algo.train() - return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][ - "cur_lr" - ] - - for _ in framework_iterator(config): - algo = config.build(env="CartPole-v1") - - lr = _step_n_times(algo, 1) # 50 timesteps - # Close to 0.2 - self.assertGreaterEqual(lr, 0.15) - - lr = _step_n_times(algo, 8) # Close to 500 timesteps - # LR Annealed to 0.001 - self.assertLessEqual(float(lr), 0.5) - - lr = _step_n_times(algo, 2) # > 500 timesteps - # LR == 0.001 - self.assertAlmostEqual(lr, 0.001) - - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/simple_q/tuned_examples/__init__.py b/rllib_contrib/simple_q/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/simple_q/tuned_examples/cartpole-simpleq-fake-gpus.yaml b/rllib_contrib/simple_q/tuned_examples/cartpole-simpleq-fake-gpus.yaml deleted file mode 100644 index e4f7234a6f4f1..0000000000000 --- a/rllib_contrib/simple_q/tuned_examples/cartpole-simpleq-fake-gpus.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-simpleq-fake-gpus: - env: CartPole-v1 - run: SimpleQ - stop: - sampler_results/episode_reward_mean: 150 - training_iteration: 400 - config: - # Works for both torch and tf. - framework: torch - - # Fake 2 GPUs. - num_gpus: 2 - _fake_gpus: true diff --git a/rllib_contrib/simple_q/tuned_examples/cartpole-simpleq.yaml b/rllib_contrib/simple_q/tuned_examples/cartpole-simpleq.yaml deleted file mode 100644 index 8c92977500844..0000000000000 --- a/rllib_contrib/simple_q/tuned_examples/cartpole-simpleq.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -cartpole-simpleq: - env: CartPole-v1 - run: SimpleQ - stop: - sampler_results/episode_reward_mean: 150 - timesteps_total: 50000 - config: - # Works for both torch and tf. 
-    framework: torch
diff --git a/rllib_contrib/slate_q/BUILD b/rllib_contrib/slate_q/BUILD
deleted file mode 100644
index b615278616d2a..0000000000000
--- a/rllib_contrib/slate_q/BUILD
+++ /dev/null
@@ -1,32 +0,0 @@
-# Examples

-# Learning Tests

-# py_test(
-#     name = "learning_tests_interest_evolution_10_candidates_recsim_env_slate_q",
-#     main = "run_regression_tests.py",
-#     tags = ["team:rllib", "learning_tests", "rllib_contrib"],
-#     size = "large",
-#     srcs = ["run_regression_tests.py"],
-#     data = ["tuned_examples/interest-evolution-10-candidates-recsim-env-slate-q.yaml"],
-#     args = ["--dir=slate_q/tuned_examples/"]
-# )

-py_test(
-    name = "learning_tests_interest_evolution_10_candidates_recsim_env_slate_q_fake_gpus",
-    main = "run_regression_tests.py",
-    tags = ["team:rllib", "learning_tests", "rllib_contrib", "no_tf_eager_tracing"],
-    size = "large",
-    srcs = ["run_regression_tests.py"],
-    data = ["tuned_examples/interest-evolution-10-candidates-recsim-env-slate-q.yaml"],
-    args = ["--dir=slate_q/tuned_examples/"]
-)

-# Compilation Tests

-py_test(
-    name = "test_slate_q",
-    tags = ["team:rllib", "algorithms_dir"],
-    size = "large",
-    srcs = ["tests/test_slate_q.py"]
-)
diff --git a/rllib_contrib/slate_q/README.md b/rllib_contrib/slate_q/README.md
deleted file mode 100644
index 2bbda7c30d66a..0000000000000
--- a/rllib_contrib/slate_q/README.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# SlateQ (Reinforcement Learning for Recommendation)
-
-[SlateQ](https://storage.googleapis.com/pub-tools-public-publication-data/pdf/9f91de1fa0ac351ecb12e4062a37afb896aa1463.pdf) is a model-free RL method that builds on top of DQN and generates recommendation slates for recommender system environments. Since these types of environments come with large combinatorial action spaces, SlateQ mitigates this by decomposing the slate Q-value into single-item Q-values and solves the decomposed objective via mixed-integer programming and deep learning optimization. SlateQ can be evaluated on Google’s RecSim environment.
-
-
-## Installation
-
-```
-conda create -n rllib-slateq python=3.10
-conda activate rllib-slateq
-pip install -r requirements.txt
-pip install -e '.[development]'
-```
-
-## Usage
-
-[SlateQ Example]()
\ No newline at end of file
diff --git a/rllib_contrib/slate_q/examples/recommender_system_with_recsim_and_slateq.py b/rllib_contrib/slate_q/examples/recommender_system_with_recsim_and_slateq.py
deleted file mode 100644
index 91edf59c370a1..0000000000000
--- a/rllib_contrib/slate_q/examples/recommender_system_with_recsim_and_slateq.py
+++ /dev/null
@@ -1,163 +0,0 @@
-"""Using an RLlib-ready RecSim environment and the SlateQ algorithm
-for solving recommendation system problems.
- -This example supports three different RecSim (RLlib-ready) environments, -configured via the --env option: -- "long-term-satisfaction" -- "interest-exploration" -- "interest-evolution" -""" - -import argparse - -import numpy as np -from rllib_slate_q.slate_q import SlateQ, SlateQConfig -from scipy.stats import sem - -import ray -from ray import air, tune -from ray.rllib.examples.env.recommender_system_envs_with_recsim import ( - InterestEvolutionRecSimEnv, - InterestExplorationRecSimEnv, - LongTermSatisfactionRecSimEnv, -) - -parser = argparse.ArgumentParser() -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--env", - type=str, - default="interest-evolution", - choices=["interest-evolution", "interest-exploration", "long-term-satisfaction"], - help=("Select the RecSim env to use."), -) - -parser.add_argument( - "--random-test-episodes", - type=int, - default=0, - help="The number of test episodes to run with a random agent to figure out " - "up front what the random baseline reward is.", -) - -parser.add_argument("--tune-num-samples", type=int, default=1) - -parser.add_argument( - "--env-num-candidates", - type=int, - default=100, - help="The number of candidates that the agent has to pick " - "`--env-slate-size` from each timestep. These candidates will be " - "sampled by the environment's built-in document sampler model.", -) - -parser.add_argument( - "--num-steps-sampled-before-learning_starts", - type=int, - default=20000, - help="Number of timesteps to collect from rollout workers before we start " - "sampling from replay buffers for learning..", -) - -parser.add_argument( - "--env-slate-size", - type=int, - default=2, - help="The size of the slate to recommend (from out of " - "`--env-num-candidates` sampled docs) each timestep.", -) -parser.add_argument( - "--env-dont-resample-documents", - action="store_true", - help="Whether to NOT resample `--env-num-candidates` docs " - "each timestep. If set, the env will only sample `--env-num-candidates`" - " once at the beginning and the agent always has to pick " - "`--env-slate-size` docs from this sample.", -) - -parser.add_argument("--run-as-test", action="store_true") - - -def main(): - args = parser.parse_args() - ray.init() - - env_config = { - "num_candidates": args.env_num_candidates, - "resample_documents": not args.env_dont_resample_documents, - "slate_size": args.env_slate_size, - "seed": 0, - "convert_to_discrete_action_space": False, - } - - config = ( - SlateQConfig() - .environment( - InterestEvolutionRecSimEnv - if args.env == "interest-evolution" - else InterestExplorationRecSimEnv - if args.env == "interest-exploration" - else LongTermSatisfactionRecSimEnv, - env_config=env_config, - ) - .framework(args.framework) - .rollouts(num_rollout_workers=7) - .resources() - ) - - config.num_steps_sampled_before_learning_starts = ( - args.num_steps_sampled_before_learning_starts - ) - - # Perform a test run on the env with a random agent to see, what - # the random baseline reward is. - if args.random_test_episodes: - print( - f"Running {args.random_test_episodes} episodes to get a random " - "agent's baseline reward ..." 
- ) - env = config["env"](config=env_config) - env.reset() - num_episodes = 0 - episode_rewards = [] - episode_reward = 0.0 - while num_episodes < args.random_test_episodes: - action = env.action_space.sample() - _, r, d, _, _ = env.step(action) - episode_reward += r - if d: - num_episodes += 1 - episode_rewards.append(episode_reward) - episode_reward = 0.0 - env.reset() - print( - f"Ran {args.random_test_episodes} episodes with a random agent " - "reaching a mean episode return of " - f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}." - ) - - if args.run_as_test: - stop = {"training_iteration": 1} - else: - stop = { - "training_iteration": 200, - "timesteps_total": 150000, - "episode_reward_mean": 160, - } - - tune.Tuner( - SlateQ, - run_config=air.RunConfig( - stop=stop, - ), - param_space=config, - ).fit() - - -if __name__ == "__main__": - main() diff --git a/rllib_contrib/slate_q/pyproject.toml b/rllib_contrib/slate_q/pyproject.toml deleted file mode 100644 index d0f18c25b094b..0000000000000 --- a/rllib_contrib/slate_q/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-slate-q" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gym==0.26.2", "recsim==0.2.4", "gymnasium[mujoco]==0.26.3", "ray[rllib]==2.5.1"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "tensorflow-probability==0.19.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/slate_q/requirements.txt b/rllib_contrib/slate_q/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/slate_q/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/slate_q/src/rllib_slate_q/env/__init__.py b/rllib_contrib/slate_q/src/rllib_slate_q/env/__init__.py deleted file mode 100644 index d6bec1e9743d0..0000000000000 --- a/rllib_contrib/slate_q/src/rllib_slate_q/env/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from rllib_slate_q.env.recommender_system_envs_with_recsim import ( - InterestEvolutionRecSimEnv, - InterestExplorationRecSimEnv, - LongTermSatisfactionRecSimEnv, -) - -__all__ = [ - "InterestExplorationRecSimEnv", - "LongTermSatisfactionRecSimEnv", - "InterestEvolutionRecSimEnv", -] diff --git a/rllib_contrib/slate_q/src/rllib_slate_q/env/recommender_system_envs_with_recsim.py b/rllib_contrib/slate_q/src/rllib_slate_q/env/recommender_system_envs_with_recsim.py deleted file mode 100644 index 86c5d391097ee..0000000000000 --- a/rllib_contrib/slate_q/src/rllib_slate_q/env/recommender_system_envs_with_recsim.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Examples for RecSim envs ready to be used by RLlib Trainers - -RecSim is a configurable recommender systems simulation platform. -Source: https://github.com/google-research/recsim -""" - -from recsim import choice_model -from recsim.environments import interest_evolution as iev -from recsim.environments import interest_exploration as iex -from recsim.environments import long_term_satisfaction as lts - -from ray.rllib.env.wrappers.recsim import make_recsim_env -from ray.tune import register_env - -# Some built-in RecSim envs to test with. 
-# --------------------------------------- - -# Long-term satisfaction env: User has to pick from items that are either -# a) unhealthy, but taste good, or b) healthy, but have bad taste. -# Best strategy is to pick a mix of both to ensure long-term -# engagement. - - -def lts_user_model_creator(env_ctx): - return lts.LTSUserModel( - env_ctx["slate_size"], - user_state_ctor=lts.LTSUserState, - response_model_ctor=lts.LTSResponse, - ) - - -def lts_document_sampler_creator(env_ctx): - return lts.LTSDocumentSampler() - - -LongTermSatisfactionRecSimEnv = make_recsim_env( - recsim_user_model_creator=lts_user_model_creator, - recsim_document_sampler_creator=lts_document_sampler_creator, - reward_aggregator=lts.clicked_engagement_reward, -) - - -# Interest exploration env: Models the problem of active exploration -# of user interests. It is meant to illustrate popularity bias in -# recommender systems, where myopic maximization of engagement leads -# to bias towards documents that have wider appeal, -# whereas niche user interests remain unexplored. -def iex_user_model_creator(env_ctx): - return iex.IEUserModel( - env_ctx["slate_size"], - user_state_ctor=iex.IEUserState, - response_model_ctor=iex.IEResponse, - seed=env_ctx["seed"], - ) - - -def iex_document_sampler_creator(env_ctx): - return iex.IETopicDocumentSampler(seed=env_ctx["seed"]) - - -InterestExplorationRecSimEnv = make_recsim_env( - recsim_user_model_creator=iex_user_model_creator, - recsim_document_sampler_creator=iex_document_sampler_creator, - reward_aggregator=iex.total_clicks_reward, -) - - -# Interest evolution env: See https://github.com/google-research/recsim -# for more information. -def iev_user_model_creator(env_ctx): - return iev.IEvUserModel( - env_ctx["slate_size"], - choice_model_ctor=choice_model.MultinomialProportionalChoiceModel, - response_model_ctor=iev.IEvResponse, - user_state_ctor=iev.IEvUserState, - seed=env_ctx["seed"], - ) - - -# Extend IEvVideo to fix a bug caused by None cluster_ids. -class SingleClusterIEvVideo(iev.IEvVideo): - def __init__(self, doc_id, features, video_length=None, quality=None): - super(SingleClusterIEvVideo, self).__init__( - doc_id=doc_id, - features=features, - cluster_id=0, # single cluster. - video_length=video_length, - quality=quality, - ) - - -def iev_document_sampler_creator(env_ctx): - return iev.UtilityModelVideoSampler(doc_ctor=iev.IEvVideo, seed=env_ctx["seed"]) - - -InterestEvolutionRecSimEnv = make_recsim_env( - recsim_user_model_creator=iev_user_model_creator, - recsim_document_sampler_creator=iev_document_sampler_creator, - reward_aggregator=iev.clicked_watchtime_reward, -) - - -# Backward compatibility. 
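# (Sketch) The registration below lets configs refer to the wrapped env purely by
# its registered name, e.g. `config.environment("RecSim-v1")`, instead of passing
# the env class object directly.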
-register_env( - name="RecSim-v1", env_creator=lambda env_ctx: InterestEvolutionRecSimEnv(env_ctx) -) diff --git a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/__init__.py b/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/__init__.py deleted file mode 100644 index 38bfd972e8804..0000000000000 --- a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from rllib_slate_q.slate_q.slateq import SlateQ, SlateQConfig -from rllib_slate_q.slate_q.slateq_tf_policy import SlateQTFPolicy -from rllib_slate_q.slate_q.slateq_torch_policy import SlateQTorchPolicy - -from ray.tune.registry import register_trainable - -__all__ = ["SlateQConfig", "SlateQ", "SlateQTFPolicy", "SlateQTorchPolicy"] - -register_trainable("rllib-contrib-slate-q", SlateQ) diff --git a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq.py b/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq.py deleted file mode 100644 index 66b70768e9964..0000000000000 --- a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq.py +++ /dev/null @@ -1,244 +0,0 @@ -""" -SlateQ (Reinforcement Learning for Recommendation) -================================================== - -This file defines the algorithm class for the SlateQ algorithm from the -`"Reinforcement Learning for Slate-based Recommender Systems: A Tractable -Decomposition and Practical Methodology" `_ -paper. - -See `slateq_torch_policy.py` for the definition of the policy. Currently, only -PyTorch is supported. The algorithm is written and tested for Google's RecSim -environment (https://github.com/google-research/recsim). -""" - -import logging -from typing import Any, Dict, List, Optional, Type, Union - -from rllib_slate_q.slate_q.slateq_tf_policy import SlateQTFPolicy -from rllib_slate_q.slate_q.slateq_torch_policy import SlateQTorchPolicy - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.dqn.dqn import DQN -from ray.rllib.policy.policy import Policy -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE - -logger = logging.getLogger(__name__) - - -class SlateQConfig(AlgorithmConfig): - """Defines a configuration class from which a SlateQ Algorithm can be built. - - Example: - >>> from rllib_slate_q.slate_q import SlateQConfig - >>> config = SlateQConfig().training(lr=0.01).resources(num_gpus=1) - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from rllib_slate_q.slate_q import SlateQConfig - >>> from ray import air - >>> from ray import tune - >>> config = SlateQConfig() - >>> # Print out some default values. - >>> print(config.lr) # doctest: +SKIP - >>> # Update the config object. - >>> config = config.training( # doctest: +SKIP - ... lr=tune.grid_search([0.001, 0.0001])) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "SlateQ", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 160.0}), - ... param_space=config.to_dict(), - ... 
).fit() - """ - - def __init__(self): - """Initializes a PGConfig instance.""" - super().__init__(algo_class=SlateQ) - - # fmt: off - # __sphinx_doc_begin__ - - # SlateQ specific settings: - self.fcnet_hiddens_per_candidate = [256, 32] - self.target_network_update_freq = 3200 - self.tau = 1.0 - self.use_huber = False - self.huber_threshold = 1.0 - self.training_intensity = None - self.lr_schedule = None - self.lr_choice_model = 1e-3 - self.rmsprop_epsilon = 1e-5 - self.grad_clip = None - self.n_step = 1 - self.replay_buffer_config = { - "type": "MultiAgentPrioritizedReplayBuffer", - "capacity": 100000, - "prioritized_replay_alpha": 0.6, - # Beta parameter for sampling from prioritized replay buffer. - "prioritized_replay_beta": 0.4, - # Epsilon to add to the TD errors when updating priorities. - "prioritized_replay_eps": 1e-6, - # The number of continuous environment steps to replay at once. This may - # be set to greater than 1 to support recurrent models. - "replay_sequence_length": 1, - # Whether to compute priorities on workers. - "worker_side_prioritization": False, - } - # Number of timesteps to collect from rollout workers before we start - # sampling from replay buffers for learning. Whether we count this in agent - # steps or environment steps depends on config.multi_agent(count_steps_by=..). - self.num_steps_sampled_before_learning_starts = 20000 - - # Override some of AlgorithmConfig's default values with SlateQ-specific values. - self.exploration_config = { - # The Exploration class to use. - # Must be SlateEpsilonGreedy or SlateSoftQ to handle the problem that - # the action space of the policy is different from the space used inside - # the exploration component. - # E.g.: action_space=MultiDiscrete([5, 5]) <- slate-size=2, num-docs=5, - # but action distribution is Categorical(5*4) -> all possible unique slates. - "type": "SlateEpsilonGreedy", - "warmup_timesteps": 20000, - "epsilon_timesteps": 250000, - "final_epsilon": 0.01, - } - # Switch to greedy actions in evaluation workers. - self.evaluation_config = {"explore": False} - self.rollout_fragment_length = 4 - self.train_batch_size = 32 - self.lr = 0.00025 - self.min_sample_timesteps_per_iteration = 1000 - self.min_time_s_per_iteration = 1 - self.compress_observations = False - self._disable_preprocessor_api = True - # Switch to greedy actions in evaluation workers. - self.evaluation(evaluation_config=AlgorithmConfig.overrides(explore=False)) - # __sphinx_doc_end__ - # fmt: on - - # Deprecated config keys. - self.learning_starts = DEPRECATED_VALUE - - @override(AlgorithmConfig) - def training( - self, - *, - replay_buffer_config: Optional[Dict[str, Any]] = NotProvided, - fcnet_hiddens_per_candidate: Optional[List[int]] = NotProvided, - target_network_update_freq: Optional[int] = NotProvided, - tau: Optional[float] = NotProvided, - use_huber: Optional[bool] = NotProvided, - huber_threshold: Optional[float] = NotProvided, - training_intensity: Optional[float] = NotProvided, - lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, - lr_choice_model: Optional[bool] = NotProvided, - rmsprop_epsilon: Optional[float] = NotProvided, - grad_clip: Optional[float] = NotProvided, - n_step: Optional[int] = NotProvided, - num_steps_sampled_before_learning_starts: Optional[int] = NotProvided, - **kwargs, - ) -> "SlateQConfig": - """Sets the training related configuration. - - Args: - replay_buffer_config: The config dict to specify the replay buffer used. 
- May contain a `type` key (default: `MultiAgentPrioritizedReplayBuffer`) - indicating the class being used. All other keys specify the names - and values of kwargs passed to to this class' constructor. - fcnet_hiddens_per_candidate: Dense-layer setup for each the n (document) - candidate Q-network stacks. - target_network_update_freq: Update the target network every - `target_network_update_freq` sample steps. - tau: Update the target by \tau * policy + (1-\tau) * target_policy. - use_huber: If True, use huber loss instead of squared loss for critic - network. Conventionally, no need to clip gradients if using a huber - loss. - huber_threshold: The threshold for the Huber loss. - training_intensity: If set, this will fix the ratio of replayed from a - buffer and learned on timesteps to sampled from an environment and - stored in the replay buffer timesteps. Otherwise, the replay will - proceed at the native ratio determined by - `(train_batch_size / rollout_fragment_length)`. - lr_schedule: Learning rate schedule. In the format of - [[timestep, lr-value], [timestep, lr-value], ...] - Intermediary timesteps will be assigned to interpolated learning rate - values. A schedule should normally start from timestep 0. - lr_choice_model: Learning rate for adam optimizer for the user choice model. - So far, only relevant/supported for framework=torch. - rmsprop_epsilon: RMSProp epsilon hyperparameter. - grad_clip: If not None, clip gradients during optimization at this value. - n_step: N-step parameter for Q-learning. - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. - super().training(**kwargs) - - if replay_buffer_config is not NotProvided: - self.replay_buffer_config.update(replay_buffer_config) - if fcnet_hiddens_per_candidate is not NotProvided: - self.fcnet_hiddens_per_candidate = fcnet_hiddens_per_candidate - if target_network_update_freq is not NotProvided: - self.target_network_update_freq = target_network_update_freq - if tau is not NotProvided: - self.tau = tau - if use_huber is not NotProvided: - self.use_huber = use_huber - if huber_threshold is not NotProvided: - self.huber_threshold = huber_threshold - if training_intensity is not NotProvided: - self.training_intensity = training_intensity - if lr_schedule is not NotProvided: - self.lr_schedule = lr_schedule - if lr_choice_model is not NotProvided: - self.lr_choice_model = lr_choice_model - if rmsprop_epsilon is not NotProvided: - self.rmsprop_epsilon = rmsprop_epsilon - if grad_clip is not NotProvided: - self.grad_clip = grad_clip - if n_step is not NotProvided: - self.n_step = n_step - if num_steps_sampled_before_learning_starts is not NotProvided: - self.num_steps_sampled_before_learning_starts = ( - num_steps_sampled_before_learning_starts - ) - - return self - - -def calculate_round_robin_weights(config: AlgorithmConfig) -> List[float]: - """Calculate the round robin weights for the rollout and train steps""" - if not config["training_intensity"]: - return [1, 1] - # e.g., 32 / 4 -> native ratio of 8.0 - native_ratio = config["train_batch_size"] / config["rollout_fragment_length"] - # Training intensity is specified in terms of - # (steps_replayed / steps_sampled), so adjust for the native ratio. 
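    # Worked example (numbers for illustration only): with the SlateQ defaults
    # train_batch_size=32 and rollout_fragment_length=4, native_ratio is 8.0; a
    # training_intensity of 16 then yields weights [1, 2.0], i.e. roughly two
    # replay/train steps for every environment sampling step.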
- weights = [1, config["training_intensity"] / native_ratio] - return weights - - -class SlateQ(DQN): - @classmethod - @override(DQN) - def get_default_config(cls) -> AlgorithmConfig: - return SlateQConfig() - - @classmethod - @override(DQN) - def get_default_policy_class( - cls, config: AlgorithmConfig - ) -> Optional[Type[Policy]]: - if config["framework"] == "torch": - return SlateQTorchPolicy - else: - return SlateQTFPolicy diff --git a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_tf_model.py b/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_tf_model.py deleted file mode 100644 index d8afd2161069b..0000000000000 --- a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_tf_model.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Tensorflow model for SlateQ""" - -from typing import List - -import gymnasium as gym - -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.typing import ModelConfigDict, TensorType - -tf1, tf, tfv = try_import_tf() - - -class SlateQTFModel(TFModelV2): - def __init__( - self, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - num_outputs: int, - model_config: ModelConfigDict, - name: str, - fcnet_hiddens_per_candidate=(256, 32), - ): - """Initializes a SlateQTFModel instance. - - Each document candidate receives one full Q-value stack, defined by - `fcnet_hiddens_per_candidate`. The input to each of these Q-value stacks - is always {[user] concat [document[i]] for i in document_candidates}. - - Extra model kwargs: - fcnet_hiddens_per_candidate: List of layer-sizes for each(!) of the - candidate documents. - """ - super(SlateQTFModel, self).__init__( - obs_space, action_space, None, model_config, name - ) - - self.embedding_size = self.obs_space["doc"]["0"].shape[0] - self.num_candidates = len(self.obs_space["doc"]) - assert self.obs_space["user"].shape[0] == self.embedding_size - - # Setup the Q head output (i.e., model for get_q_values) - self.user_in = tf.keras.layers.Input( - shape=(self.embedding_size,), name="user_in" - ) - self.docs_in = tf.keras.layers.Input( - shape=(self.embedding_size * self.num_candidates,), name="docs_in" - ) - - self.num_outputs = num_outputs - - q_outs = [] - for i in range(self.num_candidates): - doc = self.docs_in[ - :, self.embedding_size * i : self.embedding_size * (i + 1) - ] - out = tf.keras.layers.concatenate([self.user_in, doc], axis=1) - for h in fcnet_hiddens_per_candidate: - out = tf.keras.layers.Dense(h, activation=tf.nn.relu)(out) - q_value = tf.keras.layers.Dense(1, name=f"q_value_{i}")(out) - q_outs.append(q_value) - q_outs = tf.concat(q_outs, axis=1) - - self.q_value_head = tf.keras.Model([self.user_in, self.docs_in], q_outs) - - def get_q_values(self, user: TensorType, docs: List[TensorType]) -> TensorType: - """Returns Q-values, 1 for each candidate document, given user and doc tensors. - - Args: - user: [B x u] where u=embedding of user features. - docs: List[[B x d]] where d=embedding of doc features. Each item in the - list represents one document candidate. - - Returns: - Tensor ([batch, num candidates) of Q-values. - 1 Q-value per document candidate. 
- """ - return self.q_value_head([user, tf.concat(docs, 1)]) diff --git a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_tf_policy.py b/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_tf_policy.py deleted file mode 100644 index c103be490f9b8..0000000000000 --- a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_tf_policy.py +++ /dev/null @@ -1,372 +0,0 @@ -"""TensorFlow policy class used for SlateQ.""" - -import functools -import logging -from typing import Dict - -import gymnasium as gym -import numpy as np -from rllib_slate_q.slate_q.slateq_tf_model import SlateQTFModel - -from ray.rllib.algorithms.dqn.dqn_tf_policy import clip_gradients -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_action_dist import SlateMultiCategorical -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import LearningRateSchedule, TargetNetworkMixin -from ray.rllib.policy.tf_policy_template import build_tf_policy -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import huber_loss -from ray.rllib.utils.typing import AlgorithmConfigDict, TensorType - -tf1, tf, tfv = try_import_tf() -logger = logging.getLogger(__name__) - - -def build_slateq_model( - policy: Policy, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> SlateQTFModel: - """Build models for the SlateQTFPolicy. - - Args: - policy: The policy, which will use the model for optimization. - obs_space: The policy's observation space. - action_space: The policy's action space. - config: The Algorithm's config dict. - - Returns: - The slate-Q specific Q-model instance. - """ - model = SlateQTFModel( - obs_space, - action_space, - num_outputs=action_space.nvec[0], - model_config=config["model"], - name="slateq_model", - fcnet_hiddens_per_candidate=config["fcnet_hiddens_per_candidate"], - ) - - policy.target_model = SlateQTFModel( - obs_space, - action_space, - num_outputs=action_space.nvec[0], - model_config=config["model"], - name="target_slateq_model", - fcnet_hiddens_per_candidate=config["fcnet_hiddens_per_candidate"], - ) - - return model - - -def build_slateq_losses( - policy: Policy, - model: ModelV2, - _, - train_batch: SampleBatch, -) -> TensorType: - """Constructs the choice- and Q-value losses for the SlateQTorchPolicy. - - Args: - policy: The Policy to calculate the loss for. - model: The Model to calculate the loss for. - train_batch: The training data. - - Returns: - The Q-value loss tensor. - """ - - # B=batch size - # S=slate size - # C=num candidates - # E=embedding size - # A=number of all possible slates - - # Q-value computations. - # --------------------- - observation = train_batch[SampleBatch.OBS] - # user.shape: [B, E] - user_obs = observation["user"] - batch_size = tf.shape(user_obs)[0] - # doc.shape: [B, C, E] - doc_obs = list(observation["doc"].values()) - # action.shape: [B, S] - actions = train_batch[SampleBatch.ACTIONS] - - # click_indicator.shape: [B, S] - click_indicator = tf.cast( - tf.stack([k["click"] for k in observation["response"]], 1), tf.float32 - ) - # item_reward.shape: [B, S] - item_reward = tf.stack([k["watch_time"] for k in observation["response"]], 1) - # q_values.shape: [B, C] - q_values = model.get_q_values(user_obs, doc_obs) - # slate_q_values.shape: [B, S] - slate_q_values = tf.gather( - q_values, tf.cast(actions, dtype=tf.int32), batch_dims=-1 - ) - # Only get the Q from the clicked document. 
- # replay_click_q.shape: [B] - replay_click_q = tf.reduce_sum( - input_tensor=slate_q_values * click_indicator, axis=1, name="replay_click_q" - ) - - # Target computations. - # -------------------- - next_obs = train_batch[SampleBatch.NEXT_OBS] - - # user.shape: [B, E] - user_next_obs = next_obs["user"] - # doc.shape: [B, C, E] - doc_next_obs = list(next_obs["doc"].values()) - # Only compute the watch time reward of the clicked item. - reward = tf.reduce_sum(input_tensor=item_reward * click_indicator, axis=1) - - # TODO: Find out, whether it's correct here to use obs, not next_obs! - # Dopamine uses obs, then next_obs only for the score. - # next_q_values = policy.target_model.get_q_values(user_next_obs, doc_next_obs) - next_q_values = policy.target_model.get_q_values(user_obs, doc_obs) - scores, score_no_click = score_documents(user_next_obs, doc_next_obs) - - # next_q_values_slate.shape: [B, A, S] - next_q_values_slate = tf.gather(next_q_values, policy.slates, axis=1) - # scores_slate.shape [B, A, S] - scores_slate = tf.gather(scores, policy.slates, axis=1) - # score_no_click_slate.shape: [B, A] - score_no_click_slate = tf.reshape( - tf.tile(score_no_click, tf.shape(input=policy.slates)[:1]), [batch_size, -1] - ) - - # next_q_target_slate.shape: [B, A] - next_q_target_slate = tf.reduce_sum( - input_tensor=next_q_values_slate * scores_slate, axis=2 - ) / (tf.reduce_sum(input_tensor=scores_slate, axis=2) + score_no_click_slate) - next_q_target_max = tf.reduce_max(input_tensor=next_q_target_slate, axis=1) - - target = reward + policy.config["gamma"] * next_q_target_max * ( - 1.0 - tf.cast(train_batch[SampleBatch.TERMINATEDS], tf.float32) - ) - target = tf.stop_gradient(target) - - clicked = tf.reduce_sum(input_tensor=click_indicator, axis=1) - clicked_indices = tf.squeeze(tf.where(tf.equal(clicked, 1)), axis=1) - # Clicked_indices is a vector and tf.gather selects the batch dimension. 
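    # Note: q_clicked and target_clicked (gathered below) only feed the stats;
    # the TD error itself is masked via tf.where so that rows without a click
    # contribute zero loss.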
- q_clicked = tf.gather(replay_click_q, clicked_indices) - target_clicked = tf.gather(target, clicked_indices) - - td_error = tf.where( - tf.cast(clicked, tf.bool), - replay_click_q - target, - tf.zeros_like(train_batch[SampleBatch.REWARDS]), - ) - if policy.config["use_huber"]: - loss = huber_loss(td_error, delta=policy.config["huber_threshold"]) - else: - loss = tf.math.square(td_error) - loss = tf.reduce_mean(loss) - td_error = tf.abs(td_error) - mean_td_error = tf.reduce_mean(td_error) - - policy._q_values = tf.reduce_mean(q_values) - policy._q_clicked = tf.reduce_mean(q_clicked) - policy._scores = tf.reduce_mean(scores) - policy._score_no_click = tf.reduce_mean(score_no_click) - policy._slate_q_values = tf.reduce_mean(slate_q_values) - policy._replay_click_q = tf.reduce_mean(replay_click_q) - policy._bellman_reward = tf.reduce_mean(reward) - policy._next_q_values = tf.reduce_mean(next_q_values) - policy._target = tf.reduce_mean(target) - policy._next_q_target_slate = tf.reduce_mean(next_q_target_slate) - policy._next_q_target_max = tf.reduce_mean(next_q_target_max) - policy._target_clicked = tf.reduce_mean(target_clicked) - policy._q_loss = loss - policy._td_error = td_error - policy._mean_td_error = mean_td_error - policy._mean_actions = tf.reduce_mean(actions) - - return loss - - -def build_slateq_stats(policy: Policy, batch) -> Dict[str, TensorType]: - stats = { - "q_values": policy._q_values, - "q_clicked": policy._q_clicked, - "scores": policy._scores, - "score_no_click": policy._score_no_click, - "slate_q_values": policy._slate_q_values, - "replay_click_q": policy._replay_click_q, - "bellman_reward": policy._bellman_reward, - "next_q_values": policy._next_q_values, - "target": policy._target, - "next_q_target_slate": policy._next_q_target_slate, - "next_q_target_max": policy._next_q_target_max, - "target_clicked": policy._target_clicked, - "mean_td_error": policy._mean_td_error, - "q_loss": policy._q_loss, - "mean_actions": policy._mean_actions, - } - return stats - - -def action_distribution_fn( - policy: Policy, model: SlateQTFModel, input_dict, *, explore, is_training, **kwargs -): - """Determine which action to take.""" - - # First, we transform the observation into its unflattened form. - observation = input_dict[SampleBatch.OBS] - # user.shape: [B, E] - user_obs = observation["user"] - doc_obs = list(observation["doc"].values()) - - # Compute scores per candidate. - scores, score_no_click = score_documents(user_obs, doc_obs) - # Compute Q-values per candidate. 
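    # Shape sketch: `scores` and the `q_values` computed below are both [B, C];
    # get_per_slate_q_values then combines them into per_slate_q_values of shape
    # [B, A] (one Q-value per possible slate), which parameterizes the
    # SlateMultiCategorical action distribution returned here.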
- q_values = model.get_q_values(user_obs, doc_obs) - - with tf.name_scope("select_slate"): - per_slate_q_values = get_per_slate_q_values( - policy.slates, score_no_click, scores, q_values - ) - return ( - per_slate_q_values, - functools.partial( - SlateMultiCategorical, - action_space=policy.action_space, - all_slates=policy.slates, - ), - [], - ) - - -def get_per_slate_q_values(slates, s_no_click, s, q): - slate_q_values = tf.gather(s * q, slates, axis=1) - slate_scores = tf.gather(s, slates, axis=1) - slate_normalizer = tf.reduce_sum( - input_tensor=slate_scores, axis=2 - ) + tf.expand_dims(s_no_click, 1) - - slate_q_values = slate_q_values / tf.expand_dims(slate_normalizer, 2) - slate_sum_q_values = tf.reduce_sum(input_tensor=slate_q_values, axis=2) - return slate_sum_q_values - - -def score_documents( - user_obs, doc_obs, no_click_score=1.0, multinomial_logits=False, min_normalizer=-1.0 -): - """Computes dot-product scores for user vs doc (plus no-click) feature vectors.""" - - # Dot product between used and each document feature vector. - scores_per_candidate = tf.reduce_sum( - tf.multiply(tf.expand_dims(user_obs, 1), tf.stack(doc_obs, axis=1)), 2 - ) - # Compile a constant no-click score tensor. - score_no_click = tf.fill([tf.shape(user_obs)[0], 1], no_click_score) - # Concatenate click and no-click scores. - all_scores = tf.concat([scores_per_candidate, score_no_click], axis=1) - - # Logits: Softmax to yield probabilities. - if multinomial_logits: - all_scores = tf.nn.softmax(all_scores) - # Multinomial proportional model: Shift to `[0.0,..[`. - else: - all_scores = all_scores - min_normalizer - - # Return click (per candidate document) and no-click scores. - return all_scores[:, :-1], all_scores[:, -1] - - -def setup_early(policy, obs_space, action_space, config): - """Obtain all possible slates given current docs in the candidate set.""" - - num_candidates = action_space.nvec[0] - slate_size = len(action_space.nvec) - num_all_slates = np.prod([(num_candidates - i) for i in range(slate_size)]) - - mesh_args = [list(range(num_candidates))] * slate_size - slates = tf.stack(tf.meshgrid(*mesh_args), axis=-1) - slates = tf.reshape(slates, shape=(-1, slate_size)) - # Filter slates that include duplicates to ensure each document is picked - # at most once. - unique_mask = tf.map_fn( - lambda x: tf.equal(tf.size(input=x), tf.size(input=tf.unique(x)[0])), - slates, - dtype=tf.bool, - ) - # slates.shape: [A, S] - slates = tf.boolean_mask(tensor=slates, mask=unique_mask) - slates.set_shape([num_all_slates, slate_size]) - - # Store all possible slates only once in policy object. - policy.slates = slates - - -def setup_mid_mixins(policy: Policy, obs_space, action_space, config) -> None: - """Call mixin classes' constructors before SlateQTorchPolicy loss initialization. - - Args: - policy: The Policy object. - obs_space: The Policy's observation space. - action_space: The Policy's action space. - config: The Policy's config. - """ - LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) - - -def setup_late_mixins( - policy: Policy, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> None: - """Call mixin classes' constructors after SlateQTorchPolicy loss initialization. - - Args: - policy: The Policy object. - obs_space: The Policy's observation space. - action_space: The Policy's action space. - config: The Policy's config. 
- """ - TargetNetworkMixin.__init__(policy) - - -def rmsprop_optimizer( - policy: Policy, config: AlgorithmConfigDict -) -> "tf.keras.optimizers.Optimizer": - if policy.config["framework"] == "tf2": - return tf.keras.optimizers.RMSprop( - learning_rate=policy.cur_lr, - epsilon=config["rmsprop_epsilon"], - weight_decay=0.95, - momentum=0.0, - centered=True, - ) - else: - return tf1.train.RMSPropOptimizer( - learning_rate=policy.cur_lr, - epsilon=config["rmsprop_epsilon"], - decay=0.95, - momentum=0.0, - centered=True, - ) - - -SlateQTFPolicy = build_tf_policy( - name="SlateQTFPolicy", - get_default_config=lambda: rllib_slate_q.slate_q.slateq.SlateQConfig(), # noqa - # Build model, loss functions, and optimizers - make_model=build_slateq_model, - loss_fn=build_slateq_losses, - stats_fn=build_slateq_stats, - extra_learn_fetches_fn=lambda policy: {"td_error": policy._td_error}, - optimizer_fn=rmsprop_optimizer, - # Define how to act. - action_distribution_fn=action_distribution_fn, - compute_gradients_fn=clip_gradients, - before_init=setup_early, - before_loss_init=setup_mid_mixins, - after_init=setup_late_mixins, - mixins=[LearningRateSchedule, TargetNetworkMixin], -) diff --git a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_torch_model.py b/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_torch_model.py deleted file mode 100644 index eb3f4265ba07d..0000000000000 --- a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_torch_model.py +++ /dev/null @@ -1,186 +0,0 @@ -from typing import List, Sequence - -import gymnasium as gym - -from ray.rllib.models.torch.misc import SlimFC -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.typing import ModelConfigDict, TensorType - -torch, nn = try_import_torch() -F = None -if nn: - F = nn.functional - - -class QValueModel(nn.Module): - def __init__( - self, - obs_space: gym.spaces.Space, - fcnet_hiddens_per_candidate=(256, 32), - ): - """Initializes a QValueModel instance. - - Each document candidate receives one full Q-value stack, defined by - `fcnet_hiddens_per_candidate`. The input to each of these Q-value stacks - is always {[user] concat [document[i]] for i in document_candidates}. - - Extra model kwargs: - fcnet_hiddens_per_candidate: List of layer-sizes for each(!) of the - candidate documents. - """ - super().__init__() - - self.orig_obs_space = obs_space - self.embedding_size = self.orig_obs_space["doc"]["0"].shape[0] - self.num_candidates = len(self.orig_obs_space["doc"]) - assert self.orig_obs_space["user"].shape[0] == self.embedding_size - - self.q_nets = nn.ModuleList() - for i in range(self.num_candidates): - layers = nn.Sequential() - ins = 2 * self.embedding_size - for j, h in enumerate(fcnet_hiddens_per_candidate): - layers.add_module( - f"q_layer_{i}_{j}", - SlimFC(in_size=ins, out_size=h, activation_fn="relu"), - ) - ins = h - layers.add_module(f"q_out_{i}", SlimFC(ins, 1, activation_fn=None)) - - self.q_nets.append(layers) - - def forward(self, user: TensorType, docs: List[TensorType]) -> TensorType: - """Returns Q-values, 1 for each candidate document, given user and doc tensors. - - Args: - user: [B x u] where u=embedding of user features. - docs: List[[B x d]] where d=embedding of doc features. Each item in the - list represents one document candidate. - - Returns: - Tensor ([batch, num candidates) of Q-values. - 1 Q-value per document candidate. 
- """ - q_outs = [] - for i in range(self.num_candidates): - user_cat_doc = torch.cat([user, docs[i]], dim=1) - q_outs.append(self.q_nets[i](user_cat_doc)) - - return torch.cat(q_outs, dim=1) - - -class UserChoiceModel(nn.Module): - """The user choice model for SlateQ. - - This class implements a multinomial logit model for predicting user clicks. - - Under this model, the click probability of a document is proportional to: - - .. math:: - \exp(\text{beta} * \text{doc_user_affinity} + \text{score_no_click}) - """ - - def __init__(self): - """Initializes a UserChoiceModel instance.""" - super().__init__() - self.beta = nn.Parameter(torch.tensor(0.0, dtype=torch.float)) - self.score_no_click = nn.Parameter(torch.tensor(0.0, dtype=torch.float)) - - def forward(self, user: TensorType, doc: TensorType) -> TensorType: - """Evaluate the user choice model. - - This function outputs user click scores for candidate documents. The - exponentials of these scores are proportional user click probabilities. - Here we return the scores unnormalized because only some of the - documents will be selected and shown to the user. - - Args: - user: User embeddings of shape (batch_size, user embedding size). - doc: Doc embeddings of shape (batch_size, num_docs, doc embedding size). - - Returns: - score: logits of shape (batch_size, num_docs + 1), - where the last dimension represents no_click. - """ - batch_size = user.shape[0] - # Reduce across the embedding axis. - s = torch.einsum("be,bde->bd", user, doc) - # s=[batch, num-docs] - - # Multiply with learnable single "click" weight. - s = s * self.beta - # Add the learnable no-click score. - s = torch.cat([s, self.score_no_click.expand((batch_size, 1))], dim=1) - - return s - - -class SlateQTorchModel(TorchModelV2, nn.Module): - """Initializes a SlateQTFModel instance. - - Model includes both the user choice model and the Q-value model. - - For the Q-value model, each document candidate receives one full Q-value - stack, defined by `fcnet_hiddens_per_candidate`. The input to each of these - Q-value stacks is always {[user] concat [document[i]] for i in document_candidates}. - - Extra model kwargs: - fcnet_hiddens_per_candidate: List of layer-sizes for each(!) of the - candidate documents. - """ - - def __init__( - self, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - num_outputs: int, - model_config: ModelConfigDict, - name: str, - *, - fcnet_hiddens_per_candidate: Sequence[int] = (256, 32), - double_q: bool = True, - ): - """Initializes a SlateQModel instance. - - Args: - user_embedding_size: The size of the user embedding (number of - user specific features). - doc_embedding_size: The size of the doc embedding (number of doc - specific features). - num_docs: The number of docs to select a slate from. Note that the slate - size is inferred from the action space. - fcnet_hiddens_per_candidate: List of layer-sizes for each(!) of the - candidate documents. - double_q: Whether "double Q-learning" is applied in the loss function. - """ - nn.Module.__init__(self) - TorchModelV2.__init__( - self, - obs_space, - action_space, - # This required parameter (num_outputs) seems redundant: it has no - # real impact, and can be set arbitrarily. TODO: fix this. 
- num_outputs=0, - model_config=model_config, - name=name, - ) - self.num_outputs = num_outputs - - self.choice_model = UserChoiceModel() - - self.q_model = QValueModel(self.obs_space, fcnet_hiddens_per_candidate) - - def get_q_values(self, user: TensorType, docs: List[TensorType]) -> TensorType: - """Returns Q-values, 1 for each candidate document, given user and doc tensors. - - Args: - user: [B x u] where u=embedding of user features. - docs: List[[B x d]] where d=embedding of doc features. Each item in the - list represents one document candidate. - - Returns: - Tensor ([batch, num candidates) of Q-values. - 1 Q-value per document candidate. - """ - return self.q_model(user, docs) diff --git a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_torch_policy.py b/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_torch_policy.py deleted file mode 100644 index e2a203cbe955f..0000000000000 --- a/rllib_contrib/slate_q/src/rllib_slate_q/slate_q/slateq_torch_policy.py +++ /dev/null @@ -1,439 +0,0 @@ -"""PyTorch policy class used for SlateQ.""" - -import logging -from typing import Dict, Tuple, Type - -import gymnasium as gym -import numpy as np -from rllib_slate_q.slate_q.slateq_torch_model import SlateQTorchModel - -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.torch.torch_action_dist import ( - TorchCategorical, - TorchDistributionWrapper, -) -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.policy_template import build_policy_class -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import TargetNetworkMixin -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import ( - apply_grad_clipping, - concat_multi_gpu_td_errors, - convert_to_torch_tensor, - huber_loss, -) -from ray.rllib.utils.typing import AlgorithmConfigDict, TensorType - -torch, nn = try_import_torch() -logger = logging.getLogger(__name__) - - -def build_slateq_model_and_distribution( - policy: Policy, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> Tuple[ModelV2, Type[TorchDistributionWrapper]]: - """Build models for SlateQ - - Args: - policy: The policy, which will use the model for optimization. - obs_space: The policy's observation space. - action_space: The policy's action space. - config: The Algorithm's config dict. - - Returns: - Tuple consisting of 1) Q-model and 2) an action distribution class. - """ - model = SlateQTorchModel( - obs_space, - action_space, - num_outputs=action_space.nvec[0], - model_config=config["model"], - name="slateq_model", - fcnet_hiddens_per_candidate=config["fcnet_hiddens_per_candidate"], - ) - - policy.target_model = SlateQTorchModel( - obs_space, - action_space, - num_outputs=action_space.nvec[0], - model_config=config["model"], - name="target_slateq_model", - fcnet_hiddens_per_candidate=config["fcnet_hiddens_per_candidate"], - ) - - return model, TorchCategorical - - -def build_slateq_losses( - policy: Policy, - model: ModelV2, - _, - train_batch: SampleBatch, -) -> TensorType: - """Constructs the choice- and Q-value losses for the SlateQTorchPolicy. - - Args: - policy: The Policy to calculate the loss for. - model: The Model to calculate the loss for. - train_batch: The training data. - - Returns: - The user-choice- and Q-value loss tensors. - """ - - # B=batch size - # S=slate size - # C=num candidates - # E=embedding size - # A=number of all possible slates - - # Q-value computations. 
- # --------------------- - # action.shape: [B, S] - actions = train_batch[SampleBatch.ACTIONS] - - observation = convert_to_torch_tensor( - train_batch[SampleBatch.OBS], device=actions.device - ) - # user.shape: [B, E] - user_obs = observation["user"] - batch_size, embedding_size = user_obs.shape - # doc.shape: [B, C, E] - doc_obs = list(observation["doc"].values()) - - A, S = policy.slates.shape - - # click_indicator.shape: [B, S] - click_indicator = torch.stack( - [k["click"] for k in observation["response"]], 1 - ).float() - # item_reward.shape: [B, S] - item_reward = torch.stack([k["watch_time"] for k in observation["response"]], 1) - # q_values.shape: [B, C] - q_values = model.get_q_values(user_obs, doc_obs) - # slate_q_values.shape: [B, S] - slate_q_values = torch.take_along_dim(q_values, actions.long(), dim=-1) - # Only get the Q from the clicked document. - # replay_click_q.shape: [B] - replay_click_q = torch.sum(slate_q_values * click_indicator, dim=1) - - # Target computations. - # -------------------- - next_obs = convert_to_torch_tensor( - train_batch[SampleBatch.NEXT_OBS], device=actions.device - ) - - # user.shape: [B, E] - user_next_obs = next_obs["user"] - # doc.shape: [B, C, E] - doc_next_obs = list(next_obs["doc"].values()) - # Only compute the watch time reward of the clicked item. - reward = torch.sum(item_reward * click_indicator, dim=1) - - # TODO: Find out, whether it's correct here to use obs, not next_obs! - # Dopamine uses obs, then next_obs only for the score. - # next_q_values = policy.target_model.get_q_values(user_next_obs, doc_next_obs) - next_q_values = policy.target_models[model].get_q_values(user_obs, doc_obs) - scores, score_no_click = score_documents(user_next_obs, doc_next_obs) - - # next_q_values_slate.shape: [B, A, S] - indices = policy.slates_indices.to(next_q_values.device) - next_q_values_slate = torch.take_along_dim(next_q_values, indices, dim=1).reshape( - [-1, A, S] - ) - # scores_slate.shape [B, A, S] - scores_slate = torch.take_along_dim(scores, indices, dim=1).reshape([-1, A, S]) - # score_no_click_slate.shape: [B, A] - score_no_click_slate = torch.reshape( - torch.tile(score_no_click, policy.slates.shape[:1]), [batch_size, -1] - ) - - # next_q_target_slate.shape: [B, A] - next_q_target_slate = torch.sum(next_q_values_slate * scores_slate, dim=2) / ( - torch.sum(scores_slate, dim=2) + score_no_click_slate - ) - next_q_target_max, _ = torch.max(next_q_target_slate, dim=1) - - target = reward + policy.config["gamma"] * next_q_target_max * ( - 1.0 - train_batch[SampleBatch.TERMINATEDS].float() - ) - target = target.detach() - - clicked = torch.sum(click_indicator, dim=1) - mask_clicked_slates = clicked > 0 - clicked_indices = torch.arange(batch_size).to(mask_clicked_slates.device) - clicked_indices = torch.masked_select(clicked_indices, mask_clicked_slates) - # Clicked_indices is a vector and torch.gather selects the batch dimension. 
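# Aside (illustrative, toy tensors): the index construction above and the
# gather right below keep only transitions whose slate actually received a click.
import torch  # already imported at module level; repeated here for clarity
_replay_q_demo = torch.tensor([0.5, 0.0, 1.2])        # Q of the clicked doc per sample
_clicked_demo = torch.tensor([1.0, 0.0, 1.0])         # 1.0 where any doc was clicked
_idx_demo = torch.arange(3)[_clicked_demo > 0]        # -> tensor([0, 2])
_q_clicked_demo = torch.gather(_replay_q_demo, 0, _idx_demo)  # -> tensor([0.5000, 1.2000])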
- q_clicked = torch.gather(replay_click_q, 0, clicked_indices) - target_clicked = torch.gather(target, 0, clicked_indices) - - td_error = torch.where( - clicked.bool(), - replay_click_q - target, - torch.zeros_like(train_batch[SampleBatch.REWARDS]), - ) - if policy.config["use_huber"]: - loss = huber_loss(td_error, delta=policy.config["huber_threshold"]) - else: - loss = torch.pow(td_error, 2.0) - loss = torch.mean(loss) - td_error = torch.abs(td_error) - mean_td_error = torch.mean(td_error) - - # Store values for stats function in model (tower), such that for - # multi-GPU, we do not override them during the parallel loss phase. - model.tower_stats["q_values"] = torch.mean(q_values) - model.tower_stats["q_clicked"] = torch.mean(q_clicked) - model.tower_stats["scores"] = torch.mean(scores) - model.tower_stats["score_no_click"] = torch.mean(score_no_click) - model.tower_stats["slate_q_values"] = torch.mean(slate_q_values) - model.tower_stats["replay_click_q"] = torch.mean(replay_click_q) - model.tower_stats["bellman_reward"] = torch.mean(reward) - model.tower_stats["next_q_values"] = torch.mean(next_q_values) - model.tower_stats["target"] = torch.mean(target) - model.tower_stats["next_q_target_slate"] = torch.mean(next_q_target_slate) - model.tower_stats["next_q_target_max"] = torch.mean(next_q_target_max) - model.tower_stats["target_clicked"] = torch.mean(target_clicked) - model.tower_stats["q_loss"] = loss - model.tower_stats["td_error"] = td_error - model.tower_stats["mean_td_error"] = mean_td_error - model.tower_stats["mean_actions"] = torch.mean(actions.float()) - - # selected_doc.shape: [batch_size, slate_size, embedding_size] - selected_doc = torch.gather( - # input.shape: [batch_size, num_docs, embedding_size] - torch.stack(doc_obs, 1), - 1, - # index.shape: [batch_size, slate_size, embedding_size] - actions.unsqueeze(2).expand(-1, -1, embedding_size).long(), - ) - - scores = model.choice_model(user_obs, selected_doc) - - # click_indicator.shape: [batch_size, slate_size] - # no_clicks.shape: [batch_size, 1] - no_clicks = 1 - torch.sum(click_indicator, 1, keepdim=True) - # targets.shape: [batch_size, slate_size+1] - targets = torch.cat([click_indicator, no_clicks], dim=1) - choice_loss = nn.functional.cross_entropy(scores, torch.argmax(targets, dim=1)) - # print(model.choice_model.a.item(), model.choice_model.b.item()) - - model.tower_stats["choice_loss"] = choice_loss - - return choice_loss, loss - - -def build_slateq_stats(policy: Policy, batch) -> Dict[str, TensorType]: - stats = { - "q_values": torch.mean(torch.stack(policy.get_tower_stats("q_values"))), - "q_clicked": torch.mean(torch.stack(policy.get_tower_stats("q_clicked"))), - "scores": torch.mean(torch.stack(policy.get_tower_stats("scores"))), - "score_no_click": torch.mean( - torch.stack(policy.get_tower_stats("score_no_click")) - ), - "slate_q_values": torch.mean( - torch.stack(policy.get_tower_stats("slate_q_values")) - ), - "replay_click_q": torch.mean( - torch.stack(policy.get_tower_stats("replay_click_q")) - ), - "bellman_reward": torch.mean( - torch.stack(policy.get_tower_stats("bellman_reward")) - ), - "next_q_values": torch.mean( - torch.stack(policy.get_tower_stats("next_q_values")) - ), - "target": torch.mean(torch.stack(policy.get_tower_stats("target"))), - "next_q_target_slate": torch.mean( - torch.stack(policy.get_tower_stats("next_q_target_slate")) - ), - "next_q_target_max": torch.mean( - torch.stack(policy.get_tower_stats("next_q_target_max")) - ), - "target_clicked": torch.mean( - 
torch.stack(policy.get_tower_stats("target_clicked")) - ), - "q_loss": torch.mean(torch.stack(policy.get_tower_stats("q_loss"))), - "mean_actions": torch.mean(torch.stack(policy.get_tower_stats("mean_actions"))), - "choice_loss": torch.mean(torch.stack(policy.get_tower_stats("choice_loss"))), - # "choice_beta": torch.mean(torch.stack(policy.get_tower_stats("choice_beta"))), - # "choice_score_no_click": torch.mean( - # torch.stack(policy.get_tower_stats("choice_score_no_click")) - # ), - } - # model_stats = { - # k: torch.mean(var) - # for k, var in policy.model.trainable_variables(as_dict=True).items() - # } - # stats.update(model_stats) - - return stats - - -def action_distribution_fn( - policy: Policy, - model: SlateQTorchModel, - input_dict, - *, - explore, - is_training, - **kwargs, -): - """Determine which action to take.""" - - observation = input_dict[SampleBatch.OBS] - - # user.shape: [B, E] - user_obs = observation["user"] - doc_obs = list(observation["doc"].values()) - - # Compute scores per candidate. - scores, score_no_click = score_documents(user_obs, doc_obs) - # Compute Q-values per candidate. - q_values = model.get_q_values(user_obs, doc_obs) - - per_slate_q_values = get_per_slate_q_values( - policy, score_no_click, scores, q_values - ) - if not hasattr(model, "slates"): - model.slates = policy.slates - return per_slate_q_values, TorchCategorical, [] - - -def get_per_slate_q_values(policy, score_no_click, scores, q_values): - indices = policy.slates_indices.to(scores.device) - A, S = policy.slates.shape - slate_q_values = torch.take_along_dim(scores * q_values, indices, dim=1).reshape( - [-1, A, S] - ) - slate_scores = torch.take_along_dim(scores, indices, dim=1).reshape([-1, A, S]) - slate_normalizer = torch.sum(slate_scores, dim=2) + score_no_click.unsqueeze(1) - - slate_q_values = slate_q_values / slate_normalizer.unsqueeze(2) - slate_sum_q_values = torch.sum(slate_q_values, dim=2) - return slate_sum_q_values - - -def score_documents( - user_obs, doc_obs, no_click_score=1.0, multinomial_logits=False, min_normalizer=-1.0 -): - """Computes dot-product scores for user vs doc (plus no-click) feature vectors.""" - - # Dot product between used and each document feature vector. - scores_per_candidate = torch.sum( - torch.multiply(user_obs.unsqueeze(1), torch.stack(doc_obs, dim=1)), dim=2 - ) - # Compile a constant no-click score tensor. - score_no_click = torch.full( - size=[user_obs.shape[0], 1], fill_value=no_click_score - ).to(scores_per_candidate.device) - # Concatenate click and no-click scores. - all_scores = torch.cat([scores_per_candidate, score_no_click], dim=1) - - # Logits: Softmax to yield probabilities. - if multinomial_logits: - all_scores = nn.functional.softmax(all_scores) - # Multinomial proportional model: Shift to `[0.0,..[`. - else: - all_scores = all_scores - min_normalizer - - # Return click (per candidate document) and no-click scores. - return all_scores[:, :-1], all_scores[:, -1] - - -def setup_early(policy, obs_space, action_space, config): - """Obtain all possible slates given current docs in the candidate set.""" - - num_candidates = action_space.nvec[0] - slate_size = len(action_space.nvec) - - mesh_args = [torch.Tensor(list(range(num_candidates)))] * slate_size - slates = torch.stack(torch.meshgrid(*mesh_args), dim=-1) - slates = torch.reshape(slates, shape=(-1, slate_size)) - # Filter slates that include duplicates to ensure each document is picked - # at most once. 
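# Aside (illustrative): the meshgrid construction above plus the duplicate
# filter below is equivalent to enumerating all ordered slates without
# repetition; e.g. num_candidates=3, slate_size=2 gives A = 3 * 2 = 6 slates.
import itertools
_demo_slates = list(itertools.permutations(range(3), 2))
# -> [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]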
- unique_mask = [] - for i in range(slates.shape[0]): - x = slates[i] - unique_mask.append(len(x) == len(torch.unique(x))) - unique_mask = torch.Tensor(unique_mask).bool().unsqueeze(1) - # slates.shape: [A, S] - slates = torch.masked_select(slates, mask=unique_mask).reshape([-1, slate_size]) - - # Store all possible slates only once in policy object. - policy.slates = slates.long() - # [1, AxS] Useful for torch.take_along_dim() - policy.slates_indices = policy.slates.reshape(-1).unsqueeze(0) - - -def optimizer_fn( - policy: Policy, config: AlgorithmConfigDict -) -> Tuple["torch.optim.Optimizer"]: - optimizer_choice = torch.optim.Adam( - policy.model.choice_model.parameters(), lr=config["lr_choice_model"] - ) - optimizer_q_value = torch.optim.RMSprop( - policy.model.q_model.parameters(), - lr=config["lr"], - eps=config["rmsprop_epsilon"], - momentum=0.0, - weight_decay=0.95, - centered=True, - ) - return optimizer_choice, optimizer_q_value - - -def postprocess_fn_add_next_actions_for_sarsa( - policy: Policy, batch: SampleBatch, other_agent=None, episode=None -) -> SampleBatch: - """Add next_actions to SampleBatch for SARSA training""" - if policy.config["slateq_strategy"] == "SARSA": - if not batch.is_terminated_or_truncated() and policy._no_tracing is False: - raise RuntimeError( - "Expected a complete episode in each sample batch. " - f"But this batch is not: {batch}." - ) - batch["next_actions"] = np.roll(batch["actions"], -1, axis=0) - - return batch - - -def setup_late_mixins( - policy: Policy, - obs_space: gym.spaces.Space, - action_space: gym.spaces.Space, - config: AlgorithmConfigDict, -) -> None: - """Call all mixin classes' constructors before SlateQTorchPolicy initialization. - - Args: - policy: The Policy object. - obs_space: The Policy's observation space. - action_space: The Policy's action space. - config: The Policy's config. - """ - TargetNetworkMixin.__init__(policy) - - -SlateQTorchPolicy = build_policy_class( - name="SlateQTorchPolicy", - framework="torch", - get_default_config=lambda: rllib_slate_q.slate_q.slateq.SlateQConfig(), # noqa - before_init=setup_early, - after_init=setup_late_mixins, - loss_fn=build_slateq_losses, - stats_fn=build_slateq_stats, - # Build model, loss functions, and optimizers - make_model_and_action_dist=build_slateq_model_and_distribution, - optimizer_fn=optimizer_fn, - # Define how to act. - action_distribution_fn=action_distribution_fn, - # Post processing sampled trajectory data. 
- # postprocess_fn=postprocess_fn_add_next_actions_for_sarsa, - extra_grad_process_fn=apply_grad_clipping, - extra_learn_fetches_fn=concat_multi_gpu_td_errors, - mixins=[TargetNetworkMixin], -) diff --git a/rllib_contrib/slate_q/tests/test_slate_q.py b/rllib_contrib/slate_q/tests/test_slate_q.py deleted file mode 100644 index d0df979f7b3d1..0000000000000 --- a/rllib_contrib/slate_q/tests/test_slate_q.py +++ /dev/null @@ -1,50 +0,0 @@ -import unittest - -import rllib_slate_q.slate_q.slateq as slateq -from rllib_slate_q.env.recommender_system_envs_with_recsim import ( - InterestEvolutionRecSimEnv, -) - -import ray -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_train_results, - framework_iterator, -) - - -class TestSlateQ(unittest.TestCase): - """Sanity tests for Slateq algorithm.""" - - def setUp(self): - ray.init() - - def tearDown(self): - ray.shutdown() - - def test_slateq_compilation(self): - """Test whether SlateQ can be built with both frameworks.""" - config = ( - slateq.SlateQConfig() - .environment(env=InterestEvolutionRecSimEnv) - .training(num_steps_sampled_before_learning_starts=1000) - ) - - num_iterations = 1 - - for _ in framework_iterator(config, with_eager_tracing=True): - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/slate_q/tuned_examples/__init__.py b/rllib_contrib/slate_q/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/slate_q/tuned_examples/interest-evolution-10-candidates-recsim-env-slate-q-fake-gpus.yaml b/rllib_contrib/slate_q/tuned_examples/interest-evolution-10-candidates-recsim-env-slate-q-fake-gpus.yaml deleted file mode 100644 index 19e1c206ccf3d..0000000000000 --- a/rllib_contrib/slate_q/tuned_examples/interest-evolution-10-candidates-recsim-env-slate-q-fake-gpus.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -interest-evolution-recsim-env-slateq: - env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv - run: SlateQ - stop: - sampler_results/episode_reward_mean: 160.0 - timesteps_total: 100000 - config: - framework: torch - - # RLlib/RecSim wrapper specific settings: - env_config: - # Env class specified above takes one `config` arg in its c'tor: - config: - # Each step, sample `num_candidates` documents using the env-internal - # document sampler model (a logic that creates n documents to select - # the slate from). - resample_documents: true - num_candidates: 10 - # How many documents to recommend (out of `num_candidates`) each - # timestep? - slate_size: 2 - # Should the action space be purely Discrete? Useful for algos that - # don't support MultiDiscrete (e.g. DQN or Bandits). - # SlateQ handles MultiDiscrete action spaces. - convert_to_discrete_action_space: false - seed: 0 - - # Fake 2 GPUs. - num_gpus: 2 - _fake_gpus: true - - exploration_config: - warmup_timesteps: 10000 - epsilon_timesteps: 25000 - - replay_buffer_config: - capacity: 100000 - num_steps_sampled_before_learning_starts: 10000 - - # Double learning rate and batch size. 
- lr: 0.002 - train_batch_size: 64 - - target_network_update_freq: 3200 - - metrics_num_episodes_for_smoothing: 200 diff --git a/rllib_contrib/slate_q/tuned_examples/interest-evolution-10-candidates-recsim-env-slate-q.yaml b/rllib_contrib/slate_q/tuned_examples/interest-evolution-10-candidates-recsim-env-slate-q.yaml deleted file mode 100644 index eb83f99fd4c9f..0000000000000 --- a/rllib_contrib/slate_q/tuned_examples/interest-evolution-10-candidates-recsim-env-slate-q.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -interest-evolution-recsim-env-slateq: - env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv - run: SlateQ - stop: - sampler_results/episode_reward_mean: 160.0 - timesteps_total: 120000 - config: - framework: torch - - # RLlib/RecSim wrapper specific settings: - env_config: - # Env class specified above takes one `config` arg in its c'tor: - config: - # Each step, sample `num_candidates` documents using the env-internal - # document sampler model (a logic that creates n documents to select - # the slate from). - resample_documents: true - num_candidates: 10 - # How many documents to recommend (out of `num_candidates`) each - # timestep? - slate_size: 2 - # Should the action space be purely Discrete? Useful for algos that - # don't support MultiDiscrete (e.g. DQN or Bandits). - # SlateQ handles MultiDiscrete action spaces. - convert_to_discrete_action_space: false - seed: 0 - - exploration_config: - warmup_timesteps: 10000 - epsilon_timesteps: 25000 - - replay_buffer_config: - capacity: 100000 - num_steps_sampled_before_learning_starts: 10000 - - lr: 0.001 - - target_network_update_freq: 3200 - - metrics_num_episodes_for_smoothing: 200 diff --git a/rllib_contrib/slate_q/tuned_examples/interest-evolution-50-candidates-recsim-env-slate-q.yaml b/rllib_contrib/slate_q/tuned_examples/interest-evolution-50-candidates-recsim-env-slate-q.yaml deleted file mode 100644 index e88982d9350e9..0000000000000 --- a/rllib_contrib/slate_q/tuned_examples/interest-evolution-50-candidates-recsim-env-slate-q.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -interest-evolution-recsim-env-slateq: - env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv - run: SlateQ - stop: - sampler_results/episode_reward_mean: 162.0 - timesteps_total: 300000 - config: - framework: tf2 - - # RLlib/RecSim wrapper specific settings: - env_config: - # Env class specified above takes one `config` arg in its c'tor: - config: - # Each step, sample `num_candidates` documents using the env-internal - # document sampler model (a logic that creates n documents to select - # the slate from). - resample_documents: true - num_candidates: 50 - # How many documents to recommend (out of `num_candidates`) each - # timestep? - slate_size: 2 - # Should the action space be purely Discrete? Useful for algos that - # don't support MultiDiscrete (e.g. DQN or Bandits). - # SlateQ handles MultiDiscrete action spaces. 
- convert_to_discrete_action_space: false - seed: 0 - - exploration_config: - warmup_timesteps: 20000 - epsilon_timesteps: 70000 - - replay_buffer_config: - capacity: 500000 - - lr: 0.00025 - - metrics_num_episodes_for_smoothing: 200 diff --git a/rllib_contrib/slate_q/tuned_examples/long-term-satisfaction-recsim-env-slate-q.yaml b/rllib_contrib/slate_q/tuned_examples/long-term-satisfaction-recsim-env-slate-q.yaml deleted file mode 100644 index 5ca3ee148a1ef..0000000000000 --- a/rllib_contrib/slate_q/tuned_examples/long-term-satisfaction-recsim-env-slate-q.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -long-term-satisfaction-recsim-env-slateq: - env: ray.rllib.examples.env.recommender_system_envs_with_recsim.LongTermSatisfactionRecSimEnv - run: SlateQ - stop: - # Random baseline rewards: - # num_candidates=20; slate_size=2; resample=true: ~951 - # num_candidates=50; slate_size=3; resample=true: ~946 - evaluation/sampler_results/episode_reward_mean: 1000.0 - timesteps_total: 200000 - config: - # Works for both tf and torch. - framework: torch - - metrics_num_episodes_for_smoothing: 200 - - # RLlib/RecSim wrapper specific settings: - env_config: - config: - # Each step, sample `num_candidates` documents using the env-internal - # document sampler model (a logic that creates n documents to select - # the slate from). - resample_documents: true - num_candidates: 50 - # How many documents to recommend (out of `num_candidates`) each - # timestep? - slate_size: 2 - # Should the action space be purely Discrete? Useful for algos that - # don't support MultiDiscrete (e.g. DQN or Bandits). - # SlateQ handles MultiDiscrete action spaces. - convert_to_discrete_action_space: false - seed: 42 - - exploration_config: - warmup_timesteps: 10000 - epsilon_timesteps: 60000 - - target_network_update_freq: 3200 diff --git a/rllib_contrib/slate_q/tuned_examples/parametric-item-reco-env-slate-q.yaml b/rllib_contrib/slate_q/tuned_examples/parametric-item-reco-env-slate-q.yaml deleted file mode 100644 index 4787d22624115..0000000000000 --- a/rllib_contrib/slate_q/tuned_examples/parametric-item-reco-env-slate-q.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -parametric-item-reco-env-slateq: - env: ray.rllib.examples.env.bandit_envs_recommender_system.ParametricItemRecoEnv - run: SlateQ - stop: - #evaluation/sampler_results/episode_reward_mean: 48.0 - timesteps_total: 200000 - config: - # SlateQ only supported for torch so far. - framework: torch - - metrics_num_episodes_for_smoothing: 200 - - exploration_config: - temperature: 0.7 - - # Env c'tor kwargs: - env_config: - config: - slate_q: true - num_users: 50 - num_items: 1000 - num_candidates: 50 - slate_size: 1 - feature_dim: 16 - - grad_clip: 10.0 - #double_q: false - #slateq_strategy: MYOP - - # Larger networks seem to help (large obs/action spaces). - hiddens: [512, 512] - - # Larger batch sizes seem to help (more stability, even with higher lr). - train_batch_size: 64 - - num_workers: 0 - num_gpus: 0 - - lr_choice_model: 0.01 - lr_q_model: 0.01 - - target_network_update_freq: 500 - tau: 1.0 - - # Evaluation settings. 
- evaluation_interval: 1 - evaluation_num_workers: 4 - evaluation_duration: 200 - evaluation_duration_unit: episodes - evaluation_parallel_to_training: true diff --git a/rllib_contrib/slate_q/tuned_examples/recomm-sys001-slate-q.yaml b/rllib_contrib/slate_q/tuned_examples/recomm-sys001-slate-q.yaml deleted file mode 100644 index e2f795e161e97..0000000000000 --- a/rllib_contrib/slate_q/tuned_examples/recomm-sys001-slate-q.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -recomm-sys001-slateq: - env: ray.rllib.examples.env.recommender_system_envs.RecommSys001 - run: SlateQ - stop: - #evaluation/sampler_results/episode_reward_mean: 48.0 - timesteps_total: 200000 - config: - # SlateQ only supported for torch so far. - framework: torch - - metrics_num_episodes_for_smoothing: 1000 - - # Env c'tor kwargs: - env_config: - # Number of different categories a doc can have and a user can - # have a preference for. - num_categories: 5 - # Number of docs to choose (a slate) from each timestep. - num_docs_to_select_from: 50 - # Slate size. - slate_size: 2 - # Re-sample docs each timesteps. - num_docs_in_db: 1000 - # Re-sample user each episode. - num_users_in_db: 1000 - # User time budget (determines lengths of episodes). - user_time_budget: 60.0 - - grad_clip: 2.0 - - # Larger networks seem to help (large obs/action spaces). - hiddens: [512, 512] - - # Larger batch sizes seem to help (more stability, even with higher lr). - train_batch_size: 32 - - num_workers: 0 - num_gpus: 0 - - lr_choice_model: 0.002 - lr_q_model: 0.002 - - target_network_update_freq: 500 - tau: 1.0 - - # Evaluation settings. - evaluation_interval: 1 - evaluation_num_workers: 4 - evaluation_duration: 200 - evaluation_duration_unit: episodes - evaluation_parallel_to_training: true diff --git a/rllib_contrib/td3/BUILD b/rllib_contrib/td3/BUILD deleted file mode 100644 index 6eff50bd33170..0000000000000 --- a/rllib_contrib/td3/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -# Examples - -py_test( - name = "example_td3_pendulum_v1", - main = "td3_pendulum_v1.py", - tags = ["team:rllib", "example"], - size = "large", - srcs = ["examples/td3_pendulum_v1.py"], - args = ["--run-as-test"] -) - -# Learning Tests - -# py_test( -# name = "learning_tests_pendulum_td3", -# main = "run_regression_tests.py", -# tags = ["team:rllib", "learning_tests", "rllib_contrib"], -# size = "large", -# srcs = ["run_regression_tests.py"], -# data = ["tuned_examples/pendulum-td3.yaml"], -# args = ["--dir=td3/tuned_examples/"] -# ) - -# Compilation Tests - -py_test( - name = "test_td3", - tags = ["team:rllib", "algorithms_dir"], - size = "large", - srcs = ["tests/test_td3.py"] -) diff --git a/rllib_contrib/td3/README.md b/rllib_contrib/td3/README.md deleted file mode 100644 index cb96b448b1f38..0000000000000 --- a/rllib_contrib/td3/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# TD3 (Twin Delayed DDPG) - -[TD3](https://arxiv.org/pdf/1802.09477) While DDPG can achieve great performance sometimes, it is frequently brittle with respect to hyperparameters and other kinds of tuning. A common failure mode for DDPG is that the learned Q-function begins to dramatically overestimate Q-values, which then leads to the policy breaking, because it exploits the errors in the Q-function. Twin Delayed DDPG (TD3) is an algorithm that addresses this issue by introducing three critical tricks: - -Trick One: Clipped Double-Q Learning. 
TD3 learns two Q-functions instead of one (hence “twin”), and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions. - -Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently than the Q-function. The paper recommends one policy update for every two Q-function updates. - -Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for the policy to exploit Q-function errors by smoothing out Q along changes in action. - -Together, these three tricks result in substantially improved performance over baseline DDPG. - - -## Installation - -``` -conda create -n rllib-td3 python=3.10 -conda activate rllib-td3 -pip install -r requirements.txt -pip install -e '.[development]' -``` - -## Usage - -[TD3 Example]() \ No newline at end of file diff --git a/rllib_contrib/td3/examples/td3_pendulum_v1.py b/rllib_contrib/td3/examples/td3_pendulum_v1.py deleted file mode 100644 index 799a5ba79053c..0000000000000 --- a/rllib_contrib/td3/examples/td3_pendulum_v1.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse - -from rllib_td3.td3 import TD3, TD3Config - -import ray -from ray import air, tune -from ray.rllib.utils.test_utils import check_learning_achieved - - -def get_cli_args(): - """Create CLI parser and return parsed arguments""" - parser = argparse.ArgumentParser() - parser.add_argument("--run-as-test", action="store_true", default=False) - args = parser.parse_args() - print(f"Running with following CLI args: {args}") - return args - - -if __name__ == "__main__": - args = get_cli_args() - - ray.init() - - config = ( - TD3Config() - .framework("torch") - .environment("Pendulum-v1") - .training( - actor_hiddens=[64, 64], - critic_hiddens=[64, 64], - replay_buffer_config={"type": "MultiAgentReplayBuffer"}, - num_steps_sampled_before_learning_starts=5000, - ) - .exploration(exploration_config={"random_timesteps": 5000}) - ) - - stop_reward = -900 - - tuner = tune.Tuner( - TD3, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={ - "sampler_results/episode_reward_mean": stop_reward, - "timesteps_total": 100000, - }, - failure_config=air.FailureConfig(fail_fast="raise"), - ), - ) - results = tuner.fit() - - if args.run_as_test: - check_learning_achieved(results, stop_reward) diff --git a/rllib_contrib/td3/pyproject.toml b/rllib_contrib/td3/pyproject.toml deleted file mode 100644 index 71fee404936a5..0000000000000 --- a/rllib_contrib/td3/pyproject.toml +++ /dev/null @@ -1,18 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] - -[project] -name = "rllib-td3" -authors = [{name = "Anyscale Inc."}] -version = "0.1.0" -description = "" -readme = "README.md" -requires-python = ">=3.7, <3.11" -dependencies = ["gymnasium==0.26.3", "ray[rllib]==2.5.1"] - -[project.optional-dependencies] -development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "torch==1.12.0", "numpy<2"] diff --git a/rllib_contrib/td3/requirements.txt b/rllib_contrib/td3/requirements.txt deleted file mode 100644 index f16db019bb51d..0000000000000 --- a/rllib_contrib/td3/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.11.1 -tensorflow-probability==0.19.0 -torch==1.12.0 -numpy<2 diff --git a/rllib_contrib/td3/src/rllib_td3/td3/__init__.py b/rllib_contrib/td3/src/rllib_td3/td3/__init__.py deleted file mode 100644 index 6c5b2f275c3e9..0000000000000 --- 
a/rllib_contrib/td3/src/rllib_td3/td3/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from rllib_td3.td3.td3 import TD3, TD3Config - -from ray.tune.registry import register_trainable - -__all__ = ["TD3Config", "TD3"] - -register_trainable("rllib-contrib-td3", TD3) diff --git a/rllib_contrib/td3/src/rllib_td3/td3/td3.py b/rllib_contrib/td3/src/rllib_td3/td3/td3.py deleted file mode 100644 index 9b54c51fe7f68..0000000000000 --- a/rllib_contrib/td3/src/rllib_td3/td3/td3.py +++ /dev/null @@ -1,108 +0,0 @@ -"""A more stable successor to TD3. - -By default, this uses a near-identical configuration to that reported in the -TD3 paper. -""" -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.algorithms.ddpg.ddpg import DDPG, DDPGConfig -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import DEPRECATED_VALUE - - -class TD3Config(DDPGConfig): - """Defines a configuration class from which a TD3 Algorithm can be built. - - Example: - >>> from ray.rllib.algorithms.td3 import TD3Config - >>> config = TD3Config().training(lr=0.01).resources(num_gpus=1) - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run one training iteration. - >>> algo = config.build(env="Pendulum-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.td3 import TD3Config - >>> from ray import air - >>> from ray import tune - >>> config = TD3Config() - >>> # Print out some default values. - >>> print(config.lr) # doctest: +SKIP - >>> # Update the config object. - >>> config = config.training(lr=tune.grid_search( # doctest: +SKIP - ... [0.001, 0.0001])) # doctest: +SKIP - >>> # Set the config object's env. - >>> config.environment(env="Pendulum-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "TD3", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... ).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a TD3Config instance.""" - super().__init__(algo_class=algo_class or TD3) - - # fmt: off - # __sphinx_doc_begin__ - - # Override some of DDPG/SimpleQ/Algorithm's default values with TD3-specific - # values. - - # .training() - - # largest changes: twin Q functions, delayed policy updates, target - # smoothing, no l2-regularization. - self.twin_q = True - self.policy_delay = 2 - self.smooth_target_policy = True, - self.l2_reg = 0.0 - # Different tau (affecting target network update). - self.tau = 5e-3 - # Different batch size. - self.train_batch_size = 100 - # No prioritized replay by default (we may want to change this at some - # point). - self.replay_buffer_config = { - "type": "MultiAgentReplayBuffer", - # Specify prioritized replay by supplying a buffer type that supports - # prioritization, for example: MultiAgentPrioritizedReplayBuffer. - "prioritized_replay": DEPRECATED_VALUE, - "capacity": 1000000, - "worker_side_prioritization": False, - } - # Number of timesteps to collect from rollout workers before we start - # sampling from replay buffers for learning. Whether we count this in agent - # steps or environment steps depends on config.multi_agent(count_steps_by=..). - self.num_steps_sampled_before_learning_starts = 10000 - - # .exploration() - # TD3 uses Gaussian Noise by default. 
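# Aside (illustrative sketch of what the settings below mean; this is not
# RLlib's actual GaussianNoise implementation): uniform-random actions for the
# first `random_timesteps` steps, then deterministic actions plus scaled
# Gaussian noise, clipped to the action bounds.
import numpy as np

def _gaussian_noise_demo(det_action, t, low, high,
                         random_timesteps=10000, stddev=0.1, scale=1.0):
    if t < random_timesteps:
        return float(np.random.uniform(low, high))
    return float(np.clip(det_action + scale * stddev * np.random.randn(), low, high))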
- self.exploration_config = { - # TD3 uses simple Gaussian noise on top of deterministic NN-output - # actions (after a possible pure random phase of n timesteps). - "type": "GaussianNoise", - # For how many timesteps should we return completely random - # actions, before we start adding (scaled) noise? - "random_timesteps": 10000, - # Gaussian stddev of action noise for exploration. - "stddev": 0.1, - # Scaling settings by which the Gaussian noise is scaled before - # being added to the actions. NOTE: The scale timesteps start only - # after(!) any random steps have been finished. - # By default, do not anneal over time (fixed 1.0). - "initial_scale": 1.0, - "final_scale": 1.0, - "scale_timesteps": 1, - } - # __sphinx_doc_end__ - # fmt: on - - -class TD3(DDPG): - @classmethod - @override(DDPG) - def get_default_config(cls) -> AlgorithmConfig: - return TD3Config() diff --git a/rllib_contrib/td3/tests/test_td3.py b/rllib_contrib/td3/tests/test_td3.py deleted file mode 100644 index e24d4a627e3e5..0000000000000 --- a/rllib_contrib/td3/tests/test_td3.py +++ /dev/null @@ -1,110 +0,0 @@ -import unittest - -import numpy as np -import rllib_td3.td3.td3 as td3 - -import ray -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.test_utils import ( - check, - check_compute_single_action, - check_train_results, - framework_iterator, -) - -tf1, tf, tfv = try_import_tf() - - -class TestTD3(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_td3_compilation(self): - """Test whether TD3 can be built with both frameworks.""" - config = td3.TD3Config() - - # Test against all frameworks. - for _ in framework_iterator(config, with_eager_tracing=True): - algo = config.build(env="Pendulum-v1") - num_iterations = 1 - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - algo.stop() - - def test_td3_exploration_and_with_random_prerun(self): - """Tests TD3's Exploration (w/ random actions for n timesteps).""" - config = td3.TD3Config().environment(env="Pendulum-v1") - no_random_init = config.exploration_config.copy() - random_init = { - # Act randomly at beginning ... - "random_timesteps": 30, - # Then act very closely to deterministic actions thereafter. - "stddev": 0.001, - "initial_scale": 0.001, - "final_scale": 0.001, - } - obs = np.array([0.0, 0.1, -0.1]) - - # Test against all frameworks. - for _ in framework_iterator(config, with_eager_tracing=True): - config.exploration(exploration_config=no_random_init) - # Default GaussianNoise setup. - algo = config.build() - # Setting explore=False should always return the same action. - a_ = algo.compute_single_action(obs, explore=False) - check(convert_to_numpy(algo.get_policy().global_timestep), 1) - for i in range(50): - a = algo.compute_single_action(obs, explore=False) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 2) - check(a, a_) - # explore=None (default: explore) should return different actions. - actions = [] - for i in range(50): - actions.append(algo.compute_single_action(obs)) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 52) - check(np.std(actions), 0.0, false=True) - algo.stop() - - # Check randomness at beginning. - config.exploration(exploration_config=random_init) - algo = config.build() - # ts=0 (get a deterministic action as per explore=False). 
- deterministic_action = algo.compute_single_action(obs, explore=False) - check(convert_to_numpy(algo.get_policy().global_timestep), 1) - # ts=1-29 (in random window). - random_a = [] - for i in range(1, 30): - random_a.append(algo.compute_single_action(obs, explore=True)) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 1) - check(random_a[-1], deterministic_action, false=True) - self.assertTrue(np.std(random_a) > 0.3) - - # ts > 30 (a=deterministic_action + scale * N[0,1]) - for i in range(50): - a = algo.compute_single_action(obs, explore=True) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 31) - check(a, deterministic_action, rtol=0.1) - - # ts >> 30 (BUT: explore=False -> expect deterministic action). - for i in range(50): - a = algo.compute_single_action(obs, explore=False) - check(convert_to_numpy(algo.get_policy().global_timestep), i + 81) - check(a, deterministic_action) - algo.stop() - - -if __name__ == "__main__": - import sys - - import pytest - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/td3/tuned_examples/__init__.py b/rllib_contrib/td3/tuned_examples/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib_contrib/td3/tuned_examples/invertedpendulum-td3.yaml b/rllib_contrib/td3/tuned_examples/invertedpendulum-td3.yaml deleted file mode 100644 index 3f0de9249f453..0000000000000 --- a/rllib_contrib/td3/tuned_examples/invertedpendulum-td3.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -invertedpendulum-td3: - # This is a TD3 with stopping conditions and network size tuned specifically - # for InvertedPendulum. Should be able to reach 1,000 reward (the maximum - # achievable) in 10,000 to 20,000 steps. - env: InvertedPendulum-v2 - run: TD3 - stop: - sampler_results/episode_reward_mean: 9999.9 - time_total_s: 900 # 15 minutes - timesteps_total: 1000000 - config: - # Works for both torch and tf. - framework: torch - # === Model === - actor_hiddens: [32, 32] - critic_hiddens: [32, 32] - - # === Exploration === - num_steps_sampled_before_learning_starts: 1000 - exploration_config: - random_timesteps: 1000 - - # === Evaluation === - evaluation_interval: 10 - evaluation_duration: 5 diff --git a/rllib_contrib/td3/tuned_examples/mujoco-td3.yaml b/rllib_contrib/td3/tuned_examples/mujoco-td3.yaml deleted file mode 100644 index 70947efd0b9bb..0000000000000 --- a/rllib_contrib/td3/tuned_examples/mujoco-td3.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -mujoco-td3: - # Solve latest versions of the four hardest Mujoco tasks benchmarked in the - # original TD3 paper. Average return over 10 trials at end of 1,000,000 - # timesteps (taken from Table 2 of the paper) are given in parens at the end - # of reach environment name. - # - # Paper is at https://arxiv.org/pdf/1802.09477.pdf - env: - grid_search: - - HalfCheetah-v2 # (9,532.99) - - Hopper-v2 # (3,304.75) - - Walker2d-v2 # (4,565.24) - - Ant-v2 # (4,185.06) - run: TD3 - stop: - timesteps_total: 1000000 - config: - # Works for both torch and tf. 
- framework: torch - # === Exploration === - exploration_config: - random_timesteps: 10000 - replay_buffer_config: - type: MultiAgentReplayBuffer - num_steps_sampled_before_learning_starts: 10000 - # === Evaluation === - evaluation_interval: 10 - evaluation_duration: 10 diff --git a/rllib_contrib/td3/tuned_examples/pendulum-td3-fake-gpus.yaml b/rllib_contrib/td3/tuned_examples/pendulum-td3-fake-gpus.yaml deleted file mode 100644 index d8391f1b1234b..0000000000000 --- a/rllib_contrib/td3/tuned_examples/pendulum-td3-fake-gpus.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -pendulum-td3-fake-gpus: - env: Pendulum-v1 - run: TD3 - stop: - sampler_results/episode_reward_mean: -900 - timesteps_total: 100000 - config: - # Works for both torch and tf. - framework: torch - actor_hiddens: [64, 64] - critic_hiddens: [64, 64] - - replay_buffer_config: - type: MultiAgentReplayBuffer - num_steps_sampled_before_learning_starts: 5000 - exploration_config: - random_timesteps: 5000 - evaluation_interval: 10 - evaluation_duration: 5 - - # Fake 2 GPUs. - num_gpus: 2 - _fake_gpus: true diff --git a/rllib_contrib/td3/tuned_examples/pendulum-td3.yaml b/rllib_contrib/td3/tuned_examples/pendulum-td3.yaml deleted file mode 100644 index 413f8ad307c5f..0000000000000 --- a/rllib_contrib/td3/tuned_examples/pendulum-td3.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Run this experiment by doing: -# $ rllib train file [this very file] - -# This configuration can expect to reach -160 reward in 10k-20k timesteps -pendulum-td3: - env: Pendulum-v1 - run: TD3 - stop: - sampler_results/episode_reward_mean: -900 - timesteps_total: 100000 - config: - # Works for both torch and tf. - framework: torch - # === Model === - actor_hiddens: [64, 64] - critic_hiddens: [64, 64] - # === Exploration === - replay_buffer_config: - type: MultiAgentReplayBuffer - num_steps_sampled_before_learning_starts: 5000 - exploration_config: - random_timesteps: 5000
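The tuned example above maps directly onto the config API used in
`examples/td3_pendulum_v1.py` earlier in this patch. A minimal sketch of the
equivalent Python setup (assuming the removed `rllib_td3` package is installed):

```python
# Minimal sketch mirroring tuned_examples/pendulum-td3.yaml via the config API.
from rllib_td3.td3 import TD3Config

config = (
    TD3Config()
    .framework("torch")
    .environment("Pendulum-v1")
    .training(
        actor_hiddens=[64, 64],
        critic_hiddens=[64, 64],
        replay_buffer_config={"type": "MultiAgentReplayBuffer"},
        num_steps_sampled_before_learning_starts=5000,
    )
    .exploration(exploration_config={"random_timesteps": 5000})
)
algo = config.build()
print(algo.train())
```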