[RLlib] Discontinue support for "hybrid" API stack (using RLModule + Learner, but still on RolloutWorker and Policy) (ray-project#46085)

Signed-off-by: ujjawal-khare <ujjawal.khare@dream11.com>
sven1977 authored and ujjawal-khare committed Oct 15, 2024
1 parent e409873 commit f33434b
Showing 42 changed files with 451 additions and 237 deletions.
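
For context, this commit removes the mixed setting in which `enable_rl_module_and_learner=True` was combined with `enable_env_runner_and_connector_v2=False`. The snippet below is an illustrative sketch (not part of the diff); `PPOConfig` is used purely as an example algorithm, and only the two remaining, homogeneous combinations are shown:

    from ray.rllib.algorithms.ppo import PPOConfig

    # New API stack: RLModule + Learner together with EnvRunner + ConnectorV2.
    new_stack = PPOConfig().api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )

    # Old API stack: Policy + RolloutWorker.
    old_stack = PPOConfig().api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )

    # The previous "hybrid" combination (True / False) is no longer supported.
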
2 changes: 2 additions & 0 deletions doc/source/rllib/doc_code/training.py
Expand Up @@ -38,6 +38,7 @@
.api_stack(
enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False
)
.framework("torch")
.environment("CartPole-v1")
.env_runners(num_env_runners=0)
.training(
Expand Down Expand Up @@ -112,6 +113,7 @@
.api_stack(
enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False
)
.framework("torch")
.environment("CartPole-v1")
.training(
replay_buffer_config={
30 changes: 19 additions & 11 deletions doc/source/rllib/new-api-stack-migration-guide.rst
Expand Up @@ -213,10 +213,16 @@ This method isn't used on the old API stack because the old stack doesn't use Le

It allows you to specify the following (see the sketch after this list):

1) the number of `Learner` workers through `.learners(num_learners=...)`.
1) the resources per learner; use `.learners(num_gpus_per_learner=1)` for GPU training and `.learners(num_gpus_per_learner=0)` for CPU training.
1) the custom Learner class you want to use (`example on how to do this here <https://github.com/ray-project/ray/blob/master/rllib/examples/learners/custom_loss_fn_simple.py>`__)
1) a config dict you would like to set for your custom learner: `.learners(learner_config_dict={...})`. Note that every `Learner` has access to the entire `AlgorithmConfig` object through `self.config`, but setting the `learner_config_dict` is a convenient way to avoid having to create an entirely new `AlgorithmConfig` subclass only to support a few extra settings for your custom `Learner` class.
#. the number of `Learner` workers through `.learners(num_learners=...)`.
#. the resources per learner; use `.learners(num_gpus_per_learner=1)` for GPU training
and `.learners(num_gpus_per_learner=0)` for CPU training.
#. the custom Learner class you want to use (`example on how to do this here <https://github.com/ray-project/ray/blob/master/rllib/examples/learners/custom_loss_fn_simple.py>`__)
#. a config dict you would like to set for your custom learner:
`.learners(learner_config_dict={...})`. Note that every `Learner` has access to the
entire `AlgorithmConfig` object through `self.config`, but setting the
`learner_config_dict` is a convenient way to avoid having to create an entirely new
`AlgorithmConfig` subclass only to support a few extra settings for your custom
`Learner` class.
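
Taken together, a minimal sketch of these `.learners()` settings could look as follows (the values and the `learner_config_dict` key are placeholders; `PPOConfig` serves only as an example):

    from ray.rllib.algorithms.ppo import PPOConfig

    config = (
        PPOConfig()
        .environment("CartPole-v1")
        .learners(
            num_learners=2,           # two remote Learner workers
            num_gpus_per_learner=1,   # GPU training; use 0 for CPU-only training
            # Extra settings a custom Learner can read back via
            # `self.config.learner_config_dict`.
            learner_config_dict={"my_custom_setting": 42},
        )
    )
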


AlgorithmConfig.env_runners()
Expand Down Expand Up @@ -380,9 +386,11 @@ and `how to write a custom LSTM-containing RL Module <https://github.com/ray-pro
There are various options for translating an existing, custom :py:class:`~ray.rllib.models.modelv2.ModelV2` from the old API stack
to the new API stack's :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` (see the skeleton sketch after this list):

1) Move your ModelV2 code to a new, custom `RLModule` class. See :ref:`RL Modules <rlmodule-guide>` for details).
1) Use an Algorithm checkpoint or a Policy checkpoint that you have from an old API stack training run and use this checkpoint with the `new stack RL Module convenience wrapper <https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py>`__.
1) Use an existing :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` object from an old API stack training run, with the `new stack RL Module convenience wrapper <https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py>`__.
#. Move your ModelV2 code to a new, custom `RLModule` class (see :ref:`RL Modules <rlmodule-guide>` for details).
#. Use an Algorithm checkpoint or a Policy checkpoint from an old API stack training
run together with the `new stack RL Module convenience wrapper <https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py>`__.
#. Use an existing :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig`
object from an old API stack training run, with the `new stack RL Module convenience wrapper <https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py>`__.
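
For option 1, a hypothetical skeleton of such a port is sketched below; the base-class import path, the `setup()` hook, and the layer sizes are assumptions that may need adjusting to your RLlib version and to the ModelV2 being migrated:

    import torch
    from ray.rllib.core.columns import Columns
    from ray.rllib.core.rl_module.torch import TorchRLModule


    class MyMigratedRLModule(TorchRLModule):
        def setup(self):
            # Re-create here the layers previously built in ModelV2.__init__().
            # 4 (obs dim) and 2 (action dim) are placeholder sizes.
            self._policy_net = torch.nn.Sequential(
                torch.nn.Linear(4, 128),
                torch.nn.ReLU(),
                torch.nn.Linear(128, 2),
            )

        def _forward_inference(self, batch, **kwargs):
            # Rough equivalent of ModelV2.forward() at inference time.
            return {Columns.ACTION_DIST_INPUTS: self._policy_net(batch[Columns.OBS])}

        def _forward_exploration(self, batch, **kwargs):
            return self._forward_inference(batch)

        def _forward_train(self, batch, **kwargs):
            # Depending on the algorithm, additional outputs (e.g. value
            # predictions) may be required here.
            return self._forward_inference(batch)
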


Custom loss functions and policies
Expand Down Expand Up @@ -423,7 +431,7 @@ The :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` documentation is
The following are some examples of how to write ConnectorV2 pieces for the
different pipelines (see the config sketch after this list):

1) `Observation frame-stacking <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/frame_stacking.py>`__.
1) `Add the most recent action and reward to the RL Module's input <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/prev_actions_prev_rewards.py>`__.
1) `Mean-std filtering on all observations <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/mean_std_filtering.py>`__.
1) `Flatten any complex observation space to a 1D space <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/flatten_observations_dict_space.py>`__.
#. `Observation frame-stacking <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/frame_stacking.py>`__.
#. `Add the most recent action and reward to the RL Module's input <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/prev_actions_prev_rewards.py>`__.
#. `Mean-std filtering on all observations <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/mean_std_filtering.py>`__.
#. `Flatten any complex observation space to a 1D space <https://github.com/ray-project/ray/blob/master/rllib/examples/connectors/flatten_observations_dict_space.py>`__.
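
As a small, hedged illustration of plugging one of these pieces into a config (assuming the `FlattenObservations` connector and the `env_to_module_connector` argument are available under these names in your RLlib version):

    from ray.rllib.algorithms.ppo import PPOConfig
    from ray.rllib.connectors.env_to_module import FlattenObservations

    config = (
        PPOConfig()
        .api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        .environment("CartPole-v1")
        .env_runners(
            # Callable returning the extra env-to-module connector piece(s).
            env_to_module_connector=lambda env: FlattenObservations(),
        )
    )
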
101 changes: 101 additions & 0 deletions rllib/BUILD
Expand Up @@ -2887,6 +2887,107 @@ py_test(
args = ["--enable-new-api-stack", "--as-test"]
)


#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_local_cpu_torch",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=torch", "--config=local-cpu"]
)

#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_local_cpu_tf2",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=tf2", "--config=local-cpu"]
)

#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_local_gpu_torch",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples", "gpu"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=torch", "--config=local-gpu"]
)

#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_local_gpu_tf2",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples", "gpu", "exclusive"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=tf2", "--config=local-gpu"]
)

#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_remote_cpu_torch",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=torch", "--config=remote-cpu"]
)

#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_remote_cpu_tf2",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=tf2", "--config=remote-cpu"]
)

#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_remote_gpu_torch",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples", "gpu", "exclusive"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=torch", "--config=remote-gpu"]
)

#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_remote_gpu_tf2",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples", "gpu", "exclusive"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=tf2", "--config=remote-gpu"]
)

#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_multi_gpu_torch",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples", "multi_gpu", "exclusive"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=torch", "--config=multi-gpu-ddp"]
)

#@OldAPIStack @HybridAPIStack
py_test(
name = "examples/learners/ppo_tuner_multi_gpu_tf2",
main = "examples/learners/ppo_tuner.py",
tags = ["team:rllib", "examples", "multi_gpu", "exclusive"],
size = "medium",
srcs = ["examples/learners/ppo_tuner.py"],
args = ["--framework=tf2", "--config=multi-gpu-ddp"]
)

# subdirectory: multi_agent/
# ....................................
py_test(
43 changes: 30 additions & 13 deletions rllib/algorithms/algorithm.py
Expand Up @@ -817,19 +817,36 @@ def setup(self, config: AlgorithmConfig) -> None:
rl_module_ckpt_dirs=rl_module_ckpt_dirs,
)

# Sync the weights from the learner group to the EnvRunners.
rl_module_state = self.learner_group.get_state(
components=COMPONENT_LEARNER + "/" + COMPONENT_RL_MODULE,
inference_only=True,
)[COMPONENT_LEARNER][COMPONENT_RL_MODULE]
self.env_runner.set_state({COMPONENT_RL_MODULE: rl_module_state})
self.env_runner_group.sync_env_runner_states(
config=self.config,
env_steps_sampled=self.metrics.peek(
NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0
),
rl_module_state=rl_module_state,
)
# Only when using RolloutWorkers: Update also the worker set's
# `is_policy_to_train`.
# Note that with the new EnvRunner API in combination with the new stack,
# this information only needs to be kept in the Learner and not on the
# EnvRunners anymore.
if not self.config.enable_env_runner_and_connector_v2:
policies_to_train = self.config.policies_to_train or set(
self.config.policies
)
self.env_runner_group.foreach_worker(
lambda w: w.set_is_policy_to_train(policies_to_train),
)
# Sync the weights from the learner group to the rollout workers.
self.env_runner.set_weights(self.learner_group.get_weights())
self.env_runner_group.sync_weights(inference_only=True)
# New stack/EnvRunner APIs: Use get/set_state.
else:
# Sync the weights from the learner group to the EnvRunners.
rl_module_state = self.learner_group.get_state(
components=COMPONENT_LEARNER + "/" + COMPONENT_RL_MODULE,
inference_only=True,
)[COMPONENT_LEARNER][COMPONENT_RL_MODULE]
self.env_runner.set_state({COMPONENT_RL_MODULE: rl_module_state})
self.env_runner_group.sync_env_runner_states(
config=self.config,
env_steps_sampled=self.metrics.peek(
NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0
),
rl_module_state=rl_module_state,
)

if self.offline_data:
# If the learners are remote we need to provide specific
6 changes: 6 additions & 0 deletions rllib/algorithms/cql/cql.py
Expand Up @@ -107,6 +107,12 @@ def __init__(self, algo_class=None):
}

# Changes to Algorithm's/SACConfig's default:

# `.api_stack()`
self.api_stack(
enable_rl_module_and_learner=False,
enable_env_runner_and_connector_v2=False,
)
# .reporting()
self.min_sample_timesteps_per_iteration = 0
self.min_train_timesteps_per_iteration = 100
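
As a quick, illustrative check of this new default (not part of the diff):

    from ray.rllib.algorithms.cql import CQLConfig

    config = CQLConfig()
    # With this change, CQL defaults to the old API stack.
    assert not config.enable_rl_module_and_learner
    assert not config.enable_env_runner_and_connector_v2
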
24 changes: 8 additions & 16 deletions rllib/algorithms/dqn/dqn.py
Expand Up @@ -424,23 +424,15 @@ def validate(self) -> None:
# Call super's validation method.
super().validate()

# Disallow hybrid API stack for DQN/SAC.
# Warn about new API stack on by default.
if self.enable_rl_module_and_learner:
if not self.enable_env_runner_and_connector_v2:
raise ValueError(
"Hybrid API stack (`enable_rl_module_and_learner=True` and "
"`enable_env_runner_and_connector_v2=False`) no longer supported "
"for DQN! Set both to True (recommended new API stack) or both to "
"False (old API stack)."
)
else:
logger.warning(
"You are running DQN on the new API stack! This is the new default "
"behavior for this algorithm. If you don't want to use the new API "
"stack, set `config.api_stack(enable_rl_module_and_learner=False, "
"enable_env_runner_and_connector_v2=False)`. For a detailed "
"migration guide, see here: https://docs.ray.io/en/master/rllib/new-api-stack-migration-guide.html" # noqa
)
logger.warning(
"You are running DQN on the new API stack! This is the new default "
"behavior for this algorithm. If you don't want to use the new API "
"stack, set `config.api_stack(enable_rl_module_and_learner=False, "
"enable_env_runner_and_connector_v2=False)`. For a detailed "
"migration guide, see here: https://docs.ray.io/en/master/rllib/new-api-stack-migration-guide.html" # noqa
)

if (
not self.enable_rl_module_and_learner
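
The warning above already spells out the opt-out; as a minimal sketch (assuming the usual `DQNConfig` import path), reverting DQN to the old API stack looks like this:

    from ray.rllib.algorithms.dqn import DQNConfig

    config = (
        DQNConfig()
        .api_stack(
            enable_rl_module_and_learner=False,
            enable_env_runner_and_connector_v2=False,
        )
        .environment("CartPole-v1")
    )
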
10 changes: 1 addition & 9 deletions rllib/algorithms/ppo/ppo.py
Expand Up @@ -532,15 +532,7 @@ def _training_step_old_api_stack(self) -> ResultDict:
# Standardize advantages.
train_batch = standardize_fields(train_batch, ["advantages"])

# Perform a train step on the collected batch.
if self.config.enable_rl_module_and_learner:
train_results = self.learner_group.update_from_batch(
batch=train_batch,
minibatch_size=self.config.minibatch_size,
num_epochs=self.config.num_epochs,
)

elif self.config.simple_optimizer:
if self.config.simple_optimizer:
train_results = train_one_step(self, train_batch)
else:
train_results = multi_gpu_train_one_step(self, train_batch)
46 changes: 46 additions & 0 deletions rllib/algorithms/ppo/tests/test_ppo.py
Expand Up @@ -130,6 +130,52 @@ def test_ppo_compilation_and_schedule_mixins(self):
# algo.evaluate()
algo.stop()

def test_ppo_free_log_std(self):
"""Tests the free log std option works."""
config = (
ppo.PPOConfig()
.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
.environment("Pendulum-v1")
.env_runners(
num_env_runners=1,
)
.rl_module(
model_config_dict={
"fcnet_hiddens": [10],
"fcnet_activation": "linear",
"free_log_std": True,
"vf_share_layers": True,
}
)
.training(
gamma=0.99,
)
)

algo = config.build()
module = algo.get_module(DEFAULT_MODULE_ID)

# Check the free log std var is created.
matching = [v for (n, v) in module.named_parameters() if "log_std" in n]
assert len(matching) == 1, matching
log_std_var = matching[0]

def get_value(log_std_var=log_std_var):
return log_std_var.detach().cpu().numpy()[0]

# Check the variable is initially zero.
init_std = get_value()
assert init_std == 0.0, init_std
algo.train()

# Check the variable is updated.
post_std = get_value()
assert post_std != 0.0, post_std
algo.stop()


if __name__ == "__main__":
import pytest
1 change: 0 additions & 1 deletion rllib/algorithms/ppo/tests/test_ppo_old_api_stack.py
Expand Up @@ -144,7 +144,6 @@ def test_ppo_compilation_w_connectors(self):
num_env_runners=1,
# Test with compression.
compress_observations=True,
enable_connectors=True,
)
.callbacks(MyCallbacks)
.evaluation(
26 changes: 22 additions & 4 deletions rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
Expand Up @@ -30,14 +30,32 @@ def _forward(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]:
return output

@override(RLModule)
def _forward_train(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]:
"""Train forward pass (keep embeddings for possible shared value func. call)."""
def _forward_exploration(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]:
return self._forward_inference(batch)

@override(RLModule)
def _forward_train(self, batch: Dict[str, Any]) -> Dict[str, Any]:
if self.config.inference_only:
raise RuntimeError(
"Trying to train a module that is not a learner module. Set the "
"flag `inference_only=False` when building the module."
)
output = {}

# Shared encoder.
encoder_outs = self.encoder(batch)
output[Columns.EMBEDDINGS] = encoder_outs[ENCODER_OUT][CRITIC]
if Columns.STATE_OUT in encoder_outs:
output[Columns.STATE_OUT] = encoder_outs[Columns.STATE_OUT]
output[Columns.ACTION_DIST_INPUTS] = self.pi(encoder_outs[ENCODER_OUT][ACTOR])

# Value head.
vf_out = self.vf(encoder_outs[ENCODER_OUT][CRITIC])
# Squeeze out last dim (value function node).
output[Columns.VF_PREDS] = vf_out.squeeze(-1)

# Policy head.
action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR])
output[Columns.ACTION_DIST_INPUTS] = action_logits

return output

@override(ValueFunctionAPI)