
Commit 0ab8e84

[RLlib; Offline RL] CQL: Support multi-GPU/CPU setup and different learning rates for actor, critic, and alpha. (ray-project#47402)

Signed-off-by: ujjawal-khare <ujjawal.khare@dream11.com>
simonsays1980 authored and ujjawal-khare committed Oct 15, 2024
1 parent 93c97db commit 0ab8e84
Showing 3 changed files with 13 additions and 25 deletions.
rllib/algorithms/cql/cql.py (1 addition, 14 deletions)
@@ -89,23 +89,10 @@ def __init__(self, algo_class=None):
# Note, the new stack defines learning rates for each component.
# The base learning rate `lr` has to be set to `None`, if using
# the new stack.
self.actor_lr = 1e-4
self.actor_lr = 1e-4,
self.critic_lr = 1e-3
self.alpha_lr = 1e-3

self.replay_buffer_config = {
"_enable_replay_buffer_api": True,
"type": "MultiAgentPrioritizedReplayBuffer",
"capacity": int(1e6),
# If True prioritized replay buffer will be used.
"prioritized_replay": False,
"prioritized_replay_alpha": 0.6,
"prioritized_replay_beta": 0.4,
"prioritized_replay_eps": 1e-6,
# Whether to compute priorities already on the remote worker side.
"worker_side_prioritization": False,
}

# Changes to Algorithm's/SACConfig's default:

# `.api_stack()`
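For context, a minimal sketch of how the per-component learning rates above are meant to be set on the new stack. The CQLConfig class and the .training() keyword arguments are taken from the tuned example in this commit; the environment choice and the values are illustrative only, not part of the change.

from ray.rllib.algorithms.cql import CQLConfig

# Minimal sketch: per-component learning rates on the new API stack.
# Environment and values are illustrative, not tuned.
config = (
    CQLConfig()
    .environment("Pendulum-v1")
    .training(
        actor_lr=1e-4,
        critic_lr=1e-3,
        alpha_lr=1e-3,
        # The base learning rate must be set to `None` when the
        # per-component learning rates above are used.
        lr=None,
    )
)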
rllib/algorithms/cql/torch/cql_torch_rl_module.py (8 additions, 5 deletions)
@@ -42,14 +42,17 @@ def _forward_train(self, batch: Dict) -> Dict[str, Any]:
# First for the random actions (from the mu-distribution as named by Kumar et
# al. (2020)).
low = torch.tensor(
self.action_space.low,
self.config.action_space.low,
device=fwd_out[QF_PREDS].device,
)
high = torch.tensor(
self.action_space.high,
self.config.action_space.high,
device=fwd_out[QF_PREDS].device,
)
num_samples = batch[Columns.ACTIONS].shape[0] * self.model_config["num_actions"]
num_samples = (
batch[Columns.ACTIONS].shape[0]
* self.config.model_config_dict["num_actions"]
)
actions_rand_repeat = low + (high - low) * torch.rand(
(num_samples, low.shape[0]), device=fwd_out[QF_PREDS].device
)
@@ -125,7 +128,7 @@ def _repeat_actions(
) -> Dict[str, TensorType]:
"""Generated actions and Q-values for repeated observations.
The `self.model_config["num_actions"]` define a multiplier
The `self.config.model_config_dict["num_actions"]` define a multiplier
used for generating `num_actions` as many actions as the batch size.
Observations are repeated and then a model forward pass is made.
@@ -142,7 +145,7 @@ def _repeat_actions(
# Receive the batch size.
batch_size = obs.shape[0]
# Receive the number of action to sample.
num_actions = self.model_config["num_actions"]
num_actions = self.config.model_config_dict["num_actions"]
# Repeat the observations `num_actions` times.
obs_repeat = tree.map_structure(
lambda t: self._repeat_tensor(t, num_actions), obs
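As a standalone illustration of the sampling pattern in _forward_train and _repeat_actions above, here is a plain-PyTorch sketch. Shapes and names are made up for the example and it does not use the RLlib helpers from the diff.

import torch

# Sketch of the "mu-distribution" sampling from the diff above: draw
# `num_actions` uniform random actions per batch row, bounded by the
# action space, and repeat each observation `num_actions` times.
batch_size, obs_dim, act_dim, num_actions = 4, 3, 1, 10
low = torch.tensor([-2.0])
high = torch.tensor([2.0])

num_samples = batch_size * num_actions
actions_rand_repeat = low + (high - low) * torch.rand((num_samples, act_dim))

obs = torch.randn(batch_size, obs_dim)
obs_repeat = obs.repeat_interleave(num_actions, dim=0)

assert actions_rand_repeat.shape == (num_samples, act_dim)
assert obs_repeat.shape == (num_samples, obs_dim)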
rllib/tuned_examples/cql/pendulum_cql.py (4 additions, 6 deletions)
@@ -62,23 +62,21 @@
bc_iters=200,
tau=9.5e-3,
min_q_weight=5.0,
train_batch_size_per_learner=1024,
train_batch_size_per_learner=2048,
twin_q=True,
actor_lr=1.7e-3 * (args.num_gpus or 1) ** 0.5,
critic_lr=2.5e-3 * (args.num_gpus or 1) ** 0.5,
alpha_lr=1e-3 * (args.num_gpus or 1) ** 0.5,
# Set this to `None` for all `SAC`-like algorithms. These
# algorithms use learning rates for each optimizer.
lr=None,
)
.reporting(
min_time_s_per_iteration=10,
metrics_num_episodes_for_smoothing=5,
)
.evaluation(
evaluation_interval=3,
evaluation_num_env_runners=1,
evaluation_duration=5,
evaluation_interval=1,
evaluation_num_env_runners=0,
evaluation_duration=10,
evaluation_config={
"explore": False,
},
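The tuned example above scales each learning rate by the square root of the number of GPUs, a common heuristic for compensating the larger effective batch size of a multi-learner setup. A small sketch of that rule follows; scaled_lr is a hypothetical helper, not RLlib API.

def scaled_lr(base_lr: float, num_gpus: int) -> float:
    # Square-root scaling: with N GPU learners, multiply the base
    # learning rate by sqrt(N); fall back to 1 on a CPU-only setup.
    return base_lr * (num_gpus or 1) ** 0.5

assert scaled_lr(1.7e-3, 0) == 1.7e-3               # CPU only: unchanged
assert abs(scaled_lr(1.7e-3, 4) - 3.4e-3) < 1e-12   # 4 GPUs: doubled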
