
Commit 0ab8e84

[RLlib; Offline RL] CQL: Support multi-GPU/CPU setup and different learning rates for actor, critic, and alpha. (ray-project#47402)

Signed-off-by: ujjawal-khare <ujjawal.khare@dream11.com>
simonsays1980 authored and ujjawal-khare committed Oct 15, 2024
1 parent 93c97db commit 0ab8e84
Showing 3 changed files with 13 additions and 25 deletions.
rllib/algorithms/cql/cql.py (1 addition, 14 deletions)
@@ -89,23 +89,10 @@ def __init__(self, algo_class=None):
# Note, the new stack defines learning rates for each component.
# The base learning rate `lr` has to be set to `None`, if using
# the new stack.
self.actor_lr = 1e-4
self.actor_lr = 1e-4,
self.critic_lr = 1e-3
self.alpha_lr = 1e-3

self.replay_buffer_config = {
"_enable_replay_buffer_api": True,
"type": "MultiAgentPrioritizedReplayBuffer",
"capacity": int(1e6),
# If True prioritized replay buffer will be used.
"prioritized_replay": False,
"prioritized_replay_alpha": 0.6,
"prioritized_replay_beta": 0.4,
"prioritized_replay_eps": 1e-6,
# Whether to compute priorities already on the remote worker side.
"worker_side_prioritization": False,
}

# Changes to Algorithm's/SACConfig's default:

# `.api_stack()`
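For context, a minimal sketch of how the per-component learning rates above are meant to be set on the new stack. The CQLConfig class and the .training() keyword arguments are taken from the tuned example in this commit; the environment choice and the values are illustrative only, not part of the change.

from ray.rllib.algorithms.cql import CQLConfig

# Minimal sketch: per-component learning rates on the new API stack.
# Environment and values are illustrative, not tuned.
config = (
    CQLConfig()
    .environment("Pendulum-v1")
    .training(
        actor_lr=1e-4,
        critic_lr=1e-3,
        alpha_lr=1e-3,
        # The base learning rate must be set to `None` when the
        # per-component learning rates above are used.
        lr=None,
    )
)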
rllib/algorithms/cql/torch/cql_torch_rl_module.py (8 additions, 5 deletions)
@@ -42,14 +42,17 @@ def _forward_train(self, batch: Dict) -> Dict[str, Any]:
# First for the random actions (from the mu-distribution as named by Kumar et
# al. (2020)).
low = torch.tensor(
self.action_space.low,
self.config.action_space.low,
device=fwd_out[QF_PREDS].device,
)
high = torch.tensor(
self.action_space.high,
self.config.action_space.high,
device=fwd_out[QF_PREDS].device,
)
num_samples = batch[Columns.ACTIONS].shape[0] * self.model_config["num_actions"]
num_samples = (
batch[Columns.ACTIONS].shape[0]
* self.config.model_config_dict["num_actions"]
)
actions_rand_repeat = low + (high - low) * torch.rand(
(num_samples, low.shape[0]), device=fwd_out[QF_PREDS].device
)
@@ -125,7 +128,7 @@ def _repeat_actions(
) -> Dict[str, TensorType]:
"""Generated actions and Q-values for repeated observations.
The `self.model_config["num_actions"]` define a multiplier
The `self.config.model_config_dict["num_actions"]` define a multiplier
used for generating `num_actions` as many actions as the batch size.
Observations are repeated and then a model forward pass is made.
@@ -142,7 +145,7 @@ def _repeat_actions(
# Receive the batch size.
batch_size = obs.shape[0]
# Receive the number of action to sample.
num_actions = self.model_config["num_actions"]
num_actions = self.config.model_config_dict["num_actions"]
# Repeat the observations `num_actions` times.
obs_repeat = tree.map_structure(
lambda t: self._repeat_tensor(t, num_actions), obs
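As a standalone illustration of the sampling pattern in _forward_train and _repeat_actions above, here is a plain-PyTorch sketch. Shapes and names are made up for the example and it does not use the RLlib helpers from the diff.

import torch

# Sketch of the "mu-distribution" sampling from the diff above: draw
# `num_actions` uniform random actions per batch row, bounded by the
# action space, and repeat each observation `num_actions` times.
batch_size, obs_dim, act_dim, num_actions = 4, 3, 1, 10
low = torch.tensor([-2.0])
high = torch.tensor([2.0])

num_samples = batch_size * num_actions
actions_rand_repeat = low + (high - low) * torch.rand((num_samples, act_dim))

obs = torch.randn(batch_size, obs_dim)
obs_repeat = obs.repeat_interleave(num_actions, dim=0)

assert actions_rand_repeat.shape == (num_samples, act_dim)
assert obs_repeat.shape == (num_samples, obs_dim)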
rllib/tuned_examples/cql/pendulum_cql.py (4 additions, 6 deletions)
@@ -62,23 +62,21 @@
bc_iters=200,
tau=9.5e-3,
min_q_weight=5.0,
train_batch_size_per_learner=1024,
train_batch_size_per_learner=2048,
twin_q=True,
actor_lr=1.7e-3 * (args.num_gpus or 1) ** 0.5,
critic_lr=2.5e-3 * (args.num_gpus or 1) ** 0.5,
alpha_lr=1e-3 * (args.num_gpus or 1) ** 0.5,
# Set this to `None` for all `SAC`-like algorithms. These
# algorithms use learning rates for each optimizer.
lr=None,
)
.reporting(
min_time_s_per_iteration=10,
metrics_num_episodes_for_smoothing=5,
)
.evaluation(
evaluation_interval=3,
evaluation_num_env_runners=1,
evaluation_duration=5,
evaluation_interval=1,
evaluation_num_env_runners=0,
evaluation_duration=10,
evaluation_config={
"explore": False,
},
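The tuned example above scales each learning rate by the square root of the number of GPUs, a common heuristic for compensating the larger effective batch size of a multi-learner setup. A small sketch of that rule follows; scaled_lr is a hypothetical helper, not RLlib API.

def scaled_lr(base_lr: float, num_gpus: int) -> float:
    # Square-root scaling: with N GPU learners, multiply the base
    # learning rate by sqrt(N); fall back to 1 on a CPU-only setup.
    return base_lr * (num_gpus or 1) ** 0.5

assert scaled_lr(1.7e-3, 0) == 1.7e-3               # CPU only: unchanged
assert abs(scaled_lr(1.7e-3, 4) - 3.4e-3) < 1e-12   # 4 GPUs: doubled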
