diff --git a/rllib/algorithms/cql/cql.py b/rllib/algorithms/cql/cql.py
index b16f67264234e..8bb1b74ab1df8 100644
--- a/rllib/algorithms/cql/cql.py
+++ b/rllib/algorithms/cql/cql.py
@@ -89,23 +89,10 @@ def __init__(self, algo_class=None):
         # Note, the new stack defines learning rates for each component.
         # The base learning rate `lr` has to be set to `None`, if using
         # the new stack.
         self.actor_lr = 1e-4
         self.critic_lr = 1e-3
         self.alpha_lr = 1e-3
-        self.replay_buffer_config = {
-            "_enable_replay_buffer_api": True,
-            "type": "MultiAgentPrioritizedReplayBuffer",
-            "capacity": int(1e6),
-            # If True prioritized replay buffer will be used.
-            "prioritized_replay": False,
-            "prioritized_replay_alpha": 0.6,
-            "prioritized_replay_beta": 0.4,
-            "prioritized_replay_eps": 1e-6,
-            # Whether to compute priorities already on the remote worker side.
-            "worker_side_prioritization": False,
-        }
-
         # Changes to Algorithm's/SACConfig's default:
         # `.api_stack()`
diff --git a/rllib/algorithms/cql/torch/cql_torch_rl_module.py b/rllib/algorithms/cql/torch/cql_torch_rl_module.py
index 69eced51d2627..8edb5fcf5c324 100644
--- a/rllib/algorithms/cql/torch/cql_torch_rl_module.py
+++ b/rllib/algorithms/cql/torch/cql_torch_rl_module.py
@@ -42,14 +42,17 @@ def _forward_train(self, batch: Dict) -> Dict[str, Any]:
         # First for the random actions (from the mu-distribution as named by Kumar et
         # al. (2020)).
         low = torch.tensor(
-            self.action_space.low,
+            self.config.action_space.low,
             device=fwd_out[QF_PREDS].device,
         )
         high = torch.tensor(
-            self.action_space.high,
+            self.config.action_space.high,
             device=fwd_out[QF_PREDS].device,
         )
-        num_samples = batch[Columns.ACTIONS].shape[0] * self.model_config["num_actions"]
+        num_samples = (
+            batch[Columns.ACTIONS].shape[0]
+            * self.config.model_config_dict["num_actions"]
+        )
         actions_rand_repeat = low + (high - low) * torch.rand(
             (num_samples, low.shape[0]), device=fwd_out[QF_PREDS].device
         )
@@ -125,7 +128,7 @@ def _repeat_actions(
     ) -> Dict[str, TensorType]:
         """Generated actions and Q-values for repeated observations.

-        The `self.model_config["num_actions"]` define a multiplier
+        The `self.config.model_config_dict["num_actions"]` defines a multiplier
         used for generating `num_actions` as many actions as the batch size.
         Observations are repeated and then a model forward pass is made.
@@ -142,7 +145,7 @@ def _repeat_actions(
         # Receive the batch size.
         batch_size = obs.shape[0]
         # Receive the number of action to sample.
-        num_actions = self.model_config["num_actions"]
+        num_actions = self.config.model_config_dict["num_actions"]
         # Repeat the observations `num_actions` times.
         obs_repeat = tree.map_structure(
             lambda t: self._repeat_tensor(t, num_actions), obs
diff --git a/rllib/tuned_examples/cql/pendulum_cql.py b/rllib/tuned_examples/cql/pendulum_cql.py
index 24e74f0781a7b..b8b5f2affbedc 100644
--- a/rllib/tuned_examples/cql/pendulum_cql.py
+++ b/rllib/tuned_examples/cql/pendulum_cql.py
@@ -62,13 +62,11 @@
         bc_iters=200,
         tau=9.5e-3,
         min_q_weight=5.0,
-        train_batch_size_per_learner=1024,
+        train_batch_size_per_learner=2048,
         twin_q=True,
         actor_lr=1.7e-3 * (args.num_gpus or 1) ** 0.5,
         critic_lr=2.5e-3 * (args.num_gpus or 1) ** 0.5,
         alpha_lr=1e-3 * (args.num_gpus or 1) ** 0.5,
-        # Set this to `None` for all `SAC`-like algorithms. These
-        # algorithms use learning rates for each optimizer.
         lr=None,
     )
     .reporting(
@@ -76,9 +74,9 @@
         metrics_num_episodes_for_smoothing=5,
     )
     .evaluation(
-        evaluation_interval=3,
-        evaluation_num_env_runners=1,
-        evaluation_duration=5,
+        evaluation_interval=1,
+        evaluation_num_env_runners=0,
+        evaluation_duration=10,
         evaluation_config={
             "explore": False,
         },
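The CQLConfig comment and the tuned example above both rely on the new-stack learning-rate convention: one learning rate per optimizer (`actor_lr`, `critic_lr`, `alpha_lr`) and the base `lr` set to `None`. Below is a minimal, hedged sketch of that convention only, not the full tuned example; the import path and `.training()` keyword names mirror the files touched in this diff, the values are illustrative, and a real CQL run would additionally need the offline-data settings that are not part of these hunks.

```python
# Sketch of the per-optimizer learning-rate setup used on the new API stack.
# Assumes the standard CQLConfig import path; values are illustrative only.
from ray.rllib.algorithms.cql import CQLConfig

config = (
    CQLConfig()
    .environment("Pendulum-v1")
    .training(
        twin_q=True,
        # One learning rate per component (actor, critic, entropy alpha).
        actor_lr=1e-4,
        critic_lr=1e-3,
        alpha_lr=1e-3,
        # The base learning rate is not used on the new stack and is set
        # to None, as the CQLConfig comment above requires.
        lr=None,
    )
)

print(config.actor_lr, config.critic_lr, config.alpha_lr, config.lr)
```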
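The `cql_torch_rl_module.py` hunks read `num_actions` from `self.config.model_config_dict` and use it as a multiplier: each observation is repeated that many times, and uniform "mu-distribution" actions are drawn inside the action-space bounds for the enlarged batch. The standalone PyTorch snippet below is a rough illustration of that repeat-and-sample pattern, independent of RLlib; the helper name, the interleaved ordering, and the 1-D bounds are assumptions for the sketch, not the module's exact implementation.

```python
import torch


def repeat_obs(obs: torch.Tensor, num_actions: int) -> torch.Tensor:
    """Tile each observation `num_actions` times along the batch axis.

    Illustrative helper only (not RLlib's `_repeat_tensor`): it shows the
    idea of evaluating several candidate actions per observation in one
    batched forward pass.
    """
    # (B, obs_dim) -> (B * num_actions, obs_dim)
    return obs.repeat_interleave(num_actions, dim=0)


obs = torch.randn(4, 3)          # batch of 4 observations, obs_dim=3
num_actions = 10                 # e.g. model_config["num_actions"]
obs_repeat = repeat_obs(obs, num_actions)
assert obs_repeat.shape == (4 * num_actions, 3)

# Uniform random actions within assumed 1-D action bounds, one per
# repeated observation, mirroring the `actions_rand_repeat` computation
# in the `_forward_train` hunk above.
low, high = torch.tensor([-2.0]), torch.tensor([2.0])
actions_rand = low + (high - low) * torch.rand(obs_repeat.shape[0], low.shape[0])
assert actions_rand.shape == (40, 1)
```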