diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index 039ebdae0c5a..0a776648322d 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -26,7 +26,7 @@ as well as multi-GPU training on multi-node (GPU) clusters when using the `Anysc +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ | :ref:`DQN/Rainbow (Deep Q Networks) ` | |single_agent| |multi_agent| | |multi_gpu| |multi_node_multi_gpu| | |discr_actions| | +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ -| :ref:`SAC (Soft Actor Critic) ` | |single_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| | +| :ref:`SAC (Soft Actor Critic) ` | |single_agent| |multi_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| | +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ | **High-throughput on- and off policy** | +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 26165cb40852..a91f901a6c53 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -2790,6 +2790,42 @@ cluster: cluster_compute: 8gpus_96cpus_gce.yaml + +# -------------------------- +# SAC +# -------------------------- +- name: rllib_learning_tests_halfcheetah_sac_torch + group: RLlib tests + working_dir: rllib_tests + + stable: true + + frequency: nightly + team: rllib + cluster: + byod: + type: gpu + post_build_script: byod_rllib_test.sh + runtime_env: + - RLLIB_TEST_NO_JAX_IMPORT=1 + - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin + cluster_compute: 4gpus_64cpus.yaml + + run: + timeout: 7200 + script: python learning_tests/tuned_examples/sac/halfcheetah_sac.py --enable-new-api-stack --num-gpus=4 --num-env-runners=8 --stop-reward=1000.0 --as-release-test + + alert: default + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: 4gpus_64cpus_gce.yaml + + ######################## # Core Nightly Tests ######################## diff --git a/rllib/BUILD b/rllib/BUILD index 6b8a48f36e3e..c74e9dfb4c48 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -349,6 +349,30 @@ py_test( srcs = ["tuned_examples/dqn/cartpole_dqn.py"], args = ["--as-test", "--enable-new-api-stack"] ) +py_test( + name = "learning_tests_cartpole_dqn_gpu", + main = "tuned_examples/dqn/cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], + size = "large", + srcs = ["tuned_examples/dqn/cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"] +) +py_test( + name = "learning_tests_cartpole_dqn_multi_cpu", + main = "tuned_examples/dqn/cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], + size = "large", + srcs = ["tuned_examples/dqn/cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"] 
+) +py_test( + name = "learning_tests_cartpole_dqn_multi_gpu", + main = "tuned_examples/dqn/cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], + size = "large", + srcs = ["tuned_examples/dqn/cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"] +) # MultiAgentCartPole py_test( name = "learning_tests_multi_agent_cartpole_dqn", @@ -358,16 +382,29 @@ py_test( srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"], args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4"] ) - -#@OldAPIStack py_test( - name = "learning_tests_cartpole_dqn_softq_old_api_stack", - main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete"], - size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer - srcs = ["tests/run_regression_tests.py"], - data = ["tuned_examples/dqn/cartpole-dqn-softq.yaml"], - args = ["--dir=tuned_examples/dqn"] + name = "learning_tests_multi_agent_cartpole_dqn_gpu", + main = "tuned_examples/dqn/multi_agent_cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], + size = "large", + srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4", "--num-gpus=1"] +) +py_test( + name = "learning_tests_multi_agent_cartpole_dqn_multi_cpu", + main = "tuned_examples/dqn/multi_agent_cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], + size = "large", + srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=5", "--num-gpus=2"] +) +py_test( + name = "learning_tests_multi_agent_cartpole_dqn_multi_gpu", + main = "tuned_examples/dqn/multi_agent_cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], + size = "large", + srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4", "--num-gpus=2"] ) # IMPALA @@ -669,7 +706,31 @@ py_test( srcs = ["tuned_examples/sac/pendulum_sac.py"], args = ["--as-test", "--enable-new-api-stack"] ) - +py_test( + name = "learning_tests_pendulum_sac_gpu", + main = "tuned_examples/sac/pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous", "gpu"], + size = "large", + srcs = ["tuned_examples/sac/pendulum_sac.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"] +) +py_test( + name = "learning_tests_pendulum_sac_multi_cpu", + main = "tuned_examples/sac/pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous"], + size = "large", + srcs = ["tuned_examples/sac/pendulum_sac.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"] +) +py_test( + name = "learning_tests_pendulum_sac_multi_gpu", + main = "tuned_examples/sac/pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous", "multi_gpu"], + size = "large", + srcs = 
["tuned_examples/sac/pendulum_sac.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"] +) +# MultiAgentPendulum py_test( name = "learning_tests_multi_agent_pendulum_sac", main = "tuned_examples/sac/multi_agent_pendulum_sac.py", @@ -678,7 +739,22 @@ py_test( srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"], args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4"] ) - +py_test( + name = "learning_tests_multi_agent_pendulum_sac_gpu", + main = "tuned_examples/sac/multi_agent_pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous", "gpu"], + size = "large", + srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4", "--num-gpus=1"] +) +py_test( + name = "learning_tests_multi_agent_pendulum_sac_multi_cpu", + main = "tuned_examples/sac/multi_agent_pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous"], + size = "large", + srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--num-gpus=2"] +) py_test( name = "learning_tests_multi_agent_pendulum_sac_multi_gpu", main = "tuned_examples/sac/multi_agent_pendulum_sac.py", @@ -3240,7 +3316,7 @@ py_test( name = "examples/rl_modules/custom_lstm_rl_module", main = "examples/rl_modules/custom_lstm_rl_module.py", tags = ["team:rllib", "examples"], - size = "medium", + size = "large", srcs = ["examples/rl_modules/custom_lstm_rl_module.py"], args = ["--as-test", "--enable-new-api-stack"], ) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 39379d206839..95bb0ba23baf 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -109,9 +109,7 @@ ENV_RUNNER_RESULTS, ENV_RUNNER_SAMPLING_TIMER, EPISODE_LEN_MEAN, - EPISODE_RETURN_MAX, EPISODE_RETURN_MEAN, - EPISODE_RETURN_MIN, EVALUATION_ITERATION_TIMER, EVALUATION_RESULTS, FAULT_TOLERANCE_STATS, @@ -1701,7 +1699,7 @@ def training_step(self) -> ResultDict: if self.config.count_steps_by == "agent_steps": train_batch, env_runner_results = synchronous_parallel_sample( worker_set=self.env_runner_group, - max_agent_steps=self.config.train_batch_size, + max_agent_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, _uses_new_env_runners=( self.config.enable_env_runner_and_connector_v2 @@ -1711,7 +1709,7 @@ def training_step(self) -> ResultDict: else: train_batch, env_runner_results = synchronous_parallel_sample( worker_set=self.env_runner_group, - max_env_steps=self.config.train_batch_size, + max_env_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, _uses_new_env_runners=( self.config.enable_env_runner_and_connector_v2 @@ -3846,21 +3844,23 @@ def _compile_iteration_results_new_api_stack( # Return dict (shallow copy of `train_results`). results: ResultDict = train_results.copy() - # TODO (sven): Fix Tune, instead, to be tolerant against possibly missing result - # keys. Otherwise, we'll have to guess here, what "popular" keys users use in - # order to protect them from running into Tune KeyErrors. 
- if ENV_RUNNER_RESULTS not in results: - results[ENV_RUNNER_RESULTS] = {} - for must_have in [ - EPISODE_RETURN_MEAN, - EPISODE_RETURN_MIN, - EPISODE_RETURN_MAX, - ]: - if must_have not in results[ENV_RUNNER_RESULTS]: - results[ENV_RUNNER_RESULTS][must_have] = np.nan + # Collect old-API-stack-style `self._timers` results. + for k, timer in self._timers.items(): + if TIMERS not in results: + results[TIMERS] = {} + results[TIMERS]["{}_time_sec".format(k)] = timer.mean + if timer.has_units_processed(): + results[TIMERS]["{}_throughput".format(k)] = round( + timer.mean_throughput, 3 + ) # Evaluation results. if eval_results: + assert ( + isinstance(eval_results, dict) + and len(eval_results) == 1 + and EVALUATION_RESULTS in eval_results + ) results.update(eval_results) # Fault tolerance stats. results[FAULT_TOLERANCE_STATS] = { diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index b516b5a8f746..63e9aafd71ef 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -376,9 +376,9 @@ def __init__(self, algo_class: Optional[type] = None): self.lr = 0.001 self.grad_clip = None self.grad_clip_by = "global_norm" - self.train_batch_size = 32 # Simple logic for now: If None, use `train_batch_size`. self.train_batch_size_per_learner = None + self.train_batch_size = 32 # @OldAPIStack # TODO (sven): Unsolved problem with RLModules sometimes requiring settings from # the main AlgorithmConfig. We should not require the user to provide those # settings in both, the AlgorithmConfig (as property) AND the model config diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 25f43e9a020e..2f2e21b18db5 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -58,6 +58,7 @@ NUM_MODULE_STEPS_TRAINED, NUM_MODULE_STEPS_TRAINED_LIFETIME, NUM_TARGET_UPDATES, + REPLAY_BUFFER_ADD_DATA_TIMER, REPLAY_BUFFER_SAMPLE_TIMER, REPLAY_BUFFER_UPDATE_PRIOS_TIMER, SAMPLE_TIMER, @@ -556,7 +557,7 @@ def calculate_rr_weights(config: AlgorithmConfig) -> List[float]: # This is to set freshly rollout-collected data in relation to # the data we pull from the replay buffer (which also contains old # samples). - native_ratio = config.train_batch_size / ( + native_ratio = config.total_train_batch_size / ( config.get_rollout_fragment_length() * config.num_envs_per_env_runner # Add one to workers because the local @@ -628,13 +629,15 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: _uses_new_env_runners=True, _return_metrics=True, ) - # Add the sampled experiences to the replay buffer. - self.local_replay_buffer.add(episodes) # Reduce EnvRunner metrics over the n EnvRunners. self.metrics.merge_and_log_n_dicts( env_runner_results, key=ENV_RUNNER_RESULTS ) + # Add the sampled experiences to the replay buffer. + with self.metrics.log_time((TIMERS, REPLAY_BUFFER_ADD_DATA_TIMER)): + self.local_replay_buffer.add(episodes) + self.metrics.log_dict( self.metrics.peek( (ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED), default={} @@ -684,7 +687,7 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: # Sample a list of episodes used for learning from the replay buffer. 
with self.metrics.log_time((TIMERS, REPLAY_BUFFER_SAMPLE_TIMER)): episodes = self.local_replay_buffer.sample( - num_items=self.config.train_batch_size, + num_items=self.config.total_train_batch_size, n_step=self.config.n_step, gamma=self.config.gamma, beta=self.config.replay_buffer_config.get("beta"), @@ -707,14 +710,16 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: # disk or WandB, they might be very large). td_errors = defaultdict(list) for res in learner_results: - for mid, m_res in res.items(): - if TD_ERROR_KEY in m_res: - td_errors[mid].extend( - convert_to_numpy(m_res.pop(TD_ERROR_KEY).peek()) + for module_id, module_results in res.items(): + if TD_ERROR_KEY in module_results: + td_errors[module_id].extend( + convert_to_numpy( + module_results.pop(TD_ERROR_KEY).peek() + ) ) td_errors = { - mid: {TD_ERROR_KEY: np.concatenate(s, axis=0)} - for mid, s in td_errors.items() + module_id: {TD_ERROR_KEY: np.concatenate(s, axis=0)} + for module_id, s in td_errors.items() } self.metrics.merge_and_log_n_dicts( learner_results, key=LEARNER_RESULTS @@ -812,7 +817,7 @@ def _training_step_old_and_hybrid_api_stack(self) -> ResultDict: # Sample training batch (MultiAgentBatch) from replay buffer. train_batch = sample_min_n_steps_from_buffer( self.local_replay_buffer, - self.config.train_batch_size, + self.config.total_train_batch_size, count_by_agent_steps=self.config.count_steps_by == "agent_steps", ) diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 2f3bc8d11489..f4fc852f3586 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -100,7 +100,8 @@ def __init__(self, algo_class=None): } # .training() - self.train_batch_size = 256 + self.train_batch_size_per_learner = 256 + self.train_batch_size = 256 # @OldAPIstack # Number of timesteps to collect from rollout workers before we start # sampling from replay buffers for learning. Whether we count this in agent # steps or environment steps depends on config.multi_agent(count_steps_by=..). diff --git a/rllib/algorithms/sac/sac_learner.py b/rllib/algorithms/sac/sac_learner.py index bcb18a25ae56..58703174742a 100644 --- a/rllib/algorithms/sac/sac_learner.py +++ b/rllib/algorithms/sac/sac_learner.py @@ -31,7 +31,11 @@ def build(self) -> None: self.curr_log_alpha: Dict[ModuleID, TensorType] = LambdaDefaultDict( lambda module_id: self._get_tensor_variable( # Note, we want to train the temperature parameter. - [np.log(self.config.get_config_for_module(module_id).initial_alpha)], + [ + np.log( + self.config.get_config_for_module(module_id).initial_alpha + ).astype(np.float32) + ], trainable=True, ) ) diff --git a/rllib/algorithms/sac/torch/sac_torch_learner.py b/rllib/algorithms/sac/torch/sac_torch_learner.py index 52e9b9ec8dda..f87d46f2e4b7 100644 --- a/rllib/algorithms/sac/torch/sac_torch_learner.py +++ b/rllib/algorithms/sac/torch/sac_torch_learner.py @@ -35,7 +35,7 @@ class SACTorchLearner(DQNRainbowTorchLearner, SACLearner): This ' Learner' class implements the loss in its `self.compute_loss_for_module()` method. In addition, it updates - target networks in its inherited method `_update_module_target_networks`. + the target networks of the RLModule(s). """ # TODO (simon): Set different learning rates for optimizers. @@ -109,73 +109,20 @@ def compute_loss_for_module( batch: Dict[str, Any], fwd_out: Dict[str, TensorType] ) -> TensorType: - # Only for debugging. - deterministic = config._deterministic_loss - # Receive the current alpha hyperparameter. 
alpha = torch.exp(self.curr_log_alpha[module_id]) - module = self.module[module_id].unwrapped() - - # Get the train action distribution for the current policy and current state. - # This is needed for the policy (actor) loss in SAC. - action_dist_class = module.get_train_action_dist_cls() - action_dist_curr = action_dist_class.from_logits( - fwd_out[Columns.ACTION_DIST_INPUTS] - ) - # Get the train action distribution for the current policy and next state. - # For the Q (critic) loss in SAC, we need to sample from the current policy at - # the next state. - action_dist_next = action_dist_class.from_logits( - fwd_out["action_dist_inputs_next"] - ) - - # Sample actions for the current state. Note that we need to apply the - # reparameterization trick here to avoid the expectation over actions. - actions_curr = ( - action_dist_curr.rsample() - if not deterministic - # If deterministic, we use the mean. - else action_dist_curr.to_deterministic().sample() - ) - # Compute the log probabilities for the current state (for the critic loss). - logps_curr = action_dist_curr.logp(actions_curr) - - # Sample actions for the next state. - actions_next = ( - action_dist_next.sample() - if not deterministic - # If deterministic, we use the mean. - else action_dist_next.to_deterministic().sample() - ) - # Compute the log probabilities for the next state. - logps_next = action_dist_next.logp(actions_next) - # Get Q-values for the actually selected actions during rollout. # In the critic loss we use these as predictions. q_selected = fwd_out[QF_PREDS] if config.twin_q: q_twin_selected = fwd_out[QF_TWIN_PREDS] - # Compute Q-values for the current policy in the current state with - # the sampled actions. - q_batch_curr = { - Columns.OBS: batch[Columns.OBS], - Columns.ACTIONS: actions_curr, - } - q_curr = module.compute_q_values(q_batch_curr) - - # Compute Q-values from the target Q network for the next state with the - # sampled actions for the next state. - q_batch_next = { - Columns.OBS: batch[Columns.NEXT_OBS], - Columns.ACTIONS: actions_next, - } - q_target_next = module.forward_target(q_batch_next) - # Compute value function for next state (see eq. (3) in Haarnoja et al. (2018)). # Note, we use here the sampled actions in the log probabilities. - q_target_next -= alpha * logps_next + q_target_next = ( + fwd_out["q_target_next"] - alpha.detach() * fwd_out["logp_next_resampled"] + ) # Now mask all Q-values with terminated next states in the targets. q_next_masked = (1.0 - batch[Columns.TERMINATEDS].float()) * q_target_next @@ -215,10 +162,16 @@ def compute_loss_for_module( ) # For the actor (policy) loss we need sampled actions from the current policy - # evaluated at the current state. + # evaluated at the current observations. + # Note that the `q_curr` tensor below has the q-net's gradients ignored, while + # having the policy's gradients registered. The policy net was used to rsample + # actions used to compute `q_curr` (by passing these actions through the q-net). + # Hence, we can't do `fwd_out[q_curr].detach()`! # Note further, we minimize here, while the original equation in Haarnoja et # al. (2018) considers maximization. - actor_loss = torch.mean(alpha.detach() * logps_curr - q_curr) + actor_loss = torch.mean( + alpha.detach() * fwd_out["logp_resampled"] - fwd_out["q_curr"] + ) # Optimize also the hyperparameter alpha by using the current policy # evaluated at the current state (sampled values). @@ -226,7 +179,7 @@ def compute_loss_for_module( # to optimize and monotonic function. 
Original equation uses alpha. alpha_loss = -torch.mean( self.curr_log_alpha[module_id] - * (logps_curr.detach() + self.target_entropy[module_id]) + * (fwd_out["logp_resampled"].detach() + self.target_entropy[module_id]) ) total_loss = actor_loss + critic_loss + alpha_loss @@ -254,11 +207,10 @@ def compute_loss_for_module( "alpha_value": alpha, "log_alpha_value": torch.log(alpha), "target_entropy": self.target_entropy[module_id], - "actions_curr_policy": torch.mean(actions_curr), - LOGPS_KEY: torch.mean(logps_curr), - QF_MEAN_KEY: torch.mean(q_curr), - QF_MAX_KEY: torch.max(q_curr), - QF_MIN_KEY: torch.min(q_curr), + LOGPS_KEY: torch.mean(fwd_out["logp_resampled"]), + QF_MEAN_KEY: torch.mean(fwd_out["q_curr"]), + QF_MAX_KEY: torch.max(fwd_out["q_curr"]), + QF_MIN_KEY: torch.min(fwd_out["q_curr"]), TD_ERROR_MEAN_KEY: torch.mean(td_error), }, key=module_id, @@ -294,11 +246,9 @@ def compute_gradients( retain_graph=True ) # Store the gradients for the component and module. - # TODO (simon): Check another time the graph for overlapping - # gradients. grads.update( { - pid: p.grad.clone() + pid: p.grad for pid, p in self.filter_param_dict_for_optimizer( self._params, optim ).items() diff --git a/rllib/algorithms/sac/torch/sac_torch_rl_module.py b/rllib/algorithms/sac/torch/sac_torch_rl_module.py index 61d920609203..957e6a9ebf32 100644 --- a/rllib/algorithms/sac/torch/sac_torch_rl_module.py +++ b/rllib/algorithms/sac/torch/sac_torch_rl_module.py @@ -6,11 +6,11 @@ QF_TWIN_PREDS, ) from ray.rllib.algorithms.sac.sac_rl_module import SACRLModule +from ray.rllib.core.columns import Columns from ray.rllib.core.models.base import ENCODER_OUT, Encoder, Model from ray.rllib.core.rl_module.apis.target_network_api import TargetNetworkAPI from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.typing import StateDict @@ -61,7 +61,7 @@ def _forward_inference(self, batch: Dict) -> Dict[str, Any]: pi_encoder_outs = self.pi_encoder(batch) # Pi head. - output[SampleBatch.ACTION_DIST_INPUTS] = self.pi(pi_encoder_outs[ENCODER_OUT]) + output[Columns.ACTION_DIST_INPUTS] = self.pi(pi_encoder_outs[ENCODER_OUT]) return output @@ -79,8 +79,8 @@ def _forward_train(self, batch: Dict) -> Dict[str, Any]: output = {} # SAC needs also Q function values and action logits for next observations. - batch_curr = {SampleBatch.OBS: batch[SampleBatch.OBS]} - batch_next = {SampleBatch.OBS: batch[SampleBatch.NEXT_OBS]} + batch_curr = {Columns.OBS: batch[Columns.OBS]} + batch_next = {Columns.OBS: batch[Columns.NEXT_OBS]} # Encoder forward passes. pi_encoder_outs = self.pi_encoder(batch_curr) @@ -89,7 +89,7 @@ def _forward_train(self, batch: Dict) -> Dict[str, Any]: pi_encoder_next_outs = self.pi_encoder(batch_next) # Q-network(s) forward passes. - batch_curr.update({SampleBatch.ACTIONS: batch[SampleBatch.ACTIONS]}) + batch_curr.update({Columns.ACTIONS: batch[Columns.ACTIONS]}) output[QF_PREDS] = self._qf_forward_train_helper( batch_curr, self.qf_encoder, self.qf ) # self._qf_forward_train(batch_curr)[QF_PREDS] @@ -103,9 +103,64 @@ def _forward_train(self, batch: Dict) -> Dict[str, Any]: action_logits = self.pi(pi_encoder_outs[ENCODER_OUT]) # Also get the action logits for the next observations. 
         action_logits_next = self.pi(pi_encoder_next_outs[ENCODER_OUT])
-        output[SampleBatch.ACTION_DIST_INPUTS] = action_logits
+        output[Columns.ACTION_DIST_INPUTS] = action_logits
         output[ACTION_DIST_INPUTS_NEXT] = action_logits_next

+        # Get the train action distribution for the current policy and current state.
+        # This is needed for the policy (actor) loss in SAC.
+        action_dist_class = self.get_train_action_dist_cls()
+        action_dist_curr = action_dist_class.from_logits(action_logits)
+        # Get the train action distribution for the current policy and next state.
+        # For the Q (critic) loss in SAC, we need to sample from the current policy at
+        # the next state.
+        action_dist_next = action_dist_class.from_logits(action_logits_next)
+
+        # Sample actions for the current state. Note that we need to apply the
+        # reparameterization trick (`rsample()` instead of `sample()`) to avoid the
+        # expectation over actions.
+        actions_resampled = action_dist_curr.rsample()
+        # Compute the log probabilities for the current state (for the critic loss).
+        output["logp_resampled"] = action_dist_curr.logp(actions_resampled)
+
+        # Sample actions for the next state.
+        actions_next_resampled = action_dist_next.sample().detach()
+        # Compute the log probabilities for the next state.
+        output["logp_next_resampled"] = (
+            action_dist_next.logp(actions_next_resampled)
+        ).detach()
+
+        # Compute Q-values for the current policy in the current state with
+        # the sampled actions.
+        q_batch_curr = {
+            Columns.OBS: batch[Columns.OBS],
+            Columns.ACTIONS: actions_resampled,
+        }
+        # Make sure we perform a "straight-through gradient" pass here,
+        # ignoring the gradients of the q-net, however, still recording
+        # the gradients of the policy net (which was used to rsample the actions used
+        # here). This is different from doing `.detach()` or `with torch.no_grad()`,
+        # as these two methods would fully block all gradient recordings, including
+        # the needed policy ones.
+        all_params = (
+            list(self.qf.parameters())
+            + list(self.qf_encoder.parameters())
+            + list(self.qf_twin.parameters())
+            + list(self.qf_twin_encoder.parameters())
+        )
+        for param in all_params:
+            param.requires_grad = False
+        output["q_curr"] = self.compute_q_values(q_batch_curr)
+        for param in all_params:
+            param.requires_grad = True
+
+        # Compute Q-values from the target Q network for the next state with the
+        # sampled actions for the next state.
+        q_batch_next = {
+            Columns.OBS: batch[Columns.NEXT_OBS],
+            Columns.ACTIONS: actions_next_resampled,
+        }
+        output["q_target_next"] = self.forward_target(q_batch_next).detach()
+
         # Return the network outputs.
         return output
@@ -149,7 +204,7 @@ def _qf_forward_train_helper(

         Args:
             batch: Dict containing a concatenated tensor with observations
-                and actions under the key `SampleBatch.OBS`.
+                and actions under the key `Columns.OBS`.
             encoder: An `Encoder` model for the Q state-action encoder.
             head: A `Model` for the Q head.

@@ -158,8 +213,8 @@ def _qf_forward_train_helper(
         """
         # Construct batch. Note, we need to feed observations and actions.
         qf_batch = {
-            SampleBatch.OBS: torch.concat(
-                (batch[SampleBatch.OBS], batch[SampleBatch.ACTIONS]), dim=-1
+            Columns.OBS: torch.concat(
+                (batch[Columns.OBS], batch[Columns.ACTIONS]), dim=-1
             )
         }
         # Encoder forward pass.
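
A minimal, self-contained PyTorch sketch of the `requires_grad` toggling used in `_forward_train()` above: the Q-net parameters are frozen only while evaluating Q(s, a~pi), so the actor loss later backpropagates into the policy network but not into the critic. The toy `policy` and `q_net` modules and their sizes are placeholders, not RLlib code.

import torch
from torch import nn

# Toy stand-ins for the RLModule's policy and Q networks (names/sizes are made up).
obs_dim, act_dim = 4, 2
policy = nn.Linear(obs_dim, act_dim)
q_net = nn.Linear(obs_dim + act_dim, 1)

obs = torch.randn(8, obs_dim)
# Reparameterized (differentiable) actions, analogous to `action_dist_curr.rsample()`.
actions = torch.tanh(policy(obs))

# Freeze the Q-net's parameters for this one forward pass.
for p in q_net.parameters():
    p.requires_grad_(False)
q_curr = q_net(torch.cat([obs, actions], dim=-1))
for p in q_net.parameters():
    p.requires_grad_(True)

# Actor objective: maximize Q under the current policy (so minimize -Q).
actor_loss = -q_curr.mean()
actor_loss.backward()

# Gradients reached the policy, but not the temporarily frozen Q-net.
assert all(p.grad is not None for p in policy.parameters())
assert all(p.grad is None for p in q_net.parameters())

Flipping `requires_grad` back on after the forward pass is safe here: autograd recorded the Q-net parameters as constants when the graph was built, so the later `backward()` leaves their `.grad` untouched.
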
diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 47c4d9aa7a5b..99bb11de6e18 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -52,6 +52,7 @@ NUM_ENV_STEPS_SAMPLED_LIFETIME, NUM_ENV_STEPS_TRAINED, NUM_MODULE_STEPS_TRAINED, + LEARNER_CONNECTOR_TIMER, ) from ray.rllib.utils.metrics.metrics_logger import MetricsLogger from ray.rllib.utils.minibatch_utils import ( @@ -1252,22 +1253,23 @@ def _update_from_batch_or_episodes( # Call the learner connector. if self._learner_connector is not None and episodes is not None: # Call the learner connector pipeline. - shared_data = {} - batch = self._learner_connector( - rl_module=self.module, - data=batch if batch is not None else {}, - episodes=episodes, - shared_data=shared_data, - ) - # Convert to a batch. - # TODO (sven): Try to not require MultiAgentBatch anymore. - batch = MultiAgentBatch( - { - module_id: SampleBatch(module_data) - for module_id, module_data in batch.items() - }, - env_steps=sum(len(e) for e in episodes), - ) + with self.metrics.log_time((ALL_MODULES, LEARNER_CONNECTOR_TIMER)): + shared_data = {} + batch = self._learner_connector( + rl_module=self.module, + data=batch if batch is not None else {}, + episodes=episodes, + shared_data=shared_data, + ) + # Convert to a batch. + # TODO (sven): Try to not require MultiAgentBatch anymore. + batch = MultiAgentBatch( + { + module_id: SampleBatch(module_data) + for module_id, module_data in batch.items() + }, + env_steps=sum(len(e) for e in episodes), + ) # Have to convert to MultiAgentBatch. elif isinstance(batch, SampleBatch): assert len(self.module) == 1 diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 525e15a081c0..fdfcdf1f3938 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -490,6 +490,7 @@ def _learner_update( partial( _learner_update, _episodes_shard=episodes_shard, + _timesteps=timesteps, _min_total_mini_batches=min_total_mini_batches, ) for episodes_shard in episodes @@ -529,6 +530,7 @@ def _learner_update( partial( _learner_update, _episodes_shard=eps_shard, + _timesteps=timesteps, _min_total_mini_batches=min_total_mini_batches, ) for eps_shard in eps_shards diff --git a/rllib/core/learner/tests/test_learner_group.py b/rllib/core/learner/tests/test_learner_group.py index 430c26c11b2b..ca51dffd7859 100644 --- a/rllib/core/learner/tests/test_learner_group.py +++ b/rllib/core/learner/tests/test_learner_group.py @@ -57,13 +57,8 @@ def local_training_helper(self, fw, scaling_mode) -> None: import torch torch.manual_seed(0) - elif fw == "tf2": - import tensorflow as tf - - # this is done by rllib already inside of the policy class, but we need to - # do it here for testing purposes - tf.compat.v1.enable_eager_execution() - tf.random.set_seed(0) + else: + raise NotImplementedError env = gym.make("CartPole-v1") @@ -215,7 +210,7 @@ def test_learner_group_build_from_algorithm_config(self): learner_group.shutdown() # def test_learner_group_local(self): - # fws = ["torch", "tf2"] + # fws = ["torch"] # test_iterator = itertools.product(fws, LOCAL_CONFIGS) @@ -231,7 +226,7 @@ def test_learner_group_build_from_algorithm_config(self): def test_update_multi_gpu(self): return - fws = ["torch", "tf2"] + fws = ["torch"] scaling_modes = ["multi-gpu-ddp", "remote-gpu"] test_iterator = itertools.product(fws, scaling_modes) @@ -273,8 +268,8 @@ def test_update_multi_gpu(self): del learner_group def test_add_module_and_remove_module(self): - fws = 
["torch", "tf2"] - scaling_modes = ["local-cpu", "multi-gpu-ddp"] + fws = ["torch"] + scaling_modes = ["local-cpu", "multi-cpu-ddp"] test_iterator = itertools.product(fws, scaling_modes) for fw, scaling_mode in test_iterator: @@ -342,7 +337,7 @@ def tearDownClass(cls) -> None: def test_restore_from_path_multi_rl_module_and_individual_modules(self): """Tests whether MultiRLModule- and single RLModule states can be restored.""" - fws = ["torch", "tf2"] + fws = ["torch"] # this is expanded to more scaling modes on the release ci. scaling_modes = ["local-cpu", "multi-gpu-ddp"] @@ -450,7 +445,7 @@ def tearDownClass(cls) -> None: def test_save_to_path_and_restore_from_path(self): """Check that saving and loading learner group state works.""" - fws = ["torch", "tf2"] + fws = ["torch"] # this is expanded to more scaling modes on the release ci. scaling_modes = ["local-cpu", "multi-gpu-ddp"] test_iterator = itertools.product(fws, scaling_modes) @@ -542,7 +537,7 @@ def tearDown(cls) -> None: def test_async_update(self): """Test that async style updates converge to the same result as sync.""" - fws = ["torch", "tf2"] + fws = ["torch"] # async_update only needs to be tested for the most complex case. # so we'll only test it for multi-gpu-ddp. scaling_modes = ["multi-gpu-ddp", "remote-gpu"] diff --git a/rllib/tuned_examples/dqn/cartpole_dqn.py b/rllib/tuned_examples/dqn/cartpole_dqn.py index 6306e4bbab19..1955ea7764d0 100644 --- a/rllib/tuned_examples/dqn/cartpole_dqn.py +++ b/rllib/tuned_examples/dqn/cartpole_dqn.py @@ -1,13 +1,10 @@ from ray.rllib.algorithms.dqn import DQNConfig from ray.rllib.utils.test_utils import add_rllib_example_script_args -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) -parser = add_rllib_example_script_args() +parser = add_rllib_example_script_args( + default_reward=450.0, + default_timesteps=200000, +) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. @@ -15,20 +12,13 @@ config = ( DQNConfig() - .environment(env="CartPole-v1") - .rl_module( - # Settings identical to old stack. - model_config_dict={ - "fcnet_hiddens": [256], - "fcnet_activation": "tanh", - "epsilon": [(0, 1.0), (10000, 0.02)], - "fcnet_bias_initializer": "zeros_", - "post_fcnet_bias_initializer": "zeros_", - "post_fcnet_hiddens": [256], - }, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .environment(env="CartPole-v1") .training( - # Settings identical to old stack. + lr=0.0005 * (args.num_gpus or 1) ** 0.5, train_batch_size_per_learner=32, replay_buffer_config={ "type": "PrioritizedEpisodeReplayBuffer", @@ -36,35 +26,27 @@ "alpha": 0.6, "beta": 0.4, }, - n_step=3, + n_step=(2, 5), double_q=True, num_atoms=1, noisy=False, dueling=True, ) - .evaluation( - evaluation_interval=1, - evaluation_parallel_to_training=True, - evaluation_num_env_runners=1, - evaluation_duration="auto", - evaluation_config={ - "explore": False, - # TODO (sven): Add support for window=float(inf) and reduce=mean for - # evaluation episode_return_mean reductions (identical to old stack - # behavior, which does NOT use a window (100 by default) to reduce - # eval episode returns. - "metrics_num_episodes_for_smoothing": 4, + .rl_module( + # Settings identical to old stack. 
+ model_config_dict={ + "fcnet_hiddens": [256], + "fcnet_activation": "tanh", + "epsilon": [(0, 1.0), (10000, 0.02)], + "fcnet_bias_initializer": "zeros_", + "post_fcnet_bias_initializer": "zeros_", + "post_fcnet_hiddens": [256], }, ) ) -stop = { - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 500.0, - NUM_ENV_STEPS_SAMPLED_LIFETIME: 100000, -} - if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py b/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py index 5a6f763b94f2..94aac4c2c8f0 100644 --- a/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py +++ b/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py @@ -9,12 +9,17 @@ from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args() -parser.set_defaults(num_agents=2) +parser = add_rllib_example_script_args( + default_timesteps=500000, +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values to set up `config` below. args = parser.parse_args() -parser.set_defaults(num_agents=2) + register_env( "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": args.num_agents}), @@ -22,9 +27,13 @@ config = ( DQNConfig() + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment(env="multi_agent_cartpole") .training( - # Settings identical to old stack. + lr=0.0005 * (args.num_gpus or 1) ** 0.5, train_batch_size_per_learner=32, replay_buffer_config={ "type": "MultiAgentPrioritizedEpisodeReplayBuffer", @@ -32,7 +41,7 @@ "alpha": 0.6, "beta": 0.4, }, - n_step=3, + n_step=(2, 5), double_q=True, num_atoms=1, noisy=False, @@ -57,19 +66,17 @@ ) stop = { - NUM_ENV_STEPS_SAMPLED_LIFETIME: 500000, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, # `episode_return_mean` is the sum of all agents/policies' returns. f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 250.0 * args.num_agents, } if __name__ == "__main__": + + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + assert ( args.num_agents > 0 ), "The `--num-agents` arg must be > 0 for this script to work." - assert ( - args.enable_new_api_stack - ), "The `--enable-new-api-stack` arg must be activated for this script to work." - - from ray.rllib.utils.test_utils import run_rllib_example_script_experiment run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml b/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml deleted file mode 100644 index e31b4aaa6669..000000000000 --- a/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# @OldAPIStack -cartpole-sac: - env: CartPoleContinuousBulletEnv-v0 - run: SAC - stop: - env_runners/episode_return_mean: 40 - timesteps_total: 100000 - config: - # Works for both torch and tf. 
- framework: torch - gamma: 0.95 - n_step: 3 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - num_steps_sampled_before_learning_starts: 256 - initial_alpha: 0.2 - clip_actions: false - min_sample_timesteps_per_iteration: 1000 - optimization: - actor_learning_rate: 0.005 - critic_learning_rate: 0.005 - entropy_learning_rate: 0.0001 diff --git a/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml b/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml deleted file mode 100644 index f5307df86b15..000000000000 --- a/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# @OldAPIStack -halfcheetah-pybullet-sac: - env: HalfCheetahBulletEnv-v0 - run: SAC - stop: - env_runners/episode_return_mean: 800.0 - config: - # Works for both torch and tf. - framework: torch - q_model_config: - fcnet_activation: relu - fcnet_hiddens: [256, 256] - policy_model_config: - fcnet_activation: relu - fcnet_hiddens: [256, 256] - tau: 0.005 - target_entropy: auto - n_step: 3 - rollout_fragment_length: 1 - train_batch_size: 256 - target_network_update_freq: 1 - min_sample_timesteps_per_iteration: 1000 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - num_steps_sampled_before_learning_starts: 10000 - optimization: - actor_learning_rate: 0.0003 - critic_learning_rate: 0.0003 - entropy_learning_rate: 0.0003 - num_env_runners: 0 - num_gpus: 1 - metrics_num_episodes_for_smoothing: 5 - diff --git a/rllib/tuned_examples/sac/halfcheetah-sac.yaml b/rllib/tuned_examples/sac/halfcheetah-sac.yaml deleted file mode 100644 index c1804440a0d3..000000000000 --- a/rllib/tuned_examples/sac/halfcheetah-sac.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# @OldAPIStack -# Our implementation of SAC can reach 9k reward in 400k timesteps -halfcheetah_sac: - env: HalfCheetah-v3 - run: SAC - stop: - env_runners/episode_return_mean: 9000 - config: - # Works for both torch and tf. - framework: torch - q_model_config: - fcnet_activation: relu - fcnet_hiddens: [256, 256] - policy_model_config: - fcnet_activation: relu - fcnet_hiddens: [256, 256] - tau: 0.005 - target_entropy: auto - n_step: 1 - rollout_fragment_length: 1 - train_batch_size: 256 - target_network_update_freq: 1 - min_sample_timesteps_per_iteration: 1000 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - num_steps_sampled_before_learning_starts: 10000 - optimization: - actor_learning_rate: 0.0003 - critic_learning_rate: 0.0003 - entropy_learning_rate: 0.0003 - num_env_runners: 0 - num_gpus: 0 - clip_actions: false - normalize_actions: true - evaluation_interval: 1 - metrics_num_episodes_for_smoothing: 5 - diff --git a/rllib/tuned_examples/sac/halfcheetah_sac.py b/rllib/tuned_examples/sac/halfcheetah_sac.py new file mode 100644 index 000000000000..5a3bb9b3a7a4 --- /dev/null +++ b/rllib/tuned_examples/sac/halfcheetah_sac.py @@ -0,0 +1,62 @@ +from torch import nn + +from ray.rllib.algorithms.sac.sac import SACConfig +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +parser = add_rllib_example_script_args( + default_timesteps=1000000, + default_reward=12000.0, + default_iters=2000, +) +parser.set_defaults(enable_new_api_stack=True) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. 
+args = parser.parse_args() + +config = ( + SACConfig() + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment("HalfCheetah-v4") + .training( + initial_alpha=1.001, + # lr=0.0006 is very high, w/ 4 GPUs -> 0.0012 + # Might want to lower it for better stability, but it does learn well. + lr=0.0004 * (args.num_gpus or 1) ** 0.5, + target_entropy="auto", + n_step=(1, 5), # 1? + tau=0.005, + train_batch_size_per_learner=256, + target_network_update_freq=1, + replay_buffer_config={ + "type": "PrioritizedEpisodeReplayBuffer", + "capacity": 100000, + "alpha": 0.6, + "beta": 0.4, + }, + num_steps_sampled_before_learning_starts=10000, + ) + .rl_module( + model_config_dict={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "relu", + "fcnet_weights_initializer": nn.init.xavier_uniform_, + "post_fcnet_hiddens": [], + "post_fcnet_activation": None, + "post_fcnet_weights_initializer": "orthogonal_", + "post_fcnet_weights_initializer_config": {"gain": 0.01}, + } + ) + .reporting( + metrics_num_episodes_for_smoothing=5, + min_sample_timesteps_per_iteration=1000, + ) +) + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args) diff --git a/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py index 59c1fa5bbf36..de3669c9bb29 100644 --- a/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py +++ b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py @@ -1,6 +1,7 @@ +from torch import nn + from ray.rllib.algorithms.sac import SACConfig from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum -from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, @@ -9,10 +10,14 @@ from ray.rllib.utils.test_utils import add_rllib_example_script_args from ray.tune.registry import register_env -torch, nn = try_import_torch() -parser = add_rllib_example_script_args() -parser.set_defaults(num_agents=2) +parser = add_rllib_example_script_args( + default_timesteps=500000, +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values to set up `config` below. 
args = parser.parse_args() @@ -24,12 +29,16 @@ config = ( SACConfig() - .environment(env="multi_agent_pendulum") + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment("multi_agent_pendulum") .training( initial_alpha=1.001, - lr=8e-4, + lr=0.001 * ((args.num_gpus or 1) ** 0.5), target_entropy="auto", - n_step=1, + n_step=(2, 5), tau=0.005, train_batch_size_per_learner=256, target_network_update_freq=1, @@ -44,17 +53,16 @@ .rl_module( model_config_dict={ "fcnet_hiddens": [256, 256], - "fcnet_activation": "tanh", + "fcnet_activation": "relu", "fcnet_weights_initializer": nn.init.xavier_uniform_, - # "post_fcnet_hiddens": [], - # "post_fcnet_activation": None, - # "post_fcnet_weights_initializer": nn.init.orthogonal_, - # "post_fcnet_weights_initializer_config": {"gain": 0.01}, + "post_fcnet_hiddens": [], + "post_fcnet_activation": None, + "post_fcnet_weights_initializer": nn.init.orthogonal_, + "post_fcnet_weights_initializer_config": {"gain": 0.01}, } ) .reporting( metrics_num_episodes_for_smoothing=5, - min_sample_timesteps_per_iteration=1000, ) ) @@ -65,7 +73,7 @@ ) stop = { - NUM_ENV_STEPS_SAMPLED_LIFETIME: 500000, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, # `episode_return_mean` is the sum of all agents/policies' returns. f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -400.0 * args.num_agents, } @@ -74,10 +82,6 @@ assert ( args.num_agents > 0 ), "The `--num-agents` arg must be > 0 for this script to work." - assert ( - args.enable_new_api_stack - ), "The `--enable-new-api-stack` arg must be activated for this script to work." - from ray.rllib.utils.test_utils import run_rllib_example_script_experiment run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/sac/pendulum_sac.py b/rllib/tuned_examples/sac/pendulum_sac.py index 5be9332f995e..69746c4478e9 100644 --- a/rllib/tuned_examples/sac/pendulum_sac.py +++ b/rllib/tuned_examples/sac/pendulum_sac.py @@ -1,3 +1,5 @@ +from torch import nn + from ray.rllib.algorithms.sac.sac import SACConfig from ray.rllib.utils.test_utils import add_rllib_example_script_args @@ -5,6 +7,7 @@ default_timesteps=20000, default_reward=-250.0, ) +parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values to set up `config` below. 
args = parser.parse_args() @@ -15,12 +18,12 @@ enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True, ) - .environment(env="Pendulum-v1") + .environment("Pendulum-v1") .training( initial_alpha=1.001, - lr=3e-4, + lr=0.001 * (args.num_gpus or 1) ** 0.5, target_entropy="auto", - n_step=1, + n_step=(2, 5), tau=0.005, train_batch_size_per_learner=256, target_network_update_freq=1, @@ -30,12 +33,13 @@ "alpha": 1.0, "beta": 0.0, }, - num_steps_sampled_before_learning_starts=256, + num_steps_sampled_before_learning_starts=256 * (args.num_gpus or 1), ) .rl_module( model_config_dict={ "fcnet_hiddens": [256, 256], "fcnet_activation": "relu", + "fcnet_weights_initializer": nn.init.xavier_uniform_, "post_fcnet_hiddens": [], "post_fcnet_activation": None, "post_fcnet_weights_initializer": "orthogonal_", diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 33d9c592af42..2cbac3f12a5e 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -95,10 +95,12 @@ SAMPLE_TIMER = "sample" # @OldAPIStack ENV_RUNNER_SAMPLING_TIMER = "env_runner_sampling_timer" OFFLINE_SAMPLING_TIMER = "offline_sampling_timer" +REPLAY_BUFFER_ADD_DATA_TIMER = "replay_buffer_add_data_timer" REPLAY_BUFFER_SAMPLE_TIMER = "replay_buffer_sampling_timer" REPLAY_BUFFER_UPDATE_PRIOS_TIMER = "replay_buffer_update_prios_timer" LEARNER_UPDATE_TIMER = "learner_update_timer" LEARN_ON_BATCH_TIMER = "learn" # @OldAPIStack +LEARNER_CONNECTOR_TIMER = "learner_connector_timer" LOAD_BATCH_TIMER = "load" TARGET_NET_UPDATE_TIMER = "target_net_update" diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index 8c576234e436..c3a75589bc19 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -122,7 +122,6 @@ def get_len(b): ) else: - # n_steps = self._minibatch_size def get_len(b): return len(b)
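
As a closing note, a small sketch of the batch-size bookkeeping this PR switches to. The first helper is an assumption about the intended semantics of `total_train_batch_size` (per-Learner batch size times number of Learners), not RLlib's actual property implementation; the second merely restates the `lr=... * (args.num_gpus or 1) ** 0.5` scaling used in the tuned examples above.

def total_train_batch_size(train_batch_size_per_learner: int, num_learners: int) -> int:
    # Assumed relation: each Learner consumes its own per-Learner batch, so the
    # data trained on per iteration grows linearly with the number of Learners.
    return train_batch_size_per_learner * max(num_learners, 1)


def scaled_lr(base_lr: float, num_gpus: int) -> float:
    # Mirrors the tuned examples: lr = base_lr * (num_gpus or 1) ** 0.5.
    return base_lr * (num_gpus or 1) ** 0.5


# E.g., SAC's default of 256 per Learner on a 4-GPU (4-Learner) setup:
assert total_train_batch_size(256, 4) == 1024
assert abs(scaled_lr(0.0005, 4) - 0.001) < 1e-12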