diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index 039ebdae0c5a..0a776648322d 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -26,7 +26,7 @@ as well as multi-GPU training on multi-node (GPU) clusters when using the `Anysc +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ | :ref:`DQN/Rainbow (Deep Q Networks) ` | |single_agent| |multi_agent| | |multi_gpu| |multi_node_multi_gpu| | |discr_actions| | +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ -| :ref:`SAC (Soft Actor Critic) ` | |single_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| | +| :ref:`SAC (Soft Actor Critic) ` | |single_agent| |multi_agent| | |multi_gpu| |multi_node_multi_gpu| | |cont_actions| | +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ | **High-throughput on- and off policy** | +-----------------------------------------------------------------------------+------------------------------+------------------------------------+--------------------------------+ diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 26165cb40852..a91f901a6c53 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -2790,6 +2790,42 @@ cluster: cluster_compute: 8gpus_96cpus_gce.yaml + +# -------------------------- +# SAC +# -------------------------- +- name: rllib_learning_tests_halfcheetah_sac_torch + group: RLlib tests + working_dir: rllib_tests + + stable: true + + frequency: nightly + team: rllib + cluster: + byod: + type: gpu + post_build_script: byod_rllib_test.sh + runtime_env: + - RLLIB_TEST_NO_JAX_IMPORT=1 + - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin + cluster_compute: 4gpus_64cpus.yaml + + run: + timeout: 7200 + script: python learning_tests/tuned_examples/sac/halfcheetah_sac.py --enable-new-api-stack --num-gpus=4 --num-env-runners=8 --stop-reward=1000.0 --as-release-test + + alert: default + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: 4gpus_64cpus_gce.yaml + + ######################## # Core Nightly Tests ######################## diff --git a/rllib/BUILD b/rllib/BUILD index 6b8a48f36e3e..c74e9dfb4c48 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -349,6 +349,30 @@ py_test( srcs = ["tuned_examples/dqn/cartpole_dqn.py"], args = ["--as-test", "--enable-new-api-stack"] ) +py_test( + name = "learning_tests_cartpole_dqn_gpu", + main = "tuned_examples/dqn/cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], + size = "large", + srcs = ["tuned_examples/dqn/cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"] +) +py_test( + name = "learning_tests_cartpole_dqn_multi_cpu", + main = "tuned_examples/dqn/cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], + size = "large", + srcs = ["tuned_examples/dqn/cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"] 
+) +py_test( + name = "learning_tests_cartpole_dqn_multi_gpu", + main = "tuned_examples/dqn/cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], + size = "large", + srcs = ["tuned_examples/dqn/cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"] +) # MultiAgentCartPole py_test( name = "learning_tests_multi_agent_cartpole_dqn", @@ -358,16 +382,29 @@ py_test( srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"], args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4"] ) - -#@OldAPIStack py_test( - name = "learning_tests_cartpole_dqn_softq_old_api_stack", - main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete"], - size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer - srcs = ["tests/run_regression_tests.py"], - data = ["tuned_examples/dqn/cartpole-dqn-softq.yaml"], - args = ["--dir=tuned_examples/dqn"] + name = "learning_tests_multi_agent_cartpole_dqn_gpu", + main = "tuned_examples/dqn/multi_agent_cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], + size = "large", + srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4", "--num-gpus=1"] +) +py_test( + name = "learning_tests_multi_agent_cartpole_dqn_multi_cpu", + main = "tuned_examples/dqn/multi_agent_cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], + size = "large", + srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=5", "--num-gpus=2"] +) +py_test( + name = "learning_tests_multi_agent_cartpole_dqn_multi_gpu", + main = "tuned_examples/dqn/multi_agent_cartpole_dqn.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], + size = "large", + srcs = ["tuned_examples/dqn/multi_agent_cartpole_dqn.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4", "--num-gpus=2"] ) # IMPALA @@ -669,7 +706,31 @@ py_test( srcs = ["tuned_examples/sac/pendulum_sac.py"], args = ["--as-test", "--enable-new-api-stack"] ) - +py_test( + name = "learning_tests_pendulum_sac_gpu", + main = "tuned_examples/sac/pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous", "gpu"], + size = "large", + srcs = ["tuned_examples/sac/pendulum_sac.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"] +) +py_test( + name = "learning_tests_pendulum_sac_multi_cpu", + main = "tuned_examples/sac/pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous"], + size = "large", + srcs = ["tuned_examples/sac/pendulum_sac.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"] +) +py_test( + name = "learning_tests_pendulum_sac_multi_gpu", + main = "tuned_examples/sac/pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous", "multi_gpu"], + size = "large", + srcs = 
["tuned_examples/sac/pendulum_sac.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"] +) +# MultiAgentPendulum py_test( name = "learning_tests_multi_agent_pendulum_sac", main = "tuned_examples/sac/multi_agent_pendulum_sac.py", @@ -678,7 +739,22 @@ py_test( srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"], args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4"] ) - +py_test( + name = "learning_tests_multi_agent_pendulum_sac_gpu", + main = "tuned_examples/sac/multi_agent_pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous", "gpu"], + size = "large", + srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"], + args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-cpus=4", "--num-gpus=1"] +) +py_test( + name = "learning_tests_multi_agent_pendulum_sac_multi_cpu", + main = "tuned_examples/sac/multi_agent_pendulum_sac.py", + tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_continuous"], + size = "large", + srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--num-gpus=2"] +) py_test( name = "learning_tests_multi_agent_pendulum_sac_multi_gpu", main = "tuned_examples/sac/multi_agent_pendulum_sac.py", @@ -3240,7 +3316,7 @@ py_test( name = "examples/rl_modules/custom_lstm_rl_module", main = "examples/rl_modules/custom_lstm_rl_module.py", tags = ["team:rllib", "examples"], - size = "medium", + size = "large", srcs = ["examples/rl_modules/custom_lstm_rl_module.py"], args = ["--as-test", "--enable-new-api-stack"], ) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 39379d206839..95bb0ba23baf 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -109,9 +109,7 @@ ENV_RUNNER_RESULTS, ENV_RUNNER_SAMPLING_TIMER, EPISODE_LEN_MEAN, - EPISODE_RETURN_MAX, EPISODE_RETURN_MEAN, - EPISODE_RETURN_MIN, EVALUATION_ITERATION_TIMER, EVALUATION_RESULTS, FAULT_TOLERANCE_STATS, @@ -1701,7 +1699,7 @@ def training_step(self) -> ResultDict: if self.config.count_steps_by == "agent_steps": train_batch, env_runner_results = synchronous_parallel_sample( worker_set=self.env_runner_group, - max_agent_steps=self.config.train_batch_size, + max_agent_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, _uses_new_env_runners=( self.config.enable_env_runner_and_connector_v2 @@ -1711,7 +1709,7 @@ def training_step(self) -> ResultDict: else: train_batch, env_runner_results = synchronous_parallel_sample( worker_set=self.env_runner_group, - max_env_steps=self.config.train_batch_size, + max_env_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, _uses_new_env_runners=( self.config.enable_env_runner_and_connector_v2 @@ -3846,21 +3844,23 @@ def _compile_iteration_results_new_api_stack( # Return dict (shallow copy of `train_results`). results: ResultDict = train_results.copy() - # TODO (sven): Fix Tune, instead, to be tolerant against possibly missing result - # keys. Otherwise, we'll have to guess here, what "popular" keys users use in - # order to protect them from running into Tune KeyErrors. 
- if ENV_RUNNER_RESULTS not in results: - results[ENV_RUNNER_RESULTS] = {} - for must_have in [ - EPISODE_RETURN_MEAN, - EPISODE_RETURN_MIN, - EPISODE_RETURN_MAX, - ]: - if must_have not in results[ENV_RUNNER_RESULTS]: - results[ENV_RUNNER_RESULTS][must_have] = np.nan + # Collect old-API-stack-style `self._timers` results. + for k, timer in self._timers.items(): + if TIMERS not in results: + results[TIMERS] = {} + results[TIMERS]["{}_time_sec".format(k)] = timer.mean + if timer.has_units_processed(): + results[TIMERS]["{}_throughput".format(k)] = round( + timer.mean_throughput, 3 + ) # Evaluation results. if eval_results: + assert ( + isinstance(eval_results, dict) + and len(eval_results) == 1 + and EVALUATION_RESULTS in eval_results + ) results.update(eval_results) # Fault tolerance stats. results[FAULT_TOLERANCE_STATS] = { diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index b516b5a8f746..63e9aafd71ef 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -376,9 +376,9 @@ def __init__(self, algo_class: Optional[type] = None): self.lr = 0.001 self.grad_clip = None self.grad_clip_by = "global_norm" - self.train_batch_size = 32 # Simple logic for now: If None, use `train_batch_size`. self.train_batch_size_per_learner = None + self.train_batch_size = 32 # @OldAPIStack # TODO (sven): Unsolved problem with RLModules sometimes requiring settings from # the main AlgorithmConfig. We should not require the user to provide those # settings in both, the AlgorithmConfig (as property) AND the model config diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 25f43e9a020e..2f2e21b18db5 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -58,6 +58,7 @@ NUM_MODULE_STEPS_TRAINED, NUM_MODULE_STEPS_TRAINED_LIFETIME, NUM_TARGET_UPDATES, + REPLAY_BUFFER_ADD_DATA_TIMER, REPLAY_BUFFER_SAMPLE_TIMER, REPLAY_BUFFER_UPDATE_PRIOS_TIMER, SAMPLE_TIMER, @@ -556,7 +557,7 @@ def calculate_rr_weights(config: AlgorithmConfig) -> List[float]: # This is to set freshly rollout-collected data in relation to # the data we pull from the replay buffer (which also contains old # samples). - native_ratio = config.train_batch_size / ( + native_ratio = config.total_train_batch_size / ( config.get_rollout_fragment_length() * config.num_envs_per_env_runner # Add one to workers because the local @@ -628,13 +629,15 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: _uses_new_env_runners=True, _return_metrics=True, ) - # Add the sampled experiences to the replay buffer. - self.local_replay_buffer.add(episodes) # Reduce EnvRunner metrics over the n EnvRunners. self.metrics.merge_and_log_n_dicts( env_runner_results, key=ENV_RUNNER_RESULTS ) + # Add the sampled experiences to the replay buffer. + with self.metrics.log_time((TIMERS, REPLAY_BUFFER_ADD_DATA_TIMER)): + self.local_replay_buffer.add(episodes) + self.metrics.log_dict( self.metrics.peek( (ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED), default={} @@ -684,7 +687,7 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: # Sample a list of episodes used for learning from the replay buffer. 
with self.metrics.log_time((TIMERS, REPLAY_BUFFER_SAMPLE_TIMER)): episodes = self.local_replay_buffer.sample( - num_items=self.config.train_batch_size, + num_items=self.config.total_train_batch_size, n_step=self.config.n_step, gamma=self.config.gamma, beta=self.config.replay_buffer_config.get("beta"), @@ -707,14 +710,16 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: # disk or WandB, they might be very large). td_errors = defaultdict(list) for res in learner_results: - for mid, m_res in res.items(): - if TD_ERROR_KEY in m_res: - td_errors[mid].extend( - convert_to_numpy(m_res.pop(TD_ERROR_KEY).peek()) + for module_id, module_results in res.items(): + if TD_ERROR_KEY in module_results: + td_errors[module_id].extend( + convert_to_numpy( + module_results.pop(TD_ERROR_KEY).peek() + ) ) td_errors = { - mid: {TD_ERROR_KEY: np.concatenate(s, axis=0)} - for mid, s in td_errors.items() + module_id: {TD_ERROR_KEY: np.concatenate(s, axis=0)} + for module_id, s in td_errors.items() } self.metrics.merge_and_log_n_dicts( learner_results, key=LEARNER_RESULTS @@ -812,7 +817,7 @@ def _training_step_old_and_hybrid_api_stack(self) -> ResultDict: # Sample training batch (MultiAgentBatch) from replay buffer. train_batch = sample_min_n_steps_from_buffer( self.local_replay_buffer, - self.config.train_batch_size, + self.config.total_train_batch_size, count_by_agent_steps=self.config.count_steps_by == "agent_steps", ) diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 2f3bc8d11489..f4fc852f3586 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -100,7 +100,8 @@ def __init__(self, algo_class=None): } # .training() - self.train_batch_size = 256 + self.train_batch_size_per_learner = 256 + self.train_batch_size = 256 # @OldAPIstack # Number of timesteps to collect from rollout workers before we start # sampling from replay buffers for learning. Whether we count this in agent # steps or environment steps depends on config.multi_agent(count_steps_by=..). diff --git a/rllib/algorithms/sac/sac_learner.py b/rllib/algorithms/sac/sac_learner.py index bcb18a25ae56..58703174742a 100644 --- a/rllib/algorithms/sac/sac_learner.py +++ b/rllib/algorithms/sac/sac_learner.py @@ -31,7 +31,11 @@ def build(self) -> None: self.curr_log_alpha: Dict[ModuleID, TensorType] = LambdaDefaultDict( lambda module_id: self._get_tensor_variable( # Note, we want to train the temperature parameter. - [np.log(self.config.get_config_for_module(module_id).initial_alpha)], + [ + np.log( + self.config.get_config_for_module(module_id).initial_alpha + ).astype(np.float32) + ], trainable=True, ) ) diff --git a/rllib/algorithms/sac/torch/sac_torch_learner.py b/rllib/algorithms/sac/torch/sac_torch_learner.py index 52e9b9ec8dda..f87d46f2e4b7 100644 --- a/rllib/algorithms/sac/torch/sac_torch_learner.py +++ b/rllib/algorithms/sac/torch/sac_torch_learner.py @@ -35,7 +35,7 @@ class SACTorchLearner(DQNRainbowTorchLearner, SACLearner): This ' Learner' class implements the loss in its `self.compute_loss_for_module()` method. In addition, it updates - target networks in its inherited method `_update_module_target_networks`. + the target networks of the RLModule(s). """ # TODO (simon): Set different learning rates for optimizers. @@ -109,73 +109,20 @@ def compute_loss_for_module( batch: Dict[str, Any], fwd_out: Dict[str, TensorType] ) -> TensorType: - # Only for debugging. - deterministic = config._deterministic_loss - # Receive the current alpha hyperparameter. 
alpha = torch.exp(self.curr_log_alpha[module_id]) - module = self.module[module_id].unwrapped() - - # Get the train action distribution for the current policy and current state. - # This is needed for the policy (actor) loss in SAC. - action_dist_class = module.get_train_action_dist_cls() - action_dist_curr = action_dist_class.from_logits( - fwd_out[Columns.ACTION_DIST_INPUTS] - ) - # Get the train action distribution for the current policy and next state. - # For the Q (critic) loss in SAC, we need to sample from the current policy at - # the next state. - action_dist_next = action_dist_class.from_logits( - fwd_out["action_dist_inputs_next"] - ) - - # Sample actions for the current state. Note that we need to apply the - # reparameterization trick here to avoid the expectation over actions. - actions_curr = ( - action_dist_curr.rsample() - if not deterministic - # If deterministic, we use the mean. - else action_dist_curr.to_deterministic().sample() - ) - # Compute the log probabilities for the current state (for the critic loss). - logps_curr = action_dist_curr.logp(actions_curr) - - # Sample actions for the next state. - actions_next = ( - action_dist_next.sample() - if not deterministic - # If deterministic, we use the mean. - else action_dist_next.to_deterministic().sample() - ) - # Compute the log probabilities for the next state. - logps_next = action_dist_next.logp(actions_next) - # Get Q-values for the actually selected actions during rollout. # In the critic loss we use these as predictions. q_selected = fwd_out[QF_PREDS] if config.twin_q: q_twin_selected = fwd_out[QF_TWIN_PREDS] - # Compute Q-values for the current policy in the current state with - # the sampled actions. - q_batch_curr = { - Columns.OBS: batch[Columns.OBS], - Columns.ACTIONS: actions_curr, - } - q_curr = module.compute_q_values(q_batch_curr) - - # Compute Q-values from the target Q network for the next state with the - # sampled actions for the next state. - q_batch_next = { - Columns.OBS: batch[Columns.NEXT_OBS], - Columns.ACTIONS: actions_next, - } - q_target_next = module.forward_target(q_batch_next) - # Compute value function for next state (see eq. (3) in Haarnoja et al. (2018)). # Note, we use here the sampled actions in the log probabilities. - q_target_next -= alpha * logps_next + q_target_next = ( + fwd_out["q_target_next"] - alpha.detach() * fwd_out["logp_next_resampled"] + ) # Now mask all Q-values with terminated next states in the targets. q_next_masked = (1.0 - batch[Columns.TERMINATEDS].float()) * q_target_next @@ -215,10 +162,16 @@ def compute_loss_for_module( ) # For the actor (policy) loss we need sampled actions from the current policy - # evaluated at the current state. + # evaluated at the current observations. + # Note that the `q_curr` tensor below has the q-net's gradients ignored, while + # having the policy's gradients registered. The policy net was used to rsample + # actions used to compute `q_curr` (by passing these actions through the q-net). + # Hence, we can't do `fwd_out[q_curr].detach()`! # Note further, we minimize here, while the original equation in Haarnoja et # al. (2018) considers maximization. - actor_loss = torch.mean(alpha.detach() * logps_curr - q_curr) + actor_loss = torch.mean( + alpha.detach() * fwd_out["logp_resampled"] - fwd_out["q_curr"] + ) # Optimize also the hyperparameter alpha by using the current policy # evaluated at the current state (sampled values). @@ -226,7 +179,7 @@ def compute_loss_for_module( # to optimize and monotonic function. 
Original equation uses alpha. alpha_loss = -torch.mean( self.curr_log_alpha[module_id] - * (logps_curr.detach() + self.target_entropy[module_id]) + * (fwd_out["logp_resampled"].detach() + self.target_entropy[module_id]) ) total_loss = actor_loss + critic_loss + alpha_loss @@ -254,11 +207,10 @@ def compute_loss_for_module( "alpha_value": alpha, "log_alpha_value": torch.log(alpha), "target_entropy": self.target_entropy[module_id], - "actions_curr_policy": torch.mean(actions_curr), - LOGPS_KEY: torch.mean(logps_curr), - QF_MEAN_KEY: torch.mean(q_curr), - QF_MAX_KEY: torch.max(q_curr), - QF_MIN_KEY: torch.min(q_curr), + LOGPS_KEY: torch.mean(fwd_out["logp_resampled"]), + QF_MEAN_KEY: torch.mean(fwd_out["q_curr"]), + QF_MAX_KEY: torch.max(fwd_out["q_curr"]), + QF_MIN_KEY: torch.min(fwd_out["q_curr"]), TD_ERROR_MEAN_KEY: torch.mean(td_error), }, key=module_id, @@ -294,11 +246,9 @@ def compute_gradients( retain_graph=True ) # Store the gradients for the component and module. - # TODO (simon): Check another time the graph for overlapping - # gradients. grads.update( { - pid: p.grad.clone() + pid: p.grad for pid, p in self.filter_param_dict_for_optimizer( self._params, optim ).items() diff --git a/rllib/algorithms/sac/torch/sac_torch_rl_module.py b/rllib/algorithms/sac/torch/sac_torch_rl_module.py index 61d920609203..957e6a9ebf32 100644 --- a/rllib/algorithms/sac/torch/sac_torch_rl_module.py +++ b/rllib/algorithms/sac/torch/sac_torch_rl_module.py @@ -6,11 +6,11 @@ QF_TWIN_PREDS, ) from ray.rllib.algorithms.sac.sac_rl_module import SACRLModule +from ray.rllib.core.columns import Columns from ray.rllib.core.models.base import ENCODER_OUT, Encoder, Model from ray.rllib.core.rl_module.apis.target_network_api import TargetNetworkAPI from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.typing import StateDict @@ -61,7 +61,7 @@ def _forward_inference(self, batch: Dict) -> Dict[str, Any]: pi_encoder_outs = self.pi_encoder(batch) # Pi head. - output[SampleBatch.ACTION_DIST_INPUTS] = self.pi(pi_encoder_outs[ENCODER_OUT]) + output[Columns.ACTION_DIST_INPUTS] = self.pi(pi_encoder_outs[ENCODER_OUT]) return output @@ -79,8 +79,8 @@ def _forward_train(self, batch: Dict) -> Dict[str, Any]: output = {} # SAC needs also Q function values and action logits for next observations. - batch_curr = {SampleBatch.OBS: batch[SampleBatch.OBS]} - batch_next = {SampleBatch.OBS: batch[SampleBatch.NEXT_OBS]} + batch_curr = {Columns.OBS: batch[Columns.OBS]} + batch_next = {Columns.OBS: batch[Columns.NEXT_OBS]} # Encoder forward passes. pi_encoder_outs = self.pi_encoder(batch_curr) @@ -89,7 +89,7 @@ def _forward_train(self, batch: Dict) -> Dict[str, Any]: pi_encoder_next_outs = self.pi_encoder(batch_next) # Q-network(s) forward passes. - batch_curr.update({SampleBatch.ACTIONS: batch[SampleBatch.ACTIONS]}) + batch_curr.update({Columns.ACTIONS: batch[Columns.ACTIONS]}) output[QF_PREDS] = self._qf_forward_train_helper( batch_curr, self.qf_encoder, self.qf ) # self._qf_forward_train(batch_curr)[QF_PREDS] @@ -103,9 +103,64 @@ def _forward_train(self, batch: Dict) -> Dict[str, Any]: action_logits = self.pi(pi_encoder_outs[ENCODER_OUT]) # Also get the action logits for the next observations. 
         action_logits_next = self.pi(pi_encoder_next_outs[ENCODER_OUT])
-        output[SampleBatch.ACTION_DIST_INPUTS] = action_logits
+        output[Columns.ACTION_DIST_INPUTS] = action_logits
         output[ACTION_DIST_INPUTS_NEXT] = action_logits_next

+        # Get the train action distribution for the current policy and current state.
+        # This is needed for the policy (actor) loss in SAC.
+        action_dist_class = self.get_train_action_dist_cls()
+        action_dist_curr = action_dist_class.from_logits(action_logits)
+        # Get the train action distribution for the current policy and next state.
+        # For the Q (critic) loss in SAC, we need to sample from the current policy at
+        # the next state.
+        action_dist_next = action_dist_class.from_logits(action_logits_next)
+
+        # Sample actions for the current state. Note that we need to apply the
+        # reparameterization trick (`rsample()` instead of `sample()`) to avoid the
+        # expectation over actions.
+        actions_resampled = action_dist_curr.rsample()
+        # Compute the log probabilities for the current state (for the critic loss).
+        output["logp_resampled"] = action_dist_curr.logp(actions_resampled)
+
+        # Sample actions for the next state.
+        actions_next_resampled = action_dist_next.sample().detach()
+        # Compute the log probabilities for the next state.
+        output["logp_next_resampled"] = (
+            action_dist_next.logp(actions_next_resampled)
+        ).detach()
+
+        # Compute Q-values for the current policy in the current state with
+        # the sampled actions.
+        q_batch_curr = {
+            Columns.OBS: batch[Columns.OBS],
+            Columns.ACTIONS: actions_resampled,
+        }
+        # Make sure we perform a "straight-through gradient" pass here,
+        # ignoring the gradients of the q-net, however, still recording
+        # the gradients of the policy net (which was used to rsample the actions used
+        # here). This is different from doing `.detach()` or `with torch.no_grad()`,
+        # as these two methods would fully block all gradient recordings, including
+        # the needed policy ones.
+        all_params = (
+            list(self.qf.parameters())
+            + list(self.qf_encoder.parameters())
+            + list(self.qf_twin.parameters())
+            + list(self.qf_twin_encoder.parameters())
+        )
+        for param in all_params:
+            param.requires_grad = False
+        output["q_curr"] = self.compute_q_values(q_batch_curr)
+        for param in all_params:
+            param.requires_grad = True
+
+        # Compute Q-values from the target Q network for the next state with the
+        # sampled actions for the next state.
+        q_batch_next = {
+            Columns.OBS: batch[Columns.NEXT_OBS],
+            Columns.ACTIONS: actions_next_resampled,
+        }
+        output["q_target_next"] = self.forward_target(q_batch_next).detach()
+
         # Return the network outputs.
         return output
@@ -149,7 +204,7 @@ def _qf_forward_train_helper(

         Args:
             batch: Dict containing a concatenated tensor with observations
-                and actions under the key `SampleBatch.OBS`.
+                and actions under the key `Columns.OBS`.
             encoder: An `Encoder` model for the Q state-action encoder.
             head: A `Model` for the Q head.

@@ -158,8 +213,8 @@ def _qf_forward_train_helper(
         """
         # Construct batch. Note, we need to feed observations and actions.
         qf_batch = {
-            SampleBatch.OBS: torch.concat(
-                (batch[SampleBatch.OBS], batch[SampleBatch.ACTIONS]), dim=-1
+            Columns.OBS: torch.concat(
+                (batch[Columns.OBS], batch[Columns.ACTIONS]), dim=-1
             )
         }
         # Encoder forward pass.
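
A minimal, self-contained PyTorch sketch of the `requires_grad` toggling used in `_forward_train()` above: the Q-net parameters are frozen only while evaluating Q(s, a~pi), so the actor loss later backpropagates into the policy network but not into the critic. The toy `policy` and `q_net` modules and their sizes are placeholders, not RLlib code.

import torch
from torch import nn

# Toy stand-ins for the RLModule's policy and Q networks (names/sizes are made up).
obs_dim, act_dim = 4, 2
policy = nn.Linear(obs_dim, act_dim)
q_net = nn.Linear(obs_dim + act_dim, 1)

obs = torch.randn(8, obs_dim)
# Reparameterized (differentiable) actions, analogous to `action_dist_curr.rsample()`.
actions = torch.tanh(policy(obs))

# Freeze the Q-net's parameters for this one forward pass.
for p in q_net.parameters():
    p.requires_grad_(False)
q_curr = q_net(torch.cat([obs, actions], dim=-1))
for p in q_net.parameters():
    p.requires_grad_(True)

# Actor objective: maximize Q under the current policy (so minimize -Q).
actor_loss = -q_curr.mean()
actor_loss.backward()

# Gradients reached the policy, but not the temporarily frozen Q-net.
assert all(p.grad is not None for p in policy.parameters())
assert all(p.grad is None for p in q_net.parameters())

Flipping `requires_grad` back on after the forward pass is safe here: autograd recorded the Q-net parameters as constants when the graph was built, so the later `backward()` leaves their `.grad` untouched.
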
diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 47c4d9aa7a5b..99bb11de6e18 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -52,6 +52,7 @@ NUM_ENV_STEPS_SAMPLED_LIFETIME, NUM_ENV_STEPS_TRAINED, NUM_MODULE_STEPS_TRAINED, + LEARNER_CONNECTOR_TIMER, ) from ray.rllib.utils.metrics.metrics_logger import MetricsLogger from ray.rllib.utils.minibatch_utils import ( @@ -1252,22 +1253,23 @@ def _update_from_batch_or_episodes( # Call the learner connector. if self._learner_connector is not None and episodes is not None: # Call the learner connector pipeline. - shared_data = {} - batch = self._learner_connector( - rl_module=self.module, - data=batch if batch is not None else {}, - episodes=episodes, - shared_data=shared_data, - ) - # Convert to a batch. - # TODO (sven): Try to not require MultiAgentBatch anymore. - batch = MultiAgentBatch( - { - module_id: SampleBatch(module_data) - for module_id, module_data in batch.items() - }, - env_steps=sum(len(e) for e in episodes), - ) + with self.metrics.log_time((ALL_MODULES, LEARNER_CONNECTOR_TIMER)): + shared_data = {} + batch = self._learner_connector( + rl_module=self.module, + data=batch if batch is not None else {}, + episodes=episodes, + shared_data=shared_data, + ) + # Convert to a batch. + # TODO (sven): Try to not require MultiAgentBatch anymore. + batch = MultiAgentBatch( + { + module_id: SampleBatch(module_data) + for module_id, module_data in batch.items() + }, + env_steps=sum(len(e) for e in episodes), + ) # Have to convert to MultiAgentBatch. elif isinstance(batch, SampleBatch): assert len(self.module) == 1 diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 525e15a081c0..fdfcdf1f3938 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -490,6 +490,7 @@ def _learner_update( partial( _learner_update, _episodes_shard=episodes_shard, + _timesteps=timesteps, _min_total_mini_batches=min_total_mini_batches, ) for episodes_shard in episodes @@ -529,6 +530,7 @@ def _learner_update( partial( _learner_update, _episodes_shard=eps_shard, + _timesteps=timesteps, _min_total_mini_batches=min_total_mini_batches, ) for eps_shard in eps_shards diff --git a/rllib/core/learner/tests/test_learner_group.py b/rllib/core/learner/tests/test_learner_group.py index 430c26c11b2b..ca51dffd7859 100644 --- a/rllib/core/learner/tests/test_learner_group.py +++ b/rllib/core/learner/tests/test_learner_group.py @@ -57,13 +57,8 @@ def local_training_helper(self, fw, scaling_mode) -> None: import torch torch.manual_seed(0) - elif fw == "tf2": - import tensorflow as tf - - # this is done by rllib already inside of the policy class, but we need to - # do it here for testing purposes - tf.compat.v1.enable_eager_execution() - tf.random.set_seed(0) + else: + raise NotImplementedError env = gym.make("CartPole-v1") @@ -215,7 +210,7 @@ def test_learner_group_build_from_algorithm_config(self): learner_group.shutdown() # def test_learner_group_local(self): - # fws = ["torch", "tf2"] + # fws = ["torch"] # test_iterator = itertools.product(fws, LOCAL_CONFIGS) @@ -231,7 +226,7 @@ def test_learner_group_build_from_algorithm_config(self): def test_update_multi_gpu(self): return - fws = ["torch", "tf2"] + fws = ["torch"] scaling_modes = ["multi-gpu-ddp", "remote-gpu"] test_iterator = itertools.product(fws, scaling_modes) @@ -273,8 +268,8 @@ def test_update_multi_gpu(self): del learner_group def test_add_module_and_remove_module(self): - fws = 
["torch", "tf2"] - scaling_modes = ["local-cpu", "multi-gpu-ddp"] + fws = ["torch"] + scaling_modes = ["local-cpu", "multi-cpu-ddp"] test_iterator = itertools.product(fws, scaling_modes) for fw, scaling_mode in test_iterator: @@ -342,7 +337,7 @@ def tearDownClass(cls) -> None: def test_restore_from_path_multi_rl_module_and_individual_modules(self): """Tests whether MultiRLModule- and single RLModule states can be restored.""" - fws = ["torch", "tf2"] + fws = ["torch"] # this is expanded to more scaling modes on the release ci. scaling_modes = ["local-cpu", "multi-gpu-ddp"] @@ -450,7 +445,7 @@ def tearDownClass(cls) -> None: def test_save_to_path_and_restore_from_path(self): """Check that saving and loading learner group state works.""" - fws = ["torch", "tf2"] + fws = ["torch"] # this is expanded to more scaling modes on the release ci. scaling_modes = ["local-cpu", "multi-gpu-ddp"] test_iterator = itertools.product(fws, scaling_modes) @@ -542,7 +537,7 @@ def tearDown(cls) -> None: def test_async_update(self): """Test that async style updates converge to the same result as sync.""" - fws = ["torch", "tf2"] + fws = ["torch"] # async_update only needs to be tested for the most complex case. # so we'll only test it for multi-gpu-ddp. scaling_modes = ["multi-gpu-ddp", "remote-gpu"] diff --git a/rllib/tuned_examples/dqn/cartpole_dqn.py b/rllib/tuned_examples/dqn/cartpole_dqn.py index 6306e4bbab19..1955ea7764d0 100644 --- a/rllib/tuned_examples/dqn/cartpole_dqn.py +++ b/rllib/tuned_examples/dqn/cartpole_dqn.py @@ -1,13 +1,10 @@ from ray.rllib.algorithms.dqn import DQNConfig from ray.rllib.utils.test_utils import add_rllib_example_script_args -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) -parser = add_rllib_example_script_args() +parser = add_rllib_example_script_args( + default_reward=450.0, + default_timesteps=200000, +) parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. @@ -15,20 +12,13 @@ config = ( DQNConfig() - .environment(env="CartPole-v1") - .rl_module( - # Settings identical to old stack. - model_config_dict={ - "fcnet_hiddens": [256], - "fcnet_activation": "tanh", - "epsilon": [(0, 1.0), (10000, 0.02)], - "fcnet_bias_initializer": "zeros_", - "post_fcnet_bias_initializer": "zeros_", - "post_fcnet_hiddens": [256], - }, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .environment(env="CartPole-v1") .training( - # Settings identical to old stack. + lr=0.0005 * (args.num_gpus or 1) ** 0.5, train_batch_size_per_learner=32, replay_buffer_config={ "type": "PrioritizedEpisodeReplayBuffer", @@ -36,35 +26,27 @@ "alpha": 0.6, "beta": 0.4, }, - n_step=3, + n_step=(2, 5), double_q=True, num_atoms=1, noisy=False, dueling=True, ) - .evaluation( - evaluation_interval=1, - evaluation_parallel_to_training=True, - evaluation_num_env_runners=1, - evaluation_duration="auto", - evaluation_config={ - "explore": False, - # TODO (sven): Add support for window=float(inf) and reduce=mean for - # evaluation episode_return_mean reductions (identical to old stack - # behavior, which does NOT use a window (100 by default) to reduce - # eval episode returns. - "metrics_num_episodes_for_smoothing": 4, + .rl_module( + # Settings identical to old stack. 
+ model_config_dict={ + "fcnet_hiddens": [256], + "fcnet_activation": "tanh", + "epsilon": [(0, 1.0), (10000, 0.02)], + "fcnet_bias_initializer": "zeros_", + "post_fcnet_bias_initializer": "zeros_", + "post_fcnet_hiddens": [256], }, ) ) -stop = { - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 500.0, - NUM_ENV_STEPS_SAMPLED_LIFETIME: 100000, -} - if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py b/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py index 5a6f763b94f2..94aac4c2c8f0 100644 --- a/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py +++ b/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py @@ -9,12 +9,17 @@ from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args() -parser.set_defaults(num_agents=2) +parser = add_rllib_example_script_args( + default_timesteps=500000, +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values to set up `config` below. args = parser.parse_args() -parser.set_defaults(num_agents=2) + register_env( "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": args.num_agents}), @@ -22,9 +27,13 @@ config = ( DQNConfig() + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment(env="multi_agent_cartpole") .training( - # Settings identical to old stack. + lr=0.0005 * (args.num_gpus or 1) ** 0.5, train_batch_size_per_learner=32, replay_buffer_config={ "type": "MultiAgentPrioritizedEpisodeReplayBuffer", @@ -32,7 +41,7 @@ "alpha": 0.6, "beta": 0.4, }, - n_step=3, + n_step=(2, 5), double_q=True, num_atoms=1, noisy=False, @@ -57,19 +66,17 @@ ) stop = { - NUM_ENV_STEPS_SAMPLED_LIFETIME: 500000, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, # `episode_return_mean` is the sum of all agents/policies' returns. f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 250.0 * args.num_agents, } if __name__ == "__main__": + + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + assert ( args.num_agents > 0 ), "The `--num-agents` arg must be > 0 for this script to work." - assert ( - args.enable_new_api_stack - ), "The `--enable-new-api-stack` arg must be activated for this script to work." - - from ray.rllib.utils.test_utils import run_rllib_example_script_experiment run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml b/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml deleted file mode 100644 index e31b4aaa6669..000000000000 --- a/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# @OldAPIStack -cartpole-sac: - env: CartPoleContinuousBulletEnv-v0 - run: SAC - stop: - env_runners/episode_return_mean: 40 - timesteps_total: 100000 - config: - # Works for both torch and tf. 
- framework: torch - gamma: 0.95 - n_step: 3 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - num_steps_sampled_before_learning_starts: 256 - initial_alpha: 0.2 - clip_actions: false - min_sample_timesteps_per_iteration: 1000 - optimization: - actor_learning_rate: 0.005 - critic_learning_rate: 0.005 - entropy_learning_rate: 0.0001 diff --git a/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml b/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml deleted file mode 100644 index f5307df86b15..000000000000 --- a/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# @OldAPIStack -halfcheetah-pybullet-sac: - env: HalfCheetahBulletEnv-v0 - run: SAC - stop: - env_runners/episode_return_mean: 800.0 - config: - # Works for both torch and tf. - framework: torch - q_model_config: - fcnet_activation: relu - fcnet_hiddens: [256, 256] - policy_model_config: - fcnet_activation: relu - fcnet_hiddens: [256, 256] - tau: 0.005 - target_entropy: auto - n_step: 3 - rollout_fragment_length: 1 - train_batch_size: 256 - target_network_update_freq: 1 - min_sample_timesteps_per_iteration: 1000 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - num_steps_sampled_before_learning_starts: 10000 - optimization: - actor_learning_rate: 0.0003 - critic_learning_rate: 0.0003 - entropy_learning_rate: 0.0003 - num_env_runners: 0 - num_gpus: 1 - metrics_num_episodes_for_smoothing: 5 - diff --git a/rllib/tuned_examples/sac/halfcheetah-sac.yaml b/rllib/tuned_examples/sac/halfcheetah-sac.yaml deleted file mode 100644 index c1804440a0d3..000000000000 --- a/rllib/tuned_examples/sac/halfcheetah-sac.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# @OldAPIStack -# Our implementation of SAC can reach 9k reward in 400k timesteps -halfcheetah_sac: - env: HalfCheetah-v3 - run: SAC - stop: - env_runners/episode_return_mean: 9000 - config: - # Works for both torch and tf. - framework: torch - q_model_config: - fcnet_activation: relu - fcnet_hiddens: [256, 256] - policy_model_config: - fcnet_activation: relu - fcnet_hiddens: [256, 256] - tau: 0.005 - target_entropy: auto - n_step: 1 - rollout_fragment_length: 1 - train_batch_size: 256 - target_network_update_freq: 1 - min_sample_timesteps_per_iteration: 1000 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - num_steps_sampled_before_learning_starts: 10000 - optimization: - actor_learning_rate: 0.0003 - critic_learning_rate: 0.0003 - entropy_learning_rate: 0.0003 - num_env_runners: 0 - num_gpus: 0 - clip_actions: false - normalize_actions: true - evaluation_interval: 1 - metrics_num_episodes_for_smoothing: 5 - diff --git a/rllib/tuned_examples/sac/halfcheetah_sac.py b/rllib/tuned_examples/sac/halfcheetah_sac.py new file mode 100644 index 000000000000..5a3bb9b3a7a4 --- /dev/null +++ b/rllib/tuned_examples/sac/halfcheetah_sac.py @@ -0,0 +1,62 @@ +from torch import nn + +from ray.rllib.algorithms.sac.sac import SACConfig +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +parser = add_rllib_example_script_args( + default_timesteps=1000000, + default_reward=12000.0, + default_iters=2000, +) +parser.set_defaults(enable_new_api_stack=True) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. 
+args = parser.parse_args() + +config = ( + SACConfig() + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment("HalfCheetah-v4") + .training( + initial_alpha=1.001, + # lr=0.0006 is very high, w/ 4 GPUs -> 0.0012 + # Might want to lower it for better stability, but it does learn well. + lr=0.0004 * (args.num_gpus or 1) ** 0.5, + target_entropy="auto", + n_step=(1, 5), # 1? + tau=0.005, + train_batch_size_per_learner=256, + target_network_update_freq=1, + replay_buffer_config={ + "type": "PrioritizedEpisodeReplayBuffer", + "capacity": 100000, + "alpha": 0.6, + "beta": 0.4, + }, + num_steps_sampled_before_learning_starts=10000, + ) + .rl_module( + model_config_dict={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "relu", + "fcnet_weights_initializer": nn.init.xavier_uniform_, + "post_fcnet_hiddens": [], + "post_fcnet_activation": None, + "post_fcnet_weights_initializer": "orthogonal_", + "post_fcnet_weights_initializer_config": {"gain": 0.01}, + } + ) + .reporting( + metrics_num_episodes_for_smoothing=5, + min_sample_timesteps_per_iteration=1000, + ) +) + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args) diff --git a/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py index 59c1fa5bbf36..de3669c9bb29 100644 --- a/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py +++ b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py @@ -1,6 +1,7 @@ +from torch import nn + from ray.rllib.algorithms.sac import SACConfig from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum -from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, @@ -9,10 +10,14 @@ from ray.rllib.utils.test_utils import add_rllib_example_script_args from ray.tune.registry import register_env -torch, nn = try_import_torch() -parser = add_rllib_example_script_args() -parser.set_defaults(num_agents=2) +parser = add_rllib_example_script_args( + default_timesteps=500000, +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values to set up `config` below. 
args = parser.parse_args() @@ -24,12 +29,16 @@ config = ( SACConfig() - .environment(env="multi_agent_pendulum") + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment("multi_agent_pendulum") .training( initial_alpha=1.001, - lr=8e-4, + lr=0.001 * ((args.num_gpus or 1) ** 0.5), target_entropy="auto", - n_step=1, + n_step=(2, 5), tau=0.005, train_batch_size_per_learner=256, target_network_update_freq=1, @@ -44,17 +53,16 @@ .rl_module( model_config_dict={ "fcnet_hiddens": [256, 256], - "fcnet_activation": "tanh", + "fcnet_activation": "relu", "fcnet_weights_initializer": nn.init.xavier_uniform_, - # "post_fcnet_hiddens": [], - # "post_fcnet_activation": None, - # "post_fcnet_weights_initializer": nn.init.orthogonal_, - # "post_fcnet_weights_initializer_config": {"gain": 0.01}, + "post_fcnet_hiddens": [], + "post_fcnet_activation": None, + "post_fcnet_weights_initializer": nn.init.orthogonal_, + "post_fcnet_weights_initializer_config": {"gain": 0.01}, } ) .reporting( metrics_num_episodes_for_smoothing=5, - min_sample_timesteps_per_iteration=1000, ) ) @@ -65,7 +73,7 @@ ) stop = { - NUM_ENV_STEPS_SAMPLED_LIFETIME: 500000, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, # `episode_return_mean` is the sum of all agents/policies' returns. f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -400.0 * args.num_agents, } @@ -74,10 +82,6 @@ assert ( args.num_agents > 0 ), "The `--num-agents` arg must be > 0 for this script to work." - assert ( - args.enable_new_api_stack - ), "The `--enable-new-api-stack` arg must be activated for this script to work." - from ray.rllib.utils.test_utils import run_rllib_example_script_experiment run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/sac/pendulum_sac.py b/rllib/tuned_examples/sac/pendulum_sac.py index 5be9332f995e..69746c4478e9 100644 --- a/rllib/tuned_examples/sac/pendulum_sac.py +++ b/rllib/tuned_examples/sac/pendulum_sac.py @@ -1,3 +1,5 @@ +from torch import nn + from ray.rllib.algorithms.sac.sac import SACConfig from ray.rllib.utils.test_utils import add_rllib_example_script_args @@ -5,6 +7,7 @@ default_timesteps=20000, default_reward=-250.0, ) +parser.set_defaults(enable_new_api_stack=True) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values to set up `config` below. 
args = parser.parse_args() @@ -15,12 +18,12 @@ enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True, ) - .environment(env="Pendulum-v1") + .environment("Pendulum-v1") .training( initial_alpha=1.001, - lr=3e-4, + lr=0.001 * (args.num_gpus or 1) ** 0.5, target_entropy="auto", - n_step=1, + n_step=(2, 5), tau=0.005, train_batch_size_per_learner=256, target_network_update_freq=1, @@ -30,12 +33,13 @@ "alpha": 1.0, "beta": 0.0, }, - num_steps_sampled_before_learning_starts=256, + num_steps_sampled_before_learning_starts=256 * (args.num_gpus or 1), ) .rl_module( model_config_dict={ "fcnet_hiddens": [256, 256], "fcnet_activation": "relu", + "fcnet_weights_initializer": nn.init.xavier_uniform_, "post_fcnet_hiddens": [], "post_fcnet_activation": None, "post_fcnet_weights_initializer": "orthogonal_", diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 33d9c592af42..2cbac3f12a5e 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -95,10 +95,12 @@ SAMPLE_TIMER = "sample" # @OldAPIStack ENV_RUNNER_SAMPLING_TIMER = "env_runner_sampling_timer" OFFLINE_SAMPLING_TIMER = "offline_sampling_timer" +REPLAY_BUFFER_ADD_DATA_TIMER = "replay_buffer_add_data_timer" REPLAY_BUFFER_SAMPLE_TIMER = "replay_buffer_sampling_timer" REPLAY_BUFFER_UPDATE_PRIOS_TIMER = "replay_buffer_update_prios_timer" LEARNER_UPDATE_TIMER = "learner_update_timer" LEARN_ON_BATCH_TIMER = "learn" # @OldAPIStack +LEARNER_CONNECTOR_TIMER = "learner_connector_timer" LOAD_BATCH_TIMER = "load" TARGET_NET_UPDATE_TIMER = "target_net_update" diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index 8c576234e436..c3a75589bc19 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -122,7 +122,6 @@ def get_len(b): ) else: - # n_steps = self._minibatch_size def get_len(b): return len(b)
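
As a closing note, a small sketch of the batch-size bookkeeping this PR switches to. The first helper is an assumption about the intended semantics of `total_train_batch_size` (per-Learner batch size times number of Learners), not RLlib's actual property implementation; the second merely restates the `lr=... * (args.num_gpus or 1) ** 0.5` scaling used in the tuned examples above.

def total_train_batch_size(train_batch_size_per_learner: int, num_learners: int) -> int:
    # Assumed relation: each Learner consumes its own per-Learner batch, so the
    # data trained on per iteration grows linearly with the number of Learners.
    return train_batch_size_per_learner * max(num_learners, 1)


def scaled_lr(base_lr: float, num_gpus: int) -> float:
    # Mirrors the tuned examples: lr = base_lr * (num_gpus or 1) ** 0.5.
    return base_lr * (num_gpus or 1) ** 0.5


# E.g., SAC's default of 256 per Learner on a 4-GPU (4-Learner) setup:
assert total_train_batch_size(256, 4) == 1024
assert abs(scaled_lr(0.0005, 4) - 0.001) < 1e-12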