From 28679cace13f554ad72e2b9f0e1a2ec5f52bd61c Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 10 Jan 2023 10:35:32 -0800 Subject: [PATCH 001/112] added quick cleanups to trainer_runner. Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/torch/__init__.py | 0 .../core/rl_trainer/torch/torch_rl_trainer.py | 10 ++++ rllib/core/rl_trainer/trainer_runner.py | 49 ++++++++++--------- 3 files changed, 35 insertions(+), 24 deletions(-) create mode 100644 rllib/core/rl_trainer/torch/__init__.py create mode 100644 rllib/core/rl_trainer/torch/torch_rl_trainer.py diff --git a/rllib/core/rl_trainer/torch/__init__.py b/rllib/core/rl_trainer/torch/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py new file mode 100644 index 000000000000..ea7c1e41b890 --- /dev/null +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -0,0 +1,10 @@ +import logging +from rllib.core.rl_trainer.rl_trainer import RLTrainer + +logger = logging.getLogger(__name__) + +# TODO: Implement this + + +class TorchRLTrainer(RLTrainer): + pass diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 9f1c44ee6e13..3a0717025e4a 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -9,7 +9,11 @@ ParamOptimizerPairs, Optimizer, ) +from ray.rllib.core.rl_trainer.tf.tf_rl_trainer import TfRLTrainer +from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer from ray.rllib.policy.sample_batch import MultiAgentBatch + + from ray.air.config import ScalingConfig from ray.train._internal.backend_executor import BackendExecutor @@ -17,7 +21,9 @@ class TrainerRunner: """Coordinator of RLTrainers. Public API: - .update() + .update(batch) -> updates the RLModule based on gradient descent algos. + .additional_update() -> any additional non-gradient based updates will get + called from this entry point. .get_state() -> returns the state of the RLModule and RLOptimizer from all of the RLTrainers .set_state() -> sets the state of all the RLTrainers @@ -37,9 +43,10 @@ def __init__( trainer_class: Type[RLTrainer], trainer_config: Mapping[str, Any], compute_config: Mapping[str, Any], - framework: str = "tf", ): - """ """ + # TODO: trainer_config and compute_config should become dataclasses. + # It's hard for the user to know what the trainer / compute parameters are + # expected. 
self._trainer_config = trainer_config self._compute_config = compute_config @@ -51,11 +58,11 @@ def __init__( use_gpu=resources["use_gpu"], ) # the only part of this class that is framework agnostic: - if framework == "torch": + if issubclass(trainer_class, TorchRLTrainer): from ray.train.torch import TorchConfig backend_config = TorchConfig() - elif framework == "tf": + elif issubclass(trainer_class, TfRLTrainer): from ray.train.tensorflow import TensorflowConfig backend_config = TensorflowConfig() @@ -101,28 +108,22 @@ def _compute_necessary_resources(self): return {"num_workers": num_workers, "use_gpu": bool(num_gpus)} def update(self, batch: MultiAgentBatch = None, **kwargs): - """TODO: account for **kwargs - Example in DQN: + """ + Example: >>> trainer_runner.update(batch) # updates the gradient - >>> trainer_runner.update(update_target=True) # should soft-update - the target network """ refs = [] - if batch is None: - for worker in self.workers: - refs.append(worker.update.remote(**kwargs)) - else: - global_size = len(self.workers) - batch_size = math.ceil(len(batch) / global_size) - for i, worker in enumerate(self.workers): - batch_to_send = {} - for pid, sub_batch in batch.policy_batches.items(): - batch_size = math.ceil(len(sub_batch) / global_size) - start = batch_size * i - end = min(start + batch_size, len(sub_batch)) - batch_to_send[pid] = sub_batch[int(start) : int(end)] - new_batch = MultiAgentBatch(batch_to_send, int(batch_size)) - refs.append(worker.update.remote(new_batch)) + global_size = len(self.workers) + batch_size = math.ceil(len(batch) / global_size) + for i, worker in enumerate(self.workers): + batch_to_send = {} + for pid, sub_batch in batch.policy_batches.items(): + batch_size = math.ceil(len(sub_batch) / global_size) + start = batch_size * i + end = min(start + batch_size, len(sub_batch)) + batch_to_send[pid] = sub_batch[int(start) : int(end)] + new_batch = MultiAgentBatch(batch_to_send, int(batch_size)) + refs.append(worker.update.remote(new_batch)) return ray.get(refs) From a2f9439a4ffa26449767555e1392c08bc59eba2a Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 10 Jan 2023 10:43:10 -0800 Subject: [PATCH 002/112] created test_trainer_runner Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 9 +- .../rl_trainer/tests/test_trainer_runner.py | 153 ++++++++++++++++++ 2 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 rllib/core/rl_trainer/tests/test_trainer_runner.py diff --git a/rllib/BUILD b/rllib/BUILD index b5c02be6a281..d418c099ed70 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1818,7 +1818,7 @@ py_test( srcs = ["core/rl_module/tests/test_marl_module.py"] ) - +# to be removed py_test( name = "test_tf_rl_trainer", tags = ["team:rllib", "multi_gpu", "exclusive"], @@ -1826,6 +1826,13 @@ py_test( srcs = ["core/rl_trainer/tests/tf/test_tf_rl_trainer.py"] ) +py_test( + name = "test_trainer_runner", + tags = ["team:rllib", "multi_gpu", "exclusive"], + size = "medium", + srcs = ["core/rl_trainer/tests/test_trainer_runner.py"] +) + # -------------------------------------------------------------------- # Models and Distributions # rllib/models/ diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py new file mode 100644 index 000000000000..e64c143d7b32 --- /dev/null +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -0,0 +1,153 @@ +import gymnasium as gym +import unittest + +import tensorflow as tf +import ray + +from ray.rllib.core.rl_trainer.trainer_runner 
import TrainerRunner +from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule +from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch +from ray.rllib.utils.test_utils import get_cartpole_dataset_reader + + +class TestTrainerRunner(unittest.TestCase): + """This test is setup for 2 gpus.""" + + # TODO: Make a unittest that does not need 2 gpus to run. + # So that the user can run it locally as well. + @classmethod + def setUp(cls) -> None: + ray.init() + + @classmethod + def tearDown(cls) -> None: + ray.shutdown() + + def test_update_multigpu(self): + """Test training in a 2 gpu setup and that weights are synchronized.""" + env = gym.make("CartPole-v1") + trainer_class = BCTfRLTrainer + trainer_cfg = dict( + module_class=DiscreteBCTFModule, + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_config={"lr": 1e-3}, + in_test=True, + ) + runner = TrainerRunner( + trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) + ) + + reader = get_cartpole_dataset_reader(batch_size=500) + + min_loss = float("inf") + for iter_i in range(1000): + batch = reader.next() + results_worker_0, results_worker_1 = runner.update(batch.as_multi_agent()) + + loss = ( + results_worker_0["loss"]["total_loss"] + + results_worker_1["loss"]["total_loss"] + ) / 2 + min_loss = min(loss, min_loss) + print(f"[iter = {iter_i}] Loss: {loss:.3f}, Min Loss: {min_loss:.3f}") + # The loss is initially around 0.69 (ln2). When it gets to around + # 0.57 the return of the policy gets to around 100. + if min_loss < 0.57: + break + self.assertEqual( + results_worker_0["mean_weight"]["default_policy"], + results_worker_1["mean_weight"]["default_policy"], + ) + self.assertLess(min_loss, 0.57) + + def test_add_remove_module(self): + env = gym.make("CartPole-v1") + trainer_class = BCTfRLTrainer + trainer_cfg = dict( + module_class=DiscreteBCTFModule, + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_config={"lr": 1e-3}, + in_test=True, + ) + runner = TrainerRunner( + trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) + ) + + reader = get_cartpole_dataset_reader(batch_size=500) + batch = reader.next() + + # update once with the default policy + results = runner.update(batch.as_multi_agent()) + module_ids_before_add = {DEFAULT_POLICY_ID} + new_module_id = "test_module" + + # add a test_module + runner.add_module( + module_id=new_module_id, + module_cls=DiscreteBCTFModule, + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_cls=tf.keras.optimizers.Adam, + ) + + # do training that includes the test_module + results = runner.update( + MultiAgentBatch( + {new_module_id: batch, DEFAULT_POLICY_ID: batch}, batch.count + ) + ) + + # check that module weights are updated across workers and synchronized + for i in range(1, len(results)): + for module_id in results[i]["mean_weight"].keys(): + assert ( + results[i]["mean_weight"][module_id] + == results[i - 1]["mean_weight"][module_id] + ) + + # check that module ids are updated to include the new module + module_ids_after_add = {DEFAULT_POLICY_ID, new_module_id} + for result in results: + # remove the total_loss key since its not a module key + self.assertEqual(set(result["loss"]) - 
{"total_loss"}, module_ids_after_add) + + # remove the test_module + runner.remove_module(module_id=new_module_id) + + # run training without the test_module + results = runner.update(batch.as_multi_agent()) + + # check that module weights are updated across workers and synchronized + for i in range(1, len(results)): + for module_id in results[i]["mean_weight"].keys(): + assert ( + results[i]["mean_weight"][module_id] + == results[i - 1]["mean_weight"][module_id] + ) + + # check that module ids are updated after remove operation to not + # include the new module + for result in results: + # remove the total_loss key since its not a module key + self.assertEqual( + set(result["loss"]) - {"total_loss"}, module_ids_before_add + ) + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", __file__])) From d8b36c10103b790dd4dfa6ee86db5a7d9b2d8b46 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 10 Jan 2023 11:05:19 -0800 Subject: [PATCH 003/112] added TODO tag Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/BUILD b/rllib/BUILD index d418c099ed70..751fa6901cca 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1818,7 +1818,7 @@ py_test( srcs = ["core/rl_module/tests/test_marl_module.py"] ) -# to be removed +# TODO: to be removed py_test( name = "test_tf_rl_trainer", tags = ["team:rllib", "multi_gpu", "exclusive"], From 2b6757749890c4944cfa6db674abd98a7cd43190 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 11 Jan 2023 14:01:27 -0800 Subject: [PATCH 004/112] fixed imports Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/torch/torch_rl_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index ea7c1e41b890..6b27ed14ab37 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -1,5 +1,5 @@ import logging -from rllib.core.rl_trainer.rl_trainer import RLTrainer +from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer logger = logging.getLogger(__name__) From fe60e207f7ec7949082d75596f72b56951924523 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 11 Jan 2023 14:54:21 -0800 Subject: [PATCH 005/112] typo in BUILD Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/rllib/BUILD b/rllib/BUILD index 77e0204c3fbf..ca91ad831a34 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1831,6 +1831,7 @@ py_test( tags = ["team:rllib", "multi_gpu", "exclusive"], size = "medium", srcs = ["core/rl_trainer/tests/test_trainer_runner.py"] +) py_test( name = "test_rl_trainer", From 916d67432788e9ff4261f793c2875fe28f785cd4 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 11 Jan 2023 21:29:24 -0800 Subject: [PATCH 006/112] started to create torch_rl_trainer Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 15 +- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 5 +- .../core/rl_trainer/torch/torch_rl_trainer.py | 130 +++++++++++++++++- 3 files changed, 140 insertions(+), 10 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 1e4dedffa99d..ce6b73f78349 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -234,13 +234,16 @@ def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: A dictionary of results. 
""" if not self.distributed: - fwd_out = self._module.forward_train(batch) - loss = self.compute_loss(fwd_out=fwd_out, batch=batch) - gradients = self.compute_gradients(loss) - post_processed_gradients = self.on_after_compute_gradients(gradients) - self.apply_gradients(post_processed_gradients) + return self._update(batch) else: - self.do_distributed_update(batch) + return self.do_distributed_update(batch) + + def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: + fwd_out = self._module.forward_train(batch) + loss = self.compute_loss(fwd_out=fwd_out, batch=batch) + gradients = self.compute_gradients(loss) + post_processed_gradients = self.on_after_compute_gradients(gradients) + self.apply_gradients(post_processed_gradients) return self.compile_results(batch, fwd_out, loss, post_processed_gradients) def additional_update(self, *args, **kwargs) -> Mapping[str, Any]: diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index e79686753c6f..e39362d2f94f 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -14,6 +14,7 @@ from ray.rllib.core.rl_trainer.rl_trainer import ( RLTrainer, + MultiAgentRLModule, ParamOptimizerPairs, ParamRef, Optimizer, @@ -79,7 +80,7 @@ def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: def compute_gradients( self, loss: Union[TensorType, Mapping[str, Any]], tape: tf.GradientTape ) -> ParamDictType: - grads = tape.gradient(loss["total_loss"], self._params) + grads = tape.gradient(loss[self.TOTAL_LOSS_KEY], self._params) return grads @override(RLTrainer) @@ -90,7 +91,7 @@ def apply_gradients(self, gradients: Dict[ParamRef, TensorType]) -> None: optim.apply_gradients(zip(gradient_list, variable_list)) @override(RLTrainer) - def _make_distributed(self) -> RLModule: + def _make_distributed(self) -> MultiAgentRLModule: # TODO: Does strategy has to be an attribute here? if so it's very hidden to # the user of this class that there is such an attribute. 
diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 6b27ed14ab37..a9acc5af9768 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -1,10 +1,136 @@ import logging -from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer +import numpy as np +from typing import ( + Any, + Mapping, + Union, + Type, + Optional, + Callable, + Dict, + Sequence, + Hashable, +) + +from ray.rllib.utils.annotations import override +from ray.rllib.core.rl_module import RLModule +from ray.rllib.core.rl_trainer.rl_trainer import ( + RLTrainer, + MultiAgentRLModule, + ParamOptimizerPairs, + ParamRef, + Optimizer, + ParamType, + ParamDictType, +) +from ray.rllib.policy.sample_batch import MultiAgentBatch +from ray.rllib.utils.typing import TensorType + logger = logging.getLogger(__name__) # TODO: Implement this +import torch +import torch.nn as nn +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist +import os +import ray class TorchRLTrainer(RLTrainer): - pass + + def __init__( + self, + module_class: Union[Type[RLModule], Type[MultiAgentRLModule]], + module_kwargs: Mapping[str, Any], + scaling_config: Mapping[str, Any], + optimizer_config: Mapping[str, Any], + distributed: bool = False, + in_test: bool = False + ): + super().__init__( + module_class=module_class, + module_kwargs=module_kwargs, + scaling_config=scaling_config, + optimizer_config=optimizer_config, + distributed=distributed, + in_test=in_test + ) + + gpu_ids = ray.get_gpu_ids() + self._world_size = scaling_config.get("num_workers", 1) + self._gpu_id = gpu_ids[0] if gpu_ids else None + + @override(RLTrainer) + def module(self) -> MultiAgentRLModule: + if self.distributed: + return self._module.module + return self._module + + @override(RLTrainer) + def configure_optimizers(self) -> ParamOptimizerPairs: + lr = self.optimizer_config.get("lr", 1e-3) + return [ + ( + self._module[key].parmaeters(), + torch.optim.Adam(self._module[key].parmaeters(), lr=lr), + ) + for key in self._module.keys() + ] + + @override(RLTrainer) + def compute_gradients(self, loss: Union[TensorType, Mapping[str, Any]]) -> ParamDictType: + for optim in self._optim_to_param: + optim.zero_grad(set_to_none=True) + loss[self.TOTAL_LOSS_KEY].backward() + grads = {pid: p.grad for pid, p in self._params} + return grads + + @override(RLTrainer) + def apply_gradients(self, gradients: ParamDictType) -> None: + + # make sure the parameters do not carry gradients on their own + for optim in self._optim_to_param: + optim.zero_grad(set_to_none=True) + + # set the gradient of the parameters + for pid, grad in gradients.items(): + self._params[pid].grad = grad + + # for each optimizer call its step function with the gradients + for optim in self._optim_to_param: + optim.step() + + @override(RLTrainer) + def _make_distributed(self) -> MultiAgentRLModule: + module = self._make_module() + pg = torch.distributed.new_group(list(range(self._world_size))) + if self._gpu_id is not None: + module.to(self._gpu_id) + module = DDP(module, device_ids=[self._gpu_id], process_group=pg) + else: + module = DDP(module, process_group=pg) + return module + + @override(RLTrainer) + def do_distributed_update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: + # in torch the distributed update is no different than the normal update + return self._update(batch) + + @override(RLTrainer) + def get_param_ref(self, param: ParamType) -> Hashable: + return 
param + + @override(RLTrainer) + def get_parameters(self, module: RLModule) -> Sequence[ParamType]: + return list(module.parameters()) + + @override(RLTrainer) + def get_optimizer_obj( + self, module: RLModule, optimizer_cls: Type[Optimizer] + ) -> Optimizer: + # TODO: the abstraction should take in optimizer_config as a parameter as well. + lr = self.optimizer_config.get("lr", 1e-3) + return optimizer_cls(module.parameters, lr=lr) + From 71026e514300d826f4cf18ed86bded6baf859368 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 11 Jan 2023 21:33:41 -0800 Subject: [PATCH 007/112] added bc_rl_trainer Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/testing/torch/bc_rl_trainer.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 rllib/core/testing/torch/bc_rl_trainer.py diff --git a/rllib/core/testing/torch/bc_rl_trainer.py b/rllib/core/testing/torch/bc_rl_trainer.py new file mode 100644 index 000000000000..82db03d0aeae --- /dev/null +++ b/rllib/core/testing/torch/bc_rl_trainer.py @@ -0,0 +1,29 @@ +import torch +from typing import Any, Mapping + +from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import MultiAgentBatch + + +class BCTorchRLTrainer(TorchRLTrainer): + def compute_loss( + self, fwd_out: MultiAgentBatch, batch: MultiAgentBatch + ) -> Mapping[str, Any]: + + loss_dict = {} + loss_total = None + for module_id in fwd_out: + action_dist = fwd_out[module_id]["action_dist"] + loss = -torch.mean( + action_dist.log_prob(batch[module_id][SampleBatch.ACTIONS]) + ) + loss_dict[module_id] = loss + if loss_total is None: + loss_total = loss + else: + loss_total += loss + + loss_dict[self.TOTAL_LOSS_KEY] = loss_total + + return loss_dict From ae610142ee1b498517d50f5f988a51ada1c62ab3 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 11 Jan 2023 22:12:12 -0800 Subject: [PATCH 008/112] torch trainer test works now Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 18 +- .../rl_trainer/tests/test_torch_rl_trainer.py | 163 ++++++++++++++++++ .../core/rl_trainer/torch/torch_rl_trainer.py | 7 +- rllib/utils/test_utils.py | 2 +- 4 files changed, 180 insertions(+), 10 deletions(-) create mode 100644 rllib/core/rl_trainer/tests/test_torch_rl_trainer.py diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index ce6b73f78349..3ec5c4039394 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -216,13 +216,9 @@ def compile_results( Returns: A dictionary of results. """ + # TODO: figure out a universal compilation of results in the baseclass loss_numpy = convert_to_numpy(postprocessed_loss) - rewards = batch["rewards"] - rewards = convert_to_numpy(rewards) - return { - "avg_reward": rewards.mean(), - **loss_numpy, - } + return {"loss": loss_numpy} def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: """Perform an update on this Trainer. @@ -239,6 +235,9 @@ def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: return self.do_distributed_update(batch) def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: + # TODO: remove the MultiAgentBatch from the type, it should be NestedDict from + # the base class. 
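+        # Shared (framework-agnostic) pipeline: convert the batch to framework
+        # tensors, run forward_train(), compute the loss, derive and
+        # post-process gradients, apply them, then compile numpy results.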
+ batch = self._convert_batch_type(batch) fwd_out = self._module.forward_train(batch) loss = self.compute_loss(fwd_out=fwd_out, batch=batch) gradients = self.compute_gradients(loss) @@ -246,6 +245,12 @@ def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: self.apply_gradients(post_processed_gradients) return self.compile_results(batch, fwd_out, loss, post_processed_gradients) + def _convert_batch_type(self, batch): + # TODO: remove this method, it should be handled by the base class. + batch = NestedDict(batch.policy_batches) + batch = NestedDict({k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}) + return batch + def additional_update(self, *args, **kwargs) -> Mapping[str, Any]: """Apply additional non-gradient based updates to this Trainer. @@ -471,6 +476,7 @@ def get_parameters(self, module: RLModule) -> Sequence[ParamType]: Returns: The parameters of the module. """ + # TODO: Make this method a classmethod @abc.abstractmethod def get_optimizer_obj( diff --git a/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py new file mode 100644 index 000000000000..23ea34dac48e --- /dev/null +++ b/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py @@ -0,0 +1,163 @@ +import gymnasium as gym +import unittest +import torch +import numpy as np + +import ray + +from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer +from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule +from ray.rllib.core.testing.torch.bc_rl_trainer import BCTorchRLTrainer +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader +from ray.rllib.utils.numpy import convert_to_numpy + +def get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: + env = gym.make("CartPole-v1") + scaling_config = {} or scaling_config + distributed = False + + # TODO: Another way to make RLTrainer would be to construct the module first + # and then apply trainer to it. We should also allow that. In fact if we figure + # out the serialization of RLModules we can simply pass the module the trainer + # and internally it will serialize and deserialize the module for distributed + # construction. + trainer = BCTorchRLTrainer( + module_class=DiscreteBCTorchModule, + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + scaling_config=scaling_config, + optimizer_config={"lr": 1e-3}, + distributed=distributed, + in_test=True, + ) + + trainer.build() + + return trainer + + +class TestRLTrainer(unittest.TestCase): + @classmethod + def setUp(cls) -> None: + ray.init() + + @classmethod + def tearDown(cls) -> None: + ray.shutdown() + + def test_end_to_end_update(self): + + trainer = get_trainer(scaling_config={"num_workers": 2}) + reader = get_cartpole_dataset_reader(batch_size=512) + + min_loss = float("inf") + for iter_i in range(1000): + batch = reader.next() + results = trainer.update(batch.as_multi_agent()) + + loss = results["loss"]["total_loss"] + min_loss = min(loss, min_loss) + print(f"[iter = {iter_i}] Loss: {loss:.3f}, Min Loss: {min_loss:.3f}") + # The loss is initially around 0.69 (ln2). When it gets to around + # 0.57 the return of the policy gets to around 100. + if min_loss < 0.57: + break + self.assertLess(min_loss, 0.57) + + def test_compute_gradients(self): + """Tests the compute_gradients correctness. 
+ + Tests that if we sum all the trainable variables the gradient of output w.r.t. + the weights is all ones. + """ + trainer = get_trainer(scaling_config={"num_workers": 2}) + + params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) + loss = {"total_loss": sum([param.sum() for param in params])} + gradients = trainer.compute_gradients(loss) + + # type should be a mapping from ParamRefs to gradients + self.assertIsInstance(gradients, dict) + + for grad in gradients.values(): + check(grad, np.ones(grad.shape)) + + def test_apply_gradients(self): + """Tests the apply_gradients correctness. + + Tests that if we apply gradients of all ones, the new params are equal to the + standard SGD/Adam update rule. + """ + + trainer = get_trainer(scaling_config={"num_workers": 2}) + + # calculated the expected new params based on gradients of all ones. + params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) + n_steps = 100 + expected = [ + convert_to_numpy(param) - n_steps * trainer.optimizer_config["lr"] * np.ones(param.shape) + for param in params + ] + for _ in range(n_steps): + gradients = {trainer.get_param_ref(p): torch.ones_like(p) for p in params} + trainer.apply_gradients(gradients) + + check(params, expected) + + def test_add_remove_module(self): + """Tests the compute/apply_gradients with add/remove modules. + + Tests that if we add a module with SGD optimizer with a known lr (different + from default), and remove the default module, with a loss that is the sum of + all variables the updated parameters follow the SGD update rule. + """ + env = gym.make("CartPole-v1") + trainer = get_trainer(scaling_config={"num_workers": 2}) + + # add a test module with SGD optimizer with a known lr + lr = 1e-4 + + def set_optimizer_fn(module): + return [ + (module.parameters(), torch.optim.Adam(module.parameters(), lr=lr)) + ] + + trainer.add_module( + module_id="test", + module_cls=DiscreteBCTorchModule, + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + # the hidden size is different than the default module + "model_config": {"hidden_dim": 16}, + }, + set_optimizer_fn=set_optimizer_fn, + ) + + trainer.remove_module(DEFAULT_POLICY_ID) + + # only test module should be left + self.assertEqual(set(trainer.module.keys()), {"test"}) + + # calculated the expected new params based on gradients of all ones. 
+ params = trainer.get_parameters(trainer.module["test"]) + n_steps = 100 + expected = [ + convert_to_numpy(param) - n_steps * lr * np.ones(param.shape) for param in params] + for _ in range(n_steps): + loss = {"total_loss": sum([param.sum() for param in params])} + gradients = trainer.compute_gradients(loss) + trainer.apply_gradients(gradients) + + check(params, expected) + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index a9acc5af9768..870c5200266a 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -62,6 +62,7 @@ def __init__( self._world_size = scaling_config.get("num_workers", 1) self._gpu_id = gpu_ids[0] if gpu_ids else None + @property @override(RLTrainer) def module(self) -> MultiAgentRLModule: if self.distributed: @@ -73,8 +74,8 @@ def configure_optimizers(self) -> ParamOptimizerPairs: lr = self.optimizer_config.get("lr", 1e-3) return [ ( - self._module[key].parmaeters(), - torch.optim.Adam(self._module[key].parmaeters(), lr=lr), + self.get_parameters(self._module[key]), + torch.optim.Adam(self.get_parameters(self._module[key]), lr=lr), ) for key in self._module.keys() ] @@ -84,7 +85,7 @@ def compute_gradients(self, loss: Union[TensorType, Mapping[str, Any]]) -> Param for optim in self._optim_to_param: optim.zero_grad(set_to_none=True) loss[self.TOTAL_LOSS_KEY].backward() - grads = {pid: p.grad for pid, p in self._params} + grads = {pid: p.grad for pid, p in self._params.items()} return grads @override(RLTrainer) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 5f21b836fb85..4a8e300da1b7 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1146,7 +1146,7 @@ def get_cartpole_dataset_reader(batch_size: int = 1) -> "DatasetReader": get_dataset_and_shards, ) - path = "tests/data/cartpole/large.json" + path = "rllib/tests/data/cartpole/large.json" input_config = {"format": "json", "paths": path} dataset, _ = get_dataset_and_shards( AlgorithmConfig().offline_data(input_="dataset", input_config=input_config) From ef1ffb854a4912322014b7a6e0e0a77d4f15da79 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 11 Jan 2023 22:16:15 -0800 Subject: [PATCH 009/112] lint Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 8 ++- .../rl_trainer/tests/test_torch_rl_trainer.py | 12 ++-- .../core/rl_trainer/torch/torch_rl_trainer.py | 61 ++++++++----------- 3 files changed, 38 insertions(+), 43 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 3ec5c4039394..2528de496be5 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -233,9 +233,9 @@ def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: return self._update(batch) else: return self.do_distributed_update(batch) - + def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: - # TODO: remove the MultiAgentBatch from the type, it should be NestedDict from + # TODO: remove the MultiAgentBatch from the type, it should be NestedDict from # the base class. batch = self._convert_batch_type(batch) fwd_out = self._module.forward_train(batch) @@ -248,7 +248,9 @@ def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: def _convert_batch_type(self, batch): # TODO: remove this method, it should be handled by the base class. 
batch = NestedDict(batch.policy_batches) - batch = NestedDict({k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}) + batch = NestedDict( + {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()} + ) return batch def additional_update(self, *args, **kwargs) -> Mapping[str, Any]: diff --git a/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py index 23ea34dac48e..3a6c5d0e3af8 100644 --- a/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py @@ -12,6 +12,7 @@ from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.utils.numpy import convert_to_numpy + def get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: env = gym.make("CartPole-v1") scaling_config = {} or scaling_config @@ -99,7 +100,8 @@ def test_apply_gradients(self): params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) n_steps = 100 expected = [ - convert_to_numpy(param) - n_steps * trainer.optimizer_config["lr"] * np.ones(param.shape) + convert_to_numpy(param) + - n_steps * trainer.optimizer_config["lr"] * np.ones(param.shape) for param in params ] for _ in range(n_steps): @@ -122,9 +124,7 @@ def test_add_remove_module(self): lr = 1e-4 def set_optimizer_fn(module): - return [ - (module.parameters(), torch.optim.Adam(module.parameters(), lr=lr)) - ] + return [(module.parameters(), torch.optim.Adam(module.parameters(), lr=lr))] trainer.add_module( module_id="test", @@ -147,7 +147,9 @@ def set_optimizer_fn(module): params = trainer.get_parameters(trainer.module["test"]) n_steps = 100 expected = [ - convert_to_numpy(param) - n_steps * lr * np.ones(param.shape) for param in params] + convert_to_numpy(param) - n_steps * lr * np.ones(param.shape) + for param in params + ] for _ in range(n_steps): loss = {"total_loss": sum([param.sum() for param in params])} gradients = trainer.compute_gradients(loss) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 870c5200266a..45e394274a7d 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -1,61 +1,51 @@ import logging -import numpy as np from typing import ( Any, Mapping, Union, Type, - Optional, - Callable, - Dict, Sequence, Hashable, ) +import torch +from torch.nn.parallel import DistributedDataParallel as DDP + +import ray -from ray.rllib.utils.annotations import override from ray.rllib.core.rl_module import RLModule from ray.rllib.core.rl_trainer.rl_trainer import ( RLTrainer, MultiAgentRLModule, ParamOptimizerPairs, - ParamRef, Optimizer, ParamType, ParamDictType, ) from ray.rllib.policy.sample_batch import MultiAgentBatch +from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType logger = logging.getLogger(__name__) -# TODO: Implement this -import torch -import torch.nn as nn -from torch.nn.parallel import DistributedDataParallel as DDP -import torch.distributed as dist -import os -import ray - class TorchRLTrainer(RLTrainer): - def __init__( - self, + self, module_class: Union[Type[RLModule], Type[MultiAgentRLModule]], - module_kwargs: Mapping[str, Any], - scaling_config: Mapping[str, Any], - optimizer_config: Mapping[str, Any], - distributed: bool = False, - in_test: bool = False + module_kwargs: Mapping[str, Any], + scaling_config: Mapping[str, Any], + optimizer_config: Mapping[str, Any], + distributed: bool = 
False, + in_test: bool = False, ): super().__init__( - module_class=module_class, - module_kwargs=module_kwargs, - scaling_config=scaling_config, - optimizer_config=optimizer_config, - distributed=distributed, - in_test=in_test + module_class=module_class, + module_kwargs=module_kwargs, + scaling_config=scaling_config, + optimizer_config=optimizer_config, + distributed=distributed, + in_test=in_test, ) gpu_ids = ray.get_gpu_ids() @@ -81,9 +71,11 @@ def configure_optimizers(self) -> ParamOptimizerPairs: ] @override(RLTrainer) - def compute_gradients(self, loss: Union[TensorType, Mapping[str, Any]]) -> ParamDictType: + def compute_gradients( + self, loss: Union[TensorType, Mapping[str, Any]] + ) -> ParamDictType: for optim in self._optim_to_param: - optim.zero_grad(set_to_none=True) + optim.zero_grad(set_to_none=True) loss[self.TOTAL_LOSS_KEY].backward() grads = {pid: p.grad for pid, p in self._params.items()} return grads @@ -93,16 +85,16 @@ def apply_gradients(self, gradients: ParamDictType) -> None: # make sure the parameters do not carry gradients on their own for optim in self._optim_to_param: - optim.zero_grad(set_to_none=True) - + optim.zero_grad(set_to_none=True) + # set the gradient of the parameters for pid, grad in gradients.items(): self._params[pid].grad = grad - + # for each optimizer call its step function with the gradients for optim in self._optim_to_param: optim.step() - + @override(RLTrainer) def _make_distributed(self) -> MultiAgentRLModule: module = self._make_module() @@ -113,7 +105,7 @@ def _make_distributed(self) -> MultiAgentRLModule: else: module = DDP(module, process_group=pg) return module - + @override(RLTrainer) def do_distributed_update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: # in torch the distributed update is no different than the normal update @@ -134,4 +126,3 @@ def get_optimizer_obj( # TODO: the abstraction should take in optimizer_config as a parameter as well. 
lr = self.optimizer_config.get("lr", 1e-3) return optimizer_cls(module.parameters, lr=lr) - From 16f64f99936368bb13e6ee29e1da7edbcedcdbbb Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 13 Jan 2023 13:07:08 -0800 Subject: [PATCH 010/112] updated TODOs and BUILD Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 8 ++++++++ rllib/core/rl_trainer/rl_trainer.py | 19 +++++++++---------- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 5 ++--- .../core/rl_trainer/torch/torch_rl_trainer.py | 3 ++- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index ca91ad831a34..f48e2fe6aca7 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1840,6 +1840,14 @@ py_test( srcs = ["core/rl_trainer/tests/test_rl_trainer.py"] ) +# TODO (Kourosh): to be removed in favor of test_rl_trainer.py +py_test( + name = "test_torch_rl_trainer", + tags = ["team:rllib", "core"], + size = "medium", + srcs = ["core/rl_trainer/tests/torch/test_torch_rl_trainer.py"] +) + # -------------------------------------------------------------------- # Models and Distributions # rllib/models/ diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 2528de496be5..07cb1e8cb546 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -106,7 +106,7 @@ def __init__( distributed: bool = False, in_test: bool = False, ): - # TODO: convert scaling and optimizer configs to dataclasses + # TODO (Kourosh): convert scaling and optimizer configs to dataclasses self.module_class = module_class self.module_kwargs = module_kwargs self.scaling_config = scaling_config @@ -173,10 +173,10 @@ def compute_loss( must contain one protected key "total_loss" which will be used for computing gradients through. """ - # TODO: This method is built for multi-agent. While it is still possible to - # write single-agent losses, it may become confusing to users. We should find a - # way to allow them to specify single-agent losses as well, without having to - # think about one extra layer of hierarchy for module ids. + # TODO (Kourosh): This method is built for multi-agent. While it is still + # possible to write single-agent losses, it may become confusing to users. We + # should find a way to allow them to specify single-agent losses as well, + # without having to think about one extra layer of hierarchy for module ids. def on_after_compute_gradients( self, gradients_dict: Mapping[str, Any] @@ -216,7 +216,7 @@ def compile_results( Returns: A dictionary of results. """ - # TODO: figure out a universal compilation of results in the baseclass + # TODO (Kourosh): figure out a universal compilation of results in the baseclass loss_numpy = convert_to_numpy(postprocessed_loss) return {"loss": loss_numpy} @@ -235,8 +235,8 @@ def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: return self.do_distributed_update(batch) def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: - # TODO: remove the MultiAgentBatch from the type, it should be NestedDict from - # the base class. + # TODO (Kourosh): remove the MultiAgentBatch from the type, it should be + # NestedDict from the base class. 
batch = self._convert_batch_type(batch) fwd_out = self._module.forward_train(batch) loss = self.compute_loss(fwd_out=fwd_out, batch=batch) @@ -246,7 +246,6 @@ def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: return self.compile_results(batch, fwd_out, loss, post_processed_gradients) def _convert_batch_type(self, batch): - # TODO: remove this method, it should be handled by the base class. batch = NestedDict(batch.policy_batches) batch = NestedDict( {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()} @@ -478,7 +477,7 @@ def get_parameters(self, module: RLModule) -> Sequence[ParamType]: Returns: The parameters of the module. """ - # TODO: Make this method a classmethod + # TODO (Kourosh): Make this method a classmethod @abc.abstractmethod def get_optimizer_obj( diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index cd60feef576b..0729fde4c795 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -21,7 +21,6 @@ ParamType, ParamDictType, ) -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule from ray.rllib.core.rl_module.rl_module import RLModule, ModuleID from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override @@ -164,8 +163,8 @@ def apply_gradients(self, gradients: Dict[ParamRef, TensorType]) -> None: @override(RLTrainer) def _make_distributed(self) -> MultiAgentRLModule: - # TODO: Does strategy has to be an attribute here? if so it's very hidden to - # the user of this class that there is such an attribute. + # TODO (Kourosh): Does strategy has to be an attribute here? if so it's very + # hidden to the user of this class that there is such an attribute. # TODO (Kourosh, Avnish): The optimizers still need to be created within # strategy.scope. Otherwise parameters of optimizers won't be properly diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 45e394274a7d..702f8dcf5758 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -123,6 +123,7 @@ def get_parameters(self, module: RLModule) -> Sequence[ParamType]: def get_optimizer_obj( self, module: RLModule, optimizer_cls: Type[Optimizer] ) -> Optimizer: - # TODO: the abstraction should take in optimizer_config as a parameter as well. + # TODO (Kourosh): the abstraction should take in optimizer_config as a + # parameter as well. 
lr = self.optimizer_config.get("lr", 1e-3) return optimizer_cls(module.parameters, lr=lr) From 9719518238e7c8fa200bded898fd23172504cdbb Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 13 Jan 2023 14:57:22 -0800 Subject: [PATCH 011/112] wip: trainer_runner multi-gpu test Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_module/marl_module.py | 16 ++ rllib/core/rl_trainer/rl_trainer.py | 43 +++-- .../tests/test_trainer_runner_torch.py | 153 ++++++++++++++++++ rllib/core/rl_trainer/tf/tf_rl_trainer.py | 30 ---- .../core/rl_trainer/torch/torch_rl_trainer.py | 39 ++++- 5 files changed, 238 insertions(+), 43 deletions(-) create mode 100644 rllib/core/rl_trainer/tests/test_trainer_runner_torch.py diff --git a/rllib/core/rl_module/marl_module.py b/rllib/core/rl_module/marl_module.py index 70c4c65b3524..3c61e03b8703 100644 --- a/rllib/core/rl_module/marl_module.py +++ b/rllib/core/rl_module/marl_module.py @@ -242,6 +242,22 @@ def __getitem__(self, module_id: ModuleID) -> RLModule: self._check_module_exists(module_id) return self._rl_modules[module_id] + def __setitem__(self, module_id: ModuleID, module: RLModule) -> None: + """Modifies an existing module and assign it to the new module object. + + Args: + module_id: The module ID to add. + module: The module to add. + """ + try: + self._check_module_exists(module_id) + except ValueError: + raise ValueError( + f"Module ID {module_id} does not exist. Use add_module() to add a " + "new module." + ) + self._rl_modules[module_id] = module + @override(RLModule) def output_specs_train(self) -> SpecDict: return self._get_specs_for_modules("output_specs_train") diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 07cb1e8cb546..5b0f06a61e73 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -1,6 +1,8 @@ import abc import logging +import numpy as np +import tree # pip install dm-tree from typing import ( Any, Callable, @@ -216,9 +218,26 @@ def compile_results( Returns: A dictionary of results. """ - # TODO (Kourosh): figure out a universal compilation of results in the baseclass + loss_numpy = convert_to_numpy(postprocessed_loss) - return {"loss": loss_numpy} + batch = convert_to_numpy(batch) + breakpoint() + post_processed_gradients = convert_to_numpy(post_processed_gradients) + mean_grads = [np.mean(grad) for grad in tree.flatten(post_processed_gradients)] + ret = { + "loss": loss_numpy, + "mean_gradient": np.mean(mean_grads), + } + + if self.in_test: + # this is to check if in the multi-gpu case, the weights across workers are + # the same. It is really only needed during testing. + mean_ws = {} + for module_id in self._module.keys(): + m = self._module[module_id] + mean_ws[module_id] = np.mean([w.mean() for w in m.get_weights()]) + ret["mean_weight"] = mean_ws + return ret def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: """Perform an update on this Trainer. @@ -236,7 +255,7 @@ def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: # TODO (Kourosh): remove the MultiAgentBatch from the type, it should be - # NestedDict from the base class. + # NestedDict from the base class. 
batch = self._convert_batch_type(batch) fwd_out = self._module.forward_train(batch) loss = self.compute_loss(fwd_out=fwd_out, batch=batch) @@ -245,12 +264,18 @@ def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: self.apply_gradients(post_processed_gradients) return self.compile_results(batch, fwd_out, loss, post_processed_gradients) - def _convert_batch_type(self, batch): - batch = NestedDict(batch.policy_batches) - batch = NestedDict( - {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()} - ) - return batch + @abc.abstractmethod + def _convert_batch_type(self, batch: MultiAgentBatch) -> NestedDict[TensorType]: + """Converts a MultiAgentBatch to a NestedDict of Tensors. + + This should convert the input batch from a MultiAgentBatch format to framework specific tensor format located on the correct device. + + Args: + batch: A MultiAgentBatch. + + Returns: + A NestedDict. + """ def additional_update(self, *args, **kwargs) -> Mapping[str, Any]: """Apply additional non-gradient based updates to this Trainer. diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py b/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py new file mode 100644 index 000000000000..0c6447547544 --- /dev/null +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py @@ -0,0 +1,153 @@ +import gymnasium as gym +import unittest + +import tensorflow as tf +import ray + +from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner +from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule +from ray.rllib.core.testing.torch.bc_rl_trainer import BCTorchRLTrainer +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch +from ray.rllib.utils.test_utils import get_cartpole_dataset_reader + + +class TestTrainerRunner(unittest.TestCase): + """This test is setup for 2 gpus.""" + + # TODO: Make a unittest that does not need 2 gpus to run. + # So that the user can run it locally as well. + @classmethod + def setUp(cls) -> None: + ray.init() + + @classmethod + def tearDown(cls) -> None: + ray.shutdown() + + def test_update_multigpu(self): + """Test training in a 2 gpu setup and that weights are synchronized.""" + env = gym.make("CartPole-v1") + trainer_class = BCTorchRLTrainer + trainer_cfg = dict( + module_class=DiscreteBCTorchModule, + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_config={"lr": 1e-3}, + in_test=True, + ) + runner = TrainerRunner( + trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) + ) + + reader = get_cartpole_dataset_reader(batch_size=500) + + min_loss = float("inf") + for iter_i in range(1000): + batch = reader.next() + results_worker_0, results_worker_1 = runner.update(batch.as_multi_agent()) + + loss = ( + results_worker_0["loss"]["total_loss"] + + results_worker_1["loss"]["total_loss"] + ) / 2 + min_loss = min(loss, min_loss) + print(f"[iter = {iter_i}] Loss: {loss:.3f}, Min Loss: {min_loss:.3f}") + # The loss is initially around 0.69 (ln2). When it gets to around + # 0.57 the return of the policy gets to around 100. 
+ if min_loss < 0.57: + break + self.assertEqual( + results_worker_0["mean_weight"]["default_policy"], + results_worker_1["mean_weight"]["default_policy"], + ) + self.assertLess(min_loss, 0.57) + + def test_add_remove_module(self): + env = gym.make("CartPole-v1") + trainer_class = BCTorchRLTrainer + trainer_cfg = dict( + module_class=DiscreteBCTorchModule, + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_config={"lr": 1e-3}, + in_test=True, + ) + runner = TrainerRunner( + trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) + ) + + reader = get_cartpole_dataset_reader(batch_size=500) + batch = reader.next() + + # update once with the default policy + results = runner.update(batch.as_multi_agent()) + module_ids_before_add = {DEFAULT_POLICY_ID} + new_module_id = "test_module" + + # add a test_module + runner.add_module( + module_id=new_module_id, + module_cls=DiscreteBCTorchModule, + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_cls=tf.keras.optimizers.Adam, + ) + + # do training that includes the test_module + results = runner.update( + MultiAgentBatch( + {new_module_id: batch, DEFAULT_POLICY_ID: batch}, batch.count + ) + ) + + # check that module weights are updated across workers and synchronized + for i in range(1, len(results)): + for module_id in results[i]["mean_weight"].keys(): + assert ( + results[i]["mean_weight"][module_id] + == results[i - 1]["mean_weight"][module_id] + ) + + # check that module ids are updated to include the new module + module_ids_after_add = {DEFAULT_POLICY_ID, new_module_id} + for result in results: + # remove the total_loss key since its not a module key + self.assertEqual(set(result["loss"]) - {"total_loss"}, module_ids_after_add) + + # remove the test_module + runner.remove_module(module_id=new_module_id) + + # run training without the test_module + results = runner.update(batch.as_multi_agent()) + + # check that module weights are updated across workers and synchronized + for i in range(1, len(results)): + for module_id in results[i]["mean_weight"].keys(): + assert ( + results[i]["mean_weight"][module_id] + == results[i - 1]["mean_weight"][module_id] + ) + + # check that module ids are updated after remove operation to not + # include the new module + for result in results: + # remove the total_loss key since its not a module key + self.assertEqual( + set(result["loss"]) - {"total_loss"}, module_ids_before_add + ) + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 0729fde4c795..ffb448b7379a 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -1,5 +1,4 @@ import logging -import numpy as np from typing import ( Any, Mapping, @@ -27,8 +26,6 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict -from ray.rllib.utils.numpy import convert_to_numpy -import tree # pip install dm-tree tf1, tf, tfv = try_import_tf() tf1.enable_eager_execution() @@ -174,33 +171,6 @@ def _make_distributed(self) -> MultiAgentRLModule: module = self._make_module() return module - @override(RLTrainer) - def compile_results( - self, - batch: NestedDict, - fwd_out: 
Mapping[str, Any], - postprocessed_loss: Mapping[str, Any], - post_processed_gradients: Mapping[str, Any], - ) -> Mapping[str, Any]: - loss_numpy = convert_to_numpy(postprocessed_loss) - batch = convert_to_numpy(batch) - post_processed_gradients = convert_to_numpy(post_processed_gradients) - mean_grads = [grad.mean() for grad in tree.flatten(post_processed_gradients)] - ret = { - "loss": loss_numpy, - "mean_gradient": np.mean(mean_grads), - } - - if self.in_test: - # this is to check if in the multi-gpu case, the weights across workers are - # the same. It is really only needed during testing. - mean_ws = {} - for module_id in self._module.keys(): - m = self._module[module_id] - mean_ws[module_id] = np.mean([w.mean() for w in m.get_weights()]) - ret["mean_weight"] = mean_ws - return ret - @override(RLTrainer) def add_module( self, diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 702f8dcf5758..c5732d3ca92d 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -24,6 +24,7 @@ from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType +from ray.rllib.utils.nested_dict import NestedDict logger = logging.getLogger(__name__) @@ -49,8 +50,9 @@ def __init__( ) gpu_ids = ray.get_gpu_ids() - self._world_size = scaling_config.get("num_workers", 1) + self._world_size = scaling_config.get("num_workers", 2) self._gpu_id = gpu_ids[0] if gpu_ids else None + self._device = torch.device(self._gpu_id if gpu_ids else "cpu") @property @override(RLTrainer) @@ -99,13 +101,42 @@ def apply_gradients(self, gradients: ParamDictType) -> None: def _make_distributed(self) -> MultiAgentRLModule: module = self._make_module() pg = torch.distributed.new_group(list(range(self._world_size))) + + class DDPRLModuleWrapper(DDP): + def forward_train(self, *args, **kwargs): + return self.module.forward_train(*args, **kwargs) + if self._gpu_id is not None: - module.to(self._gpu_id) - module = DDP(module, device_ids=[self._gpu_id], process_group=pg) + + # if the module is a MultiAgentRLModule and nn.Module we can simply assume + # all the submodules are registered. Otherwise, we need to loop through + # each submodule and move it to the correct device. + # TODO (Kourosh): This can result in missing modules if the user does not + # register them in the MultiAgentRLModule. We should find a better way to + # handle this. 
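+            # When the whole MultiAgentRLModule is itself an nn.Module, it can be
+            # moved to the device and DDP-wrapped as a single unit; otherwise each
+            # sub-module gets its own DDPRLModuleWrapper below.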
+ if isinstance(module, torch.nn.Module): + module.to(self._device) + module = DDPRLModuleWrapper(module, device_ids=[self._gpu_id], process_group=pg) + else: + for key in module.keys(): + module[key].to(self._device) + module[key] = DDPRLModuleWrapper(module[key], device_ids=[self._gpu_id], process_group=pg) else: - module = DDP(module, process_group=pg) + if isinstance(module, torch.nn.Module): + module = DDPRLModuleWrapper(module, process_group=pg) + else: + for key in module.keys(): + module[key] = DDPRLModuleWrapper(module[key], process_group=pg) return module + @override(RLTrainer) + def _convert_batch_type(self, batch: MultiAgentBatch): + batch = NestedDict(batch.policy_batches) + batch = NestedDict( + {k: torch.as_tensor(v, dtype=torch.float32, device=self._device) for k, v in batch.items()} + ) + return batch + @override(RLTrainer) def do_distributed_update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: # in torch the distributed update is no different than the normal update From 77730d804e8b042fc9ba4722904fecd7731f7f3d Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 13 Jan 2023 16:15:46 -0800 Subject: [PATCH 012/112] torch version runs but the parameters are not synced Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 21 ++++++++----- .../tests/test_trainer_runner_torch.py | 2 +- .../core/rl_trainer/torch/torch_rl_trainer.py | 31 +++++++++++++------ 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 5b0f06a61e73..58b6bd6b9fab 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -2,7 +2,6 @@ import logging import numpy as np -import tree # pip install dm-tree from typing import ( Any, Callable, @@ -218,12 +217,16 @@ def compile_results( Returns: A dictionary of results. """ - + # TODO (Kourosh): This method assumes that all the modules with in the + # marl_module are accessible via looping through it rl_modules. This may not be + # true for centralized critic for example. Therefore we need a better + # generalization of this base-class implementation. loss_numpy = convert_to_numpy(postprocessed_loss) batch = convert_to_numpy(batch) - breakpoint() - post_processed_gradients = convert_to_numpy(post_processed_gradients) - mean_grads = [np.mean(grad) for grad in tree.flatten(post_processed_gradients)] + mean_grads = [ + np.mean(grad) + for grad in convert_to_numpy(post_processed_gradients.values()) + ] ret = { "loss": loss_numpy, "mean_gradient": np.mean(mean_grads), @@ -235,7 +238,8 @@ def compile_results( mean_ws = {} for module_id in self._module.keys(): m = self._module[module_id] - mean_ws[module_id] = np.mean([w.mean() for w in m.get_weights()]) + parameters = convert_to_numpy(self.get_parameters(m)) + mean_ws[module_id] = np.mean([w.mean() for w in parameters]) ret["mean_weight"] = mean_ws return ret @@ -267,8 +271,9 @@ def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: @abc.abstractmethod def _convert_batch_type(self, batch: MultiAgentBatch) -> NestedDict[TensorType]: """Converts a MultiAgentBatch to a NestedDict of Tensors. - - This should convert the input batch from a MultiAgentBatch format to framework specific tensor format located on the correct device. + + This should convert the input batch from a MultiAgentBatch format to framework + specific tensor format located on the correct device. Args: batch: A MultiAgentBatch. 
diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py b/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py index 0c6447547544..ce6caafe874a 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py @@ -35,7 +35,7 @@ def test_update_multigpu(self): "action_space": env.action_space, "model_config": {"hidden_dim": 32}, }, - optimizer_config={"lr": 1e-3}, + optimizer_config={"lr": 1e-1}, in_test=True, ) runner = TrainerRunner( diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index c5732d3ca92d..e893b35dbcea 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -105,22 +105,32 @@ def _make_distributed(self) -> MultiAgentRLModule: class DDPRLModuleWrapper(DDP): def forward_train(self, *args, **kwargs): return self.module.forward_train(*args, **kwargs) - + + def get_weights(self, *args, **kwargs): + return self.module.get_weights(*args, **kwargs) + + def set_weights(self, *args, **kwargs): + self.module.set_weights(*args, **kwargs) + if self._gpu_id is not None: - # if the module is a MultiAgentRLModule and nn.Module we can simply assume - # all the submodules are registered. Otherwise, we need to loop through - # each submodule and move it to the correct device. - # TODO (Kourosh): This can result in missing modules if the user does not - # register them in the MultiAgentRLModule. We should find a better way to + # if the module is a MultiAgentRLModule and nn.Module we can simply assume + # all the submodules are registered. Otherwise, we need to loop through + # each submodule and move it to the correct device. + # TODO (Kourosh): This can result in missing modules if the user does not + # register them in the MultiAgentRLModule. We should find a better way to # handle this. 
if isinstance(module, torch.nn.Module): module.to(self._device) - module = DDPRLModuleWrapper(module, device_ids=[self._gpu_id], process_group=pg) + module = DDPRLModuleWrapper( + module, device_ids=[self._gpu_id], process_group=pg + ) else: for key in module.keys(): module[key].to(self._device) - module[key] = DDPRLModuleWrapper(module[key], device_ids=[self._gpu_id], process_group=pg) + module[key] = DDPRLModuleWrapper( + module[key], device_ids=[self._gpu_id], process_group=pg + ) else: if isinstance(module, torch.nn.Module): module = DDPRLModuleWrapper(module, process_group=pg) @@ -133,7 +143,10 @@ def forward_train(self, *args, **kwargs): def _convert_batch_type(self, batch: MultiAgentBatch): batch = NestedDict(batch.policy_batches) batch = NestedDict( - {k: torch.as_tensor(v, dtype=torch.float32, device=self._device) for k, v in batch.items()} + { + k: torch.as_tensor(v, dtype=torch.float32, device=self._device) + for k, v in batch.items() + } ) return batch From 97573dc54858d0c5ba4e49f83e0a0c1a19af1184 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 13 Jan 2023 23:25:22 -0800 Subject: [PATCH 013/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 2 +- .../core/rl_trainer/torch/torch_rl_trainer.py | 58 +++++++++---------- rllib/core/rl_trainer/trainer_runner.py | 11 ++-- 3 files changed, 33 insertions(+), 38 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 58b6bd6b9fab..07846eb05866 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -272,7 +272,7 @@ def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: def _convert_batch_type(self, batch: MultiAgentBatch) -> NestedDict[TensorType]: """Converts a MultiAgentBatch to a NestedDict of Tensors. - This should convert the input batch from a MultiAgentBatch format to framework + This should convert the input batch from a MultiAgentBatch format to framework specific tensor format located on the correct device. Args: diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index e893b35dbcea..1578ddbca2dd 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -10,8 +10,6 @@ import torch from torch.nn.parallel import DistributedDataParallel as DDP -import ray - from ray.rllib.core.rl_module import RLModule from ray.rllib.core.rl_trainer.rl_trainer import ( RLTrainer, @@ -26,7 +24,6 @@ from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict - logger = logging.getLogger(__name__) @@ -49,10 +46,12 @@ def __init__( in_test=in_test, ) - gpu_ids = ray.get_gpu_ids() - self._world_size = scaling_config.get("num_workers", 2) - self._gpu_id = gpu_ids[0] if gpu_ids else None - self._device = torch.device(self._gpu_id if gpu_ids else "cpu") + self._world_size = scaling_config.get("num_workers", 1) + self._use_gpu = scaling_config.get("use_gpu", False) + # TODO (Kourosh): This RLTrainer assumes that each actor is a single GPU actor + # at most. If GPU is not enabled it is a CPU actor. ray.gpu_ids() will return + # the accessible devices. 
So we can ideally do model parallelism as well + self._device = torch.device("cuda" if self._use_gpu else "cpu") @property @override(RLTrainer) @@ -100,7 +99,7 @@ def apply_gradients(self, gradients: ParamDictType) -> None: @override(RLTrainer) def _make_distributed(self) -> MultiAgentRLModule: module = self._make_module() - pg = torch.distributed.new_group(list(range(self._world_size))) + # pg = torch.distributed.new_group(list(range(self._world_size))) class DDPRLModuleWrapper(DDP): def forward_train(self, *args, **kwargs): @@ -112,31 +111,26 @@ def get_weights(self, *args, **kwargs): def set_weights(self, *args, **kwargs): self.module.set_weights(*args, **kwargs) - if self._gpu_id is not None: - - # if the module is a MultiAgentRLModule and nn.Module we can simply assume - # all the submodules are registered. Otherwise, we need to loop through - # each submodule and move it to the correct device. - # TODO (Kourosh): This can result in missing modules if the user does not - # register them in the MultiAgentRLModule. We should find a better way to - # handle this. - if isinstance(module, torch.nn.Module): - module.to(self._device) - module = DDPRLModuleWrapper( - module, device_ids=[self._gpu_id], process_group=pg - ) - else: - for key in module.keys(): - module[key].to(self._device) - module[key] = DDPRLModuleWrapper( - module[key], device_ids=[self._gpu_id], process_group=pg - ) + # if self._use_gpu: + # if the module is a MultiAgentRLModule and nn.Module we can simply assume + # all the submodules are registered. Otherwise, we need to loop through + # each submodule and move it to the correct device. + # TODO (Kourosh): This can result in missing modules if the user does not + # register them in the MultiAgentRLModule. We should find a better way to + # handle this. + print(f"device = {self._device}") + if isinstance(module, torch.nn.Module): + module.to(self._device) + module = DDPRLModuleWrapper( + module # , device_ids=[self._device]# , process_group=pg + ) else: - if isinstance(module, torch.nn.Module): - module = DDPRLModuleWrapper(module, process_group=pg) - else: - for key in module.keys(): - module[key] = DDPRLModuleWrapper(module[key], process_group=pg) + for key in module.keys(): + module[key].to(self._device) + module[key] = DDPRLModuleWrapper( + module[key] # , device_ids=[self._device]# , process_group=pg + ) + return module @override(RLTrainer) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 3a0717025e4a..a7a66bb13be9 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -61,7 +61,7 @@ def __init__( if issubclass(trainer_class, TorchRLTrainer): from ray.train.torch import TorchConfig - backend_config = TorchConfig() + backend_config = TorchConfig(backend="gloo") elif issubclass(trainer_class, TfRLTrainer): from ray.train.tensorflow import TensorflowConfig @@ -78,10 +78,11 @@ def __init__( # with default 0 ) - # TODO: let's not pass this into the config which will cause + # TODO (Kourosh): let's not pass this into the config which will cause # information leakage into the SARLTrainer about other workers. - scaling_config = {"world_size": resources["num_workers"]} - trainer_config["scaling_config"] = scaling_config + # TODO (Kourosh): Scaling config right now goes through a weird path to get + # computed. 
+ trainer_config["scaling_config"] = resources trainer_config["distributed"] = bool(self._compute_config["num_gpus"] > 1) self.backend_executor.start( train_cls=trainer_class, train_cls_kwargs=trainer_config @@ -105,7 +106,7 @@ def _compute_necessary_resources(self): elif not num_gpus and not num_workers: num_workers = 1 - return {"num_workers": num_workers, "use_gpu": bool(num_gpus)} + return {"num_workers": num_workers, "use_gpu": num_gpus > 0} def update(self, batch: MultiAgentBatch = None, **kwargs): """ From 091c4067648cee5d9d54b3f58cd70b7ae3a53bda Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 17 Jan 2023 11:41:35 -0800 Subject: [PATCH 014/112] got the multi-gpu gradient sync up working Signed-off-by: Kourosh Hakhamaneshi --- .../tests/test_trainer_runner_torch.py | 8 +++---- .../core/rl_trainer/torch/torch_rl_trainer.py | 23 +++++++++++-------- rllib/core/rl_trainer/trainer_runner.py | 4 ++-- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py b/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py index ce6caafe874a..121dbf7452c6 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py @@ -1,7 +1,7 @@ import gymnasium as gym import unittest -import tensorflow as tf +import torch import ray from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner @@ -35,7 +35,7 @@ def test_update_multigpu(self): "action_space": env.action_space, "model_config": {"hidden_dim": 32}, }, - optimizer_config={"lr": 1e-1}, + optimizer_config={"lr": 0.1}, in_test=True, ) runner = TrainerRunner( @@ -75,7 +75,7 @@ def test_add_remove_module(self): "action_space": env.action_space, "model_config": {"hidden_dim": 32}, }, - optimizer_config={"lr": 1e-3}, + optimizer_config={"lr": 0.1}, in_test=True, ) runner = TrainerRunner( @@ -99,7 +99,7 @@ def test_add_remove_module(self): "action_space": env.action_space, "model_config": {"hidden_dim": 32}, }, - optimizer_cls=tf.keras.optimizers.Adam, + optimizer_cls=torch.optim.Adam, ) # do training that includes the test_module diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 1578ddbca2dd..47bc514be722 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -10,6 +10,8 @@ import torch from torch.nn.parallel import DistributedDataParallel as DDP +from ray.train.torch.train_loop_utils import _TorchAccelerator + from ray.rllib.core.rl_module import RLModule from ray.rllib.core.rl_trainer.rl_trainer import ( RLTrainer, @@ -48,11 +50,7 @@ def __init__( self._world_size = scaling_config.get("num_workers", 1) self._use_gpu = scaling_config.get("use_gpu", False) - # TODO (Kourosh): This RLTrainer assumes that each actor is a single GPU actor - # at most. If GPU is not enabled it is a CPU actor. ray.gpu_ids() will return - # the accessible devices. 
So we can ideally do model parallelism as well - self._device = torch.device("cuda" if self._use_gpu else "cpu") - + @property @override(RLTrainer) def module(self) -> MultiAgentRLModule: @@ -79,6 +77,7 @@ def compute_gradients( optim.zero_grad(set_to_none=True) loss[self.TOTAL_LOSS_KEY].backward() grads = {pid: p.grad for pid, p in self._params.items()} + return grads @override(RLTrainer) @@ -96,14 +95,20 @@ def apply_gradients(self, gradients: ParamDictType) -> None: for optim in self._optim_to_param: optim.step() + + @override(RLTrainer) def _make_distributed(self) -> MultiAgentRLModule: module = self._make_module() - # pg = torch.distributed.new_group(list(range(self._world_size))) + + # TODO (Kourosh): How do we handle model parallism? + # TODO (Kourosh): Instead of using _TorchAccelerator, we should use the public + # api in ray.train but allow for session to be None without any errors raised. + self._device = _TorchAccelerator().get_device() class DDPRLModuleWrapper(DDP): def forward_train(self, *args, **kwargs): - return self.module.forward_train(*args, **kwargs) + return self(*args, **kwargs) def get_weights(self, *args, **kwargs): return self.module.get_weights(*args, **kwargs) @@ -111,14 +116,12 @@ def get_weights(self, *args, **kwargs): def set_weights(self, *args, **kwargs): self.module.set_weights(*args, **kwargs) - # if self._use_gpu: # if the module is a MultiAgentRLModule and nn.Module we can simply assume # all the submodules are registered. Otherwise, we need to loop through # each submodule and move it to the correct device. # TODO (Kourosh): This can result in missing modules if the user does not # register them in the MultiAgentRLModule. We should find a better way to # handle this. - print(f"device = {self._device}") if isinstance(module, torch.nn.Module): module.to(self._device) module = DDPRLModuleWrapper( @@ -128,7 +131,7 @@ def set_weights(self, *args, **kwargs): for key in module.keys(): module[key].to(self._device) module[key] = DDPRLModuleWrapper( - module[key] # , device_ids=[self._device]# , process_group=pg + module[key], #device_ids=[self._device]# , process_group=pg ) return module diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index a7a66bb13be9..947e0d8eb6de 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -60,8 +60,8 @@ def __init__( # the only part of this class that is framework agnostic: if issubclass(trainer_class, TorchRLTrainer): from ray.train.torch import TorchConfig - - backend_config = TorchConfig(backend="gloo") + + backend_config = TorchConfig() elif issubclass(trainer_class, TfRLTrainer): from ray.train.tensorflow import TensorflowConfig From a42b0f1e6a0b92dde07b5526d9f41cb22b6711e3 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 17 Jan 2023 14:21:50 -0800 Subject: [PATCH 015/112] fixed add/remove multi-gpu tests Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 1 + .../core/rl_trainer/torch/torch_rl_trainer.py | 84 ++++++++++++++----- 2 files changed, 65 insertions(+), 20 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 07846eb05866..58c29c336415 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -454,6 +454,7 @@ def build(self) -> None: self._optim_to_param[optimizer].append(param_ref) self._params[param_ref] = param self._param_to_optim[param_ref] = optimizer + def do_distributed_update(self, 
batch: MultiAgentBatch) -> Mapping[str, Any]: """Perform a distributed update on this Trainer. diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 47bc514be722..33f6919d93ad 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -6,13 +6,15 @@ Type, Sequence, Hashable, + Optional, + Callable, ) import torch from torch.nn.parallel import DistributedDataParallel as DDP from ray.train.torch.train_loop_utils import _TorchAccelerator -from ray.rllib.core.rl_module import RLModule +from ray.rllib.core.rl_module.rl_module import RLModule, ModuleID from ray.rllib.core.rl_trainer.rl_trainer import ( RLTrainer, MultiAgentRLModule, @@ -29,6 +31,41 @@ logger = logging.getLogger(__name__) + +class DDPRLModuleWrapper(DDP, RLModule): + + @override(RLModule) + def _forward_train(self, *args, **kwargs): + return self(*args, **kwargs) + + @override(RLModule) + def _forward_inference(self, *args, **kwargs) -> Mapping[str, Any]: + return self.module._forward_inference(*args, **kwargs) + + @override(RLModule) + def _forward_exploration(self, *args, **kwargs) -> Mapping[str, Any]: + return self.module._forward_exploration(*args, **kwargs) + + @override(RLModule) + def get_state(self, *args, **kwargs): + return self.module.get_state(*args, **kwargs) + + @override(RLModule) + def set_state(self, *args, **kwargs): + self.module.set_state(*args, **kwargs) + + @override(RLModule) + def make_distributed(self, dist_config: Mapping[str, Any] = None) -> None: + # TODO (Kourosh): Not to sure about this make_distributed api belonging to + # RLModule or not? we should see if we use this api end-point for both tf and + # torch instead of doing it in the trainer. + pass + + @override(RLModule) + def is_distributed(self) -> bool: + return True + + class TorchRLTrainer(RLTrainer): def __init__( self, @@ -54,8 +91,6 @@ def __init__( @property @override(RLTrainer) def module(self) -> MultiAgentRLModule: - if self.distributed: - return self._module.module return self._module @override(RLTrainer) @@ -106,16 +141,6 @@ def _make_distributed(self) -> MultiAgentRLModule: # api in ray.train but allow for session to be None without any errors raised. self._device = _TorchAccelerator().get_device() - class DDPRLModuleWrapper(DDP): - def forward_train(self, *args, **kwargs): - return self(*args, **kwargs) - - def get_weights(self, *args, **kwargs): - return self.module.get_weights(*args, **kwargs) - - def set_weights(self, *args, **kwargs): - self.module.set_weights(*args, **kwargs) - # if the module is a MultiAgentRLModule and nn.Module we can simply assume # all the submodules are registered. Otherwise, we need to loop through # each submodule and move it to the correct device. @@ -124,15 +149,11 @@ def set_weights(self, *args, **kwargs): # handle this. if isinstance(module, torch.nn.Module): module.to(self._device) - module = DDPRLModuleWrapper( - module # , device_ids=[self._device]# , process_group=pg - ) + module = DDPRLModuleWrapper(module) else: for key in module.keys(): module[key].to(self._device) - module[key] = DDPRLModuleWrapper( - module[key], #device_ids=[self._device]# , process_group=pg - ) + module[key] = DDPRLModuleWrapper(module[key]) return module @@ -167,4 +188,27 @@ def get_optimizer_obj( # TODO (Kourosh): the abstraction should take in optimizer_config as a # parameter as well. 
lr = self.optimizer_config.get("lr", 1e-3) - return optimizer_cls(module.parameters, lr=lr) + return optimizer_cls(module.parameters(), lr=lr) + + + @override(RLTrainer) + def add_module( + self, + *, + module_id: ModuleID, + module_cls: Type[RLModule], + module_kwargs: Mapping[str, Any], + set_optimizer_fn: Optional[Callable[[RLModule], ParamOptimizerPairs]] = None, + optimizer_cls: Optional[Type[Optimizer]] = None, + ) -> None: + super().add_module( + module_id=module_id, + module_cls=module_cls, + module_kwargs=module_kwargs, + set_optimizer_fn=set_optimizer_fn, + optimizer_cls=optimizer_cls, + ) + + # we need to ddpify the module that was just added to the pool + self._module[module_id].to(self._device) + self._module[module_id] = DDPRLModuleWrapper(self._module[module_id]) From 62fe11f682f183f92492133b3cbf04ce2250427b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 17 Jan 2023 14:27:04 -0800 Subject: [PATCH 016/112] moved the DDPRLModuleWrapper outside of RLTrainer + lint Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_module/torch/torch_rl_module.py | 39 ++++++++++++++++ rllib/core/rl_trainer/rl_trainer.py | 1 - .../core/rl_trainer/torch/torch_rl_trainer.py | 46 ++----------------- rllib/core/rl_trainer/trainer_runner.py | 2 +- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/rllib/core/rl_module/torch/torch_rl_module.py b/rllib/core/rl_module/torch/torch_rl_module.py index 52a531379040..776cf42df480 100644 --- a/rllib/core/rl_module/torch/torch_rl_module.py +++ b/rllib/core/rl_module/torch/torch_rl_module.py @@ -4,6 +4,12 @@ from ray.rllib.core.rl_module import RLModule torch, nn = try_import_torch() +if torch: + from torch.nn.parallel import DistributedDataParallel as DDP +else: + raise RuntimeError( + "Torch is not installed. Please install torch or do pip install ray[rllib]." + ) class TorchRLModule(nn.Module, RLModule): @@ -39,3 +45,36 @@ def is_distributed(self) -> bool: """Returns True if the module is distributed.""" # TODO (Avnish): Implement this. return False + + +class DDPRLModuleWrapper(DDP, RLModule): + @override(RLModule) + def _forward_train(self, *args, **kwargs): + return self(*args, **kwargs) + + @override(RLModule) + def _forward_inference(self, *args, **kwargs) -> Mapping[str, Any]: + return self.module._forward_inference(*args, **kwargs) + + @override(RLModule) + def _forward_exploration(self, *args, **kwargs) -> Mapping[str, Any]: + return self.module._forward_exploration(*args, **kwargs) + + @override(RLModule) + def get_state(self, *args, **kwargs): + return self.module.get_state(*args, **kwargs) + + @override(RLModule) + def set_state(self, *args, **kwargs): + self.module.set_state(*args, **kwargs) + + @override(RLModule) + def make_distributed(self, dist_config: Mapping[str, Any] = None) -> None: + # TODO (Kourosh): Not to sure about this make_distributed api belonging to + # RLModule or not? we should see if we use this api end-point for both tf and + # torch instead of doing it in the trainer. 
+ pass + + @override(RLModule) + def is_distributed(self) -> bool: + return True diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 58c29c336415..07846eb05866 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -454,7 +454,6 @@ def build(self) -> None: self._optim_to_param[optimizer].append(param_ref) self._params[param_ref] = param self._param_to_optim[param_ref] = optimizer - def do_distributed_update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: """Perform a distributed update on this Trainer. diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 33f6919d93ad..50ca2a1aa434 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -10,7 +10,6 @@ Callable, ) import torch -from torch.nn.parallel import DistributedDataParallel as DDP from ray.train.torch.train_loop_utils import _TorchAccelerator @@ -23,6 +22,7 @@ ParamType, ParamDictType, ) +from ray.rllib.core.rl_module.torch.torch_rl_module import DDPRLModuleWrapper from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType @@ -31,41 +31,6 @@ logger = logging.getLogger(__name__) - -class DDPRLModuleWrapper(DDP, RLModule): - - @override(RLModule) - def _forward_train(self, *args, **kwargs): - return self(*args, **kwargs) - - @override(RLModule) - def _forward_inference(self, *args, **kwargs) -> Mapping[str, Any]: - return self.module._forward_inference(*args, **kwargs) - - @override(RLModule) - def _forward_exploration(self, *args, **kwargs) -> Mapping[str, Any]: - return self.module._forward_exploration(*args, **kwargs) - - @override(RLModule) - def get_state(self, *args, **kwargs): - return self.module.get_state(*args, **kwargs) - - @override(RLModule) - def set_state(self, *args, **kwargs): - self.module.set_state(*args, **kwargs) - - @override(RLModule) - def make_distributed(self, dist_config: Mapping[str, Any] = None) -> None: - # TODO (Kourosh): Not to sure about this make_distributed api belonging to - # RLModule or not? we should see if we use this api end-point for both tf and - # torch instead of doing it in the trainer. - pass - - @override(RLModule) - def is_distributed(self) -> bool: - return True - - class TorchRLTrainer(RLTrainer): def __init__( self, @@ -87,7 +52,7 @@ def __init__( self._world_size = scaling_config.get("num_workers", 1) self._use_gpu = scaling_config.get("use_gpu", False) - + @property @override(RLTrainer) def module(self) -> MultiAgentRLModule: @@ -130,15 +95,13 @@ def apply_gradients(self, gradients: ParamDictType) -> None: for optim in self._optim_to_param: optim.step() - - @override(RLTrainer) def _make_distributed(self) -> MultiAgentRLModule: module = self._make_module() # TODO (Kourosh): How do we handle model parallism? - # TODO (Kourosh): Instead of using _TorchAccelerator, we should use the public - # api in ray.train but allow for session to be None without any errors raised. + # TODO (Kourosh): Instead of using _TorchAccelerator, we should use the public + # api in ray.train but allow for session to be None without any errors raised. 
self._device = _TorchAccelerator().get_device() # if the module is a MultiAgentRLModule and nn.Module we can simply assume @@ -190,7 +153,6 @@ def get_optimizer_obj( lr = self.optimizer_config.get("lr", 1e-3) return optimizer_cls(module.parameters(), lr=lr) - @override(RLTrainer) def add_module( self, diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 947e0d8eb6de..c7744a3950de 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -60,7 +60,7 @@ def __init__( # the only part of this class that is framework agnostic: if issubclass(trainer_class, TorchRLTrainer): from ray.train.torch import TorchConfig - + backend_config = TorchConfig() elif issubclass(trainer_class, TfRLTrainer): from ray.train.tensorflow import TensorflowConfig From 2cc51854250476a5a5a82a5e11e5b9c922a2f3cd Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 17 Jan 2023 14:56:05 -0800 Subject: [PATCH 017/112] merged tf and torch train_runner tests Signed-off-by: Kourosh Hakhamaneshi --- .../rl_trainer/tests/test_trainer_runner.py | 266 ++++++++++-------- 1 file changed, 154 insertions(+), 112 deletions(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index e64c143d7b32..0c62e0419acc 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -2,14 +2,73 @@ import unittest import tensorflow as tf +import torch import ray +from typing import Type, Union + +from ray.rllib.core.rl_module import RLModule +from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner -from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule -from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import get_cartpole_dataset_reader +Optimizer = Union[tf.keras.optimizers.Optimizer, torch.optim.Optimizer] + + +def _get_trainer_class(framework: str) -> Type[RLTrainer]: + if framework == "tf": + from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer + + return BCTfRLTrainer + elif framework == "torch": + from ray.rllib.core.testing.torch.bc_rl_trainer import BCTorchRLTrainer + + return BCTorchRLTrainer + else: + raise ValueError(f"Unsupported framework: {framework}") + + +def _get_module_class(framework: str) -> Type[RLModule]: + if framework == "tf": + from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule + + return DiscreteBCTFModule + elif framework == "torch": + from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule + + return DiscreteBCTorchModule + else: + raise ValueError(f"Unsupported framework: {framework}") + + +def _get_optimizer_default_class(framework: str) -> Type[Optimizer]: + if framework == "tf": + return tf.keras.optimizers.Adam + elif framework == "torch": + return torch.optim.Adam + else: + raise ValueError(f"Unsupported framework: {framework}") + + +def _get_trainer_runner( + framework: str, env: gym.Env, compute_config: dict +) -> TrainerRunner: + trainer_class = _get_trainer_class(framework) + trainer_cfg = dict( + module_class=_get_module_class(framework), + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_config={"lr": 0.1}, + in_test=True, + ) + runner = 
TrainerRunner(trainer_class, trainer_cfg, compute_config=compute_config) + + return runner + class TestTrainerRunner(unittest.TestCase): """This test is setup for 2 gpus.""" @@ -26,125 +85,108 @@ def tearDown(cls) -> None: def test_update_multigpu(self): """Test training in a 2 gpu setup and that weights are synchronized.""" - env = gym.make("CartPole-v1") - trainer_class = BCTfRLTrainer - trainer_cfg = dict( - module_class=DiscreteBCTFModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, - optimizer_config={"lr": 1e-3}, - in_test=True, - ) - runner = TrainerRunner( - trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) - ) - - reader = get_cartpole_dataset_reader(batch_size=500) - - min_loss = float("inf") - for iter_i in range(1000): - batch = reader.next() - results_worker_0, results_worker_1 = runner.update(batch.as_multi_agent()) - - loss = ( - results_worker_0["loss"]["total_loss"] - + results_worker_1["loss"]["total_loss"] - ) / 2 - min_loss = min(loss, min_loss) - print(f"[iter = {iter_i}] Loss: {loss:.3f}, Min Loss: {min_loss:.3f}") - # The loss is initially around 0.69 (ln2). When it gets to around - # 0.57 the return of the policy gets to around 100. - if min_loss < 0.57: - break - self.assertEqual( - results_worker_0["mean_weight"]["default_policy"], - results_worker_1["mean_weight"]["default_policy"], - ) - self.assertLess(min_loss, 0.57) - def test_add_remove_module(self): - env = gym.make("CartPole-v1") - trainer_class = BCTfRLTrainer - trainer_cfg = dict( - module_class=DiscreteBCTFModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, - optimizer_config={"lr": 1e-3}, - in_test=True, - ) - runner = TrainerRunner( - trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) - ) - - reader = get_cartpole_dataset_reader(batch_size=500) - batch = reader.next() - - # update once with the default policy - results = runner.update(batch.as_multi_agent()) - module_ids_before_add = {DEFAULT_POLICY_ID} - new_module_id = "test_module" - - # add a test_module - runner.add_module( - module_id=new_module_id, - module_cls=DiscreteBCTFModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, - optimizer_cls=tf.keras.optimizers.Adam, - ) - - # do training that includes the test_module - results = runner.update( - MultiAgentBatch( - {new_module_id: batch, DEFAULT_POLICY_ID: batch}, batch.count - ) - ) - - # check that module weights are updated across workers and synchronized - for i in range(1, len(results)): - for module_id in results[i]["mean_weight"].keys(): - assert ( - results[i]["mean_weight"][module_id] - == results[i - 1]["mean_weight"][module_id] + for fw in ["torch"]: # , "torch"]: + print(f"Testing framework: {fw}.") + env = gym.make("CartPole-v1") + runner = _get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) + reader = get_cartpole_dataset_reader(batch_size=500) + + min_loss = float("inf") + for iter_i in range(1000): + batch = reader.next() + res_0, res_1 = runner.update(batch.as_multi_agent()) + + loss = (res_0["loss"]["total_loss"] + res_1["loss"]["total_loss"]) / 2 + min_loss = min(loss, min_loss) + print(f"[iter = {iter_i}] Loss: {loss:.3f}, Min Loss: {min_loss:.3f}") + # The loss is initially around 0.69 (ln2). 
When it gets to around + # 0.57 the return of the policy gets to around 100. + if min_loss < 0.57: + break + self.assertEqual( + res_0["mean_weight"]["default_policy"], + res_1["mean_weight"]["default_policy"], ) + self.assertLess(min_loss, 0.57) - # check that module ids are updated to include the new module - module_ids_after_add = {DEFAULT_POLICY_ID, new_module_id} - for result in results: - # remove the total_loss key since its not a module key - self.assertEqual(set(result["loss"]) - {"total_loss"}, module_ids_after_add) + # make sure the runner resources are freed up + del runner - # remove the test_module - runner.remove_module(module_id=new_module_id) + def test_add_remove_module(self): - # run training without the test_module - results = runner.update(batch.as_multi_agent()) + for fw in ["torch"]: # , "torch"]: + print(f"Testing framework: {fw}.") + env = gym.make("CartPole-v1") + runner = _get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) + reader = get_cartpole_dataset_reader(batch_size=500) + batch = reader.next() - # check that module weights are updated across workers and synchronized - for i in range(1, len(results)): - for module_id in results[i]["mean_weight"].keys(): - assert ( - results[i]["mean_weight"][module_id] - == results[i - 1]["mean_weight"][module_id] - ) + # update once with the default policy + results = runner.update(batch.as_multi_agent()) + module_ids_before_add = {DEFAULT_POLICY_ID} + new_module_id = "test_module" + + # add a test_module + runner.add_module( + module_id=new_module_id, + module_cls=_get_module_class(fw), + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_cls=_get_optimizer_default_class(fw), + ) - # check that module ids are updated after remove operation to not - # include the new module - for result in results: - # remove the total_loss key since its not a module key - self.assertEqual( - set(result["loss"]) - {"total_loss"}, module_ids_before_add + # do training that includes the test_module + results = runner.update( + MultiAgentBatch( + {new_module_id: batch, DEFAULT_POLICY_ID: batch}, batch.count + ) ) + # check that module weights are updated across workers and synchronized + for i in range(1, len(results)): + for module_id in results[i]["mean_weight"].keys(): + assert ( + results[i]["mean_weight"][module_id] + == results[i - 1]["mean_weight"][module_id] + ) + + # check that module ids are updated to include the new module + module_ids_after_add = {DEFAULT_POLICY_ID, new_module_id} + for result in results: + # remove the total_loss key since its not a module key + self.assertEqual( + set(result["loss"]) - {"total_loss"}, module_ids_after_add + ) + + # remove the test_module + runner.remove_module(module_id=new_module_id) + + # run training without the test_module + results = runner.update(batch.as_multi_agent()) + + # check that module weights are updated across workers and synchronized + for i in range(1, len(results)): + for module_id in results[i]["mean_weight"].keys(): + assert ( + results[i]["mean_weight"][module_id] + == results[i - 1]["mean_weight"][module_id] + ) + + # check that module ids are updated after remove operation to not + # include the new module + for result in results: + # remove the total_loss key since its not a module key + self.assertEqual( + set(result["loss"]) - {"total_loss"}, module_ids_before_add + ) + + # make sure the runner resources are freed up + del runner + if __name__ == "__main__": 
import pytest From 435c3521a26a8635ee6120a287b019f8a1bc58fa Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 17 Jan 2023 16:34:04 -0800 Subject: [PATCH 018/112] fixed trainer_runner auto-scaling on a cluster where autoscaling is enabled Signed-off-by: Kourosh Hakhamaneshi --- .../rl_trainer/tests/test_trainer_runner.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 0c62e0419acc..4ffd3fe57f3a 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -4,6 +4,7 @@ import tensorflow as tf import torch import ray +import time from typing import Type, Union @@ -73,8 +74,9 @@ def _get_trainer_runner( class TestTrainerRunner(unittest.TestCase): """This test is setup for 2 gpus.""" - # TODO: Make a unittest that does not need 2 gpus to run. - # So that the user can run it locally as well. + # TODO: This unittest should also test other resource allocations like multi-cpu, + # multi-node multi-gpu, etc. + @classmethod def setUp(cls) -> None: ray.init() @@ -86,7 +88,8 @@ def tearDown(cls) -> None: def test_update_multigpu(self): """Test training in a 2 gpu setup and that weights are synchronized.""" - for fw in ["torch"]: # , "torch"]: + for fw in ["tf", "torch"]: + ray.init(ignore_reinit_error=True) print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") runner = _get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) @@ -110,12 +113,15 @@ def test_update_multigpu(self): ) self.assertLess(min_loss, 0.57) - # make sure the runner resources are freed up + # make sure the runner resources are freed up so that we don't autoscale del runner + ray.shutdown() + time.sleep(10) def test_add_remove_module(self): - for fw in ["torch"]: # , "torch"]: + for fw in ["tf", "torch"]: + ray.init(ignore_reinit_error=True) print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") runner = _get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) @@ -184,8 +190,10 @@ def test_add_remove_module(self): set(result["loss"]) - {"total_loss"}, module_ids_before_add ) - # make sure the runner resources are freed up + # make sure the runner resources are freed up so that we don't autoscale del runner + ray.shutdown() + time.sleep(10) if __name__ == "__main__": From ff845c3aa7caf7514ff2234014b77f4696305c3b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 17 Jan 2023 16:53:05 -0800 Subject: [PATCH 019/112] fix rl_trainer unittest failures. Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_module/torch/torch_rl_module.py | 5 +- .../tests/test_trainer_runner_torch.py | 153 ------------------ .../core/rl_trainer/torch/torch_rl_trainer.py | 19 ++- rllib/utils/test_utils.py | 2 +- 4 files changed, 18 insertions(+), 161 deletions(-) delete mode 100644 rllib/core/rl_trainer/tests/test_trainer_runner_torch.py diff --git a/rllib/core/rl_module/torch/torch_rl_module.py b/rllib/core/rl_module/torch/torch_rl_module.py index 776cf42df480..26995e278650 100644 --- a/rllib/core/rl_module/torch/torch_rl_module.py +++ b/rllib/core/rl_module/torch/torch_rl_module.py @@ -71,8 +71,9 @@ def set_state(self, *args, **kwargs): @override(RLModule) def make_distributed(self, dist_config: Mapping[str, Any] = None) -> None: # TODO (Kourosh): Not to sure about this make_distributed api belonging to - # RLModule or not? 
we should see if we use this api end-point for both tf and - # torch instead of doing it in the trainer. + # RLModule or the RLTrainer? For now the logic is kept in RLTrainer. + # We should see if we can use this api end-point for both tf + # and torch instead of doing it in the trainer. pass @override(RLModule) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py b/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py deleted file mode 100644 index 121dbf7452c6..000000000000 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_torch.py +++ /dev/null @@ -1,153 +0,0 @@ -import gymnasium as gym -import unittest - -import torch -import ray - -from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner -from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule -from ray.rllib.core.testing.torch.bc_rl_trainer import BCTorchRLTrainer -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch -from ray.rllib.utils.test_utils import get_cartpole_dataset_reader - - -class TestTrainerRunner(unittest.TestCase): - """This test is setup for 2 gpus.""" - - # TODO: Make a unittest that does not need 2 gpus to run. - # So that the user can run it locally as well. - @classmethod - def setUp(cls) -> None: - ray.init() - - @classmethod - def tearDown(cls) -> None: - ray.shutdown() - - def test_update_multigpu(self): - """Test training in a 2 gpu setup and that weights are synchronized.""" - env = gym.make("CartPole-v1") - trainer_class = BCTorchRLTrainer - trainer_cfg = dict( - module_class=DiscreteBCTorchModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, - optimizer_config={"lr": 0.1}, - in_test=True, - ) - runner = TrainerRunner( - trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) - ) - - reader = get_cartpole_dataset_reader(batch_size=500) - - min_loss = float("inf") - for iter_i in range(1000): - batch = reader.next() - results_worker_0, results_worker_1 = runner.update(batch.as_multi_agent()) - - loss = ( - results_worker_0["loss"]["total_loss"] - + results_worker_1["loss"]["total_loss"] - ) / 2 - min_loss = min(loss, min_loss) - print(f"[iter = {iter_i}] Loss: {loss:.3f}, Min Loss: {min_loss:.3f}") - # The loss is initially around 0.69 (ln2). When it gets to around - # 0.57 the return of the policy gets to around 100. 
- if min_loss < 0.57: - break - self.assertEqual( - results_worker_0["mean_weight"]["default_policy"], - results_worker_1["mean_weight"]["default_policy"], - ) - self.assertLess(min_loss, 0.57) - - def test_add_remove_module(self): - env = gym.make("CartPole-v1") - trainer_class = BCTorchRLTrainer - trainer_cfg = dict( - module_class=DiscreteBCTorchModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, - optimizer_config={"lr": 0.1}, - in_test=True, - ) - runner = TrainerRunner( - trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) - ) - - reader = get_cartpole_dataset_reader(batch_size=500) - batch = reader.next() - - # update once with the default policy - results = runner.update(batch.as_multi_agent()) - module_ids_before_add = {DEFAULT_POLICY_ID} - new_module_id = "test_module" - - # add a test_module - runner.add_module( - module_id=new_module_id, - module_cls=DiscreteBCTorchModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, - optimizer_cls=torch.optim.Adam, - ) - - # do training that includes the test_module - results = runner.update( - MultiAgentBatch( - {new_module_id: batch, DEFAULT_POLICY_ID: batch}, batch.count - ) - ) - - # check that module weights are updated across workers and synchronized - for i in range(1, len(results)): - for module_id in results[i]["mean_weight"].keys(): - assert ( - results[i]["mean_weight"][module_id] - == results[i - 1]["mean_weight"][module_id] - ) - - # check that module ids are updated to include the new module - module_ids_after_add = {DEFAULT_POLICY_ID, new_module_id} - for result in results: - # remove the total_loss key since its not a module key - self.assertEqual(set(result["loss"]) - {"total_loss"}, module_ids_after_add) - - # remove the test_module - runner.remove_module(module_id=new_module_id) - - # run training without the test_module - results = runner.update(batch.as_multi_agent()) - - # check that module weights are updated across workers and synchronized - for i in range(1, len(results)): - for module_id in results[i]["mean_weight"].keys(): - assert ( - results[i]["mean_weight"][module_id] - == results[i - 1]["mean_weight"][module_id] - ) - - # check that module ids are updated after remove operation to not - # include the new module - for result in results: - # remove the total_loss key since its not a module key - self.assertEqual( - set(result["loss"]) - {"total_loss"}, module_ids_before_add - ) - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 50ca2a1aa434..d7aa25747087 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -52,6 +52,8 @@ def __init__( self._world_size = scaling_config.get("num_workers", 1) self._use_gpu = scaling_config.get("use_gpu", False) + # These attributes are set in the `build` method. + self._device = None @property @override(RLTrainer) @@ -96,13 +98,19 @@ def apply_gradients(self, gradients: ParamDictType) -> None: optim.step() @override(RLTrainer) - def _make_distributed(self) -> MultiAgentRLModule: - module = self._make_module() - + def build(self) -> None: # TODO (Kourosh): How do we handle model parallism? 
# TODO (Kourosh): Instead of using _TorchAccelerator, we should use the public # api in ray.train but allow for session to be None without any errors raised. - self._device = _TorchAccelerator().get_device() + if self._use_gpu: + self._device = _TorchAccelerator().get_device() + else: + self._device = torch.device("cpu") + super().build() + + @override(RLTrainer) + def _make_distributed(self) -> MultiAgentRLModule: + module = self._make_module() # if the module is a MultiAgentRLModule and nn.Module we can simply assume # all the submodules are registered. Otherwise, we need to loop through @@ -173,4 +181,5 @@ def add_module( # we need to ddpify the module that was just added to the pool self._module[module_id].to(self._device) - self._module[module_id] = DDPRLModuleWrapper(self._module[module_id]) + if self.distributed: + self._module[module_id] = DDPRLModuleWrapper(self._module[module_id]) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 4a8e300da1b7..5f21b836fb85 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1146,7 +1146,7 @@ def get_cartpole_dataset_reader(batch_size: int = 1) -> "DatasetReader": get_dataset_and_shards, ) - path = "rllib/tests/data/cartpole/large.json" + path = "tests/data/cartpole/large.json" input_config = {"format": "json", "paths": path} dataset, _ = get_dataset_and_shards( AlgorithmConfig().offline_data(input_="dataset", input_config=input_config) From 7c3eed7819a91986b7a24d04487ce13ff20f6e9f Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 09:08:26 -0800 Subject: [PATCH 020/112] 1. renamed the DDP wrapper 2. don't do numpy conversion for batch on the base class Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_module/torch/__init__.py | 4 ++-- rllib/core/rl_module/torch/torch_rl_module.py | 7 ++++++- rllib/core/rl_trainer/rl_trainer.py | 1 - rllib/core/rl_trainer/torch/torch_rl_trainer.py | 8 ++++---- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/rllib/core/rl_module/torch/__init__.py b/rllib/core/rl_module/torch/__init__.py index 217c5a5abc8a..1801069e47dc 100644 --- a/rllib/core/rl_module/torch/__init__.py +++ b/rllib/core/rl_module/torch/__init__.py @@ -1,3 +1,3 @@ -from .torch_rl_module import TorchRLModule +from .torch_rl_module import TorchDDPRLModule -__all__ = ["TorchRLModule"] +__all__ = ["TorchDDPRLModule"] diff --git a/rllib/core/rl_module/torch/torch_rl_module.py b/rllib/core/rl_module/torch/torch_rl_module.py index 26995e278650..8451800af393 100644 --- a/rllib/core/rl_module/torch/torch_rl_module.py +++ b/rllib/core/rl_module/torch/torch_rl_module.py @@ -47,7 +47,12 @@ def is_distributed(self) -> bool: return False -class DDPRLModuleWrapper(DDP, RLModule): +class TorchDDPRLModule(DDP, RLModule): + def __init__(self, *args, **kwargs) -> None: + DDP.__init__(self, *args, **kwargs) + # we do not want to call RLModule.__init__ here because it will all we need is + # the interface of that base-class not the actual implementation. + @override(RLModule) def _forward_train(self, *args, **kwargs): return self(*args, **kwargs) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 07846eb05866..387cc0de86eb 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -222,7 +222,6 @@ def compile_results( # true for centralized critic for example. Therefore we need a better # generalization of this base-class implementation. 
loss_numpy = convert_to_numpy(postprocessed_loss) - batch = convert_to_numpy(batch) mean_grads = [ np.mean(grad) for grad in convert_to_numpy(post_processed_gradients.values()) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index d7aa25747087..7677425daa7e 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -22,7 +22,7 @@ ParamType, ParamDictType, ) -from ray.rllib.core.rl_module.torch.torch_rl_module import DDPRLModuleWrapper +from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType @@ -120,11 +120,11 @@ def _make_distributed(self) -> MultiAgentRLModule: # handle this. if isinstance(module, torch.nn.Module): module.to(self._device) - module = DDPRLModuleWrapper(module) + module = TorchDDPRLModule(module) else: for key in module.keys(): module[key].to(self._device) - module[key] = DDPRLModuleWrapper(module[key]) + module[key] = TorchDDPRLModule(module[key]) return module @@ -182,4 +182,4 @@ def add_module( # we need to ddpify the module that was just added to the pool self._module[module_id].to(self._device) if self.distributed: - self._module[module_id] = DDPRLModuleWrapper(self._module[module_id]) + self._module[module_id] = TorchDDPRLModule(self._module[module_id]) From d56ce2c06997e2d71c7aeec472d85113040e2c7d Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 09:16:01 -0800 Subject: [PATCH 021/112] removed in_test from the production code Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 13 +----------- .../core/rl_trainer/tests/test_rl_trainer.py | 1 - .../rl_trainer/tests/test_torch_rl_trainer.py | 1 - .../rl_trainer/tests/test_trainer_runner.py | 1 - .../rl_trainer/tests/tf/test_tf_rl_trainer.py | 2 -- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 2 -- .../core/rl_trainer/torch/torch_rl_trainer.py | 2 -- rllib/core/testing/testing_trainer.py | 21 +++++++++++++++++++ rllib/core/testing/tf/bc_rl_trainer.py | 3 ++- rllib/core/testing/torch/bc_rl_trainer.py | 3 ++- 10 files changed, 26 insertions(+), 23 deletions(-) create mode 100644 rllib/core/testing/testing_trainer.py diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 387cc0de86eb..3cb6f2ba332a 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -105,7 +105,6 @@ def __init__( scaling_config: Mapping[str, Any], optimizer_config: Mapping[str, Any], distributed: bool = False, - in_test: bool = False, ): # TODO (Kourosh): convert scaling and optimizer configs to dataclasses self.module_class = module_class @@ -113,7 +112,6 @@ def __init__( self.scaling_config = scaling_config self.optimizer_config = optimizer_config self.distributed = distributed - self.in_test = in_test # These are the attributes that are set during build self._module: MultiAgentRLModule = None @@ -230,16 +228,7 @@ def compile_results( "loss": loss_numpy, "mean_gradient": np.mean(mean_grads), } - - if self.in_test: - # this is to check if in the multi-gpu case, the weights across workers are - # the same. It is really only needed during testing. 
- mean_ws = {} - for module_id in self._module.keys(): - m = self._module[module_id] - parameters = convert_to_numpy(self.get_parameters(m)) - mean_ws[module_id] = np.mean([w.mean() for w in parameters]) - ret["mean_weight"] = mean_ws + return ret def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index 1ba9b7cca931..ed1b77018136 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -32,7 +32,6 @@ def get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: scaling_config=scaling_config, optimizer_config={"lr": 1e-3}, distributed=distributed, - in_test=True, ) trainer.build() diff --git a/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py index 3a6c5d0e3af8..d321f17cced0 100644 --- a/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py @@ -33,7 +33,6 @@ def get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: scaling_config=scaling_config, optimizer_config={"lr": 1e-3}, distributed=distributed, - in_test=True, ) trainer.build() diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 4ffd3fe57f3a..45b48f476629 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -64,7 +64,6 @@ def _get_trainer_runner( "model_config": {"hidden_dim": 32}, }, optimizer_config={"lr": 0.1}, - in_test=True, ) runner = TrainerRunner(trainer_class, trainer_cfg, compute_config=compute_config) diff --git a/rllib/core/rl_trainer/tests/tf/test_tf_rl_trainer.py b/rllib/core/rl_trainer/tests/tf/test_tf_rl_trainer.py index 9e5578cd37b3..19ed112dfeae 100644 --- a/rllib/core/rl_trainer/tests/tf/test_tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/tf/test_tf_rl_trainer.py @@ -38,7 +38,6 @@ def test_update_multigpu(self): "model_config": {"hidden_dim": 32}, }, optimizer_config={"lr": 1e-3}, - in_test=True, ) runner = TrainerRunner( trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) @@ -79,7 +78,6 @@ def test_add_remove_module(self): "model_config": {"hidden_dim": 32}, }, optimizer_config={"lr": 1e-3}, - in_test=True, ) runner = TrainerRunner( trainer_class, trainer_cfg, compute_config=dict(num_gpus=2) diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index ffb448b7379a..06db5f94d6c1 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -91,7 +91,6 @@ def __init__( scaling_config: Mapping[str, Any], optimizer_config: Mapping[str, Any], distributed: bool = False, - in_test: bool = False, enable_tf_function: bool = True, ): super().__init__( @@ -100,7 +99,6 @@ def __init__( scaling_config=scaling_config, optimizer_config=optimizer_config, distributed=distributed, - in_test=in_test, ) self._enable_tf_function = enable_tf_function diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 7677425daa7e..08d5af3af43b 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -39,7 +39,6 @@ def __init__( scaling_config: Mapping[str, Any], optimizer_config: Mapping[str, Any], distributed: bool = False, - in_test: bool = False, ): 
super().__init__( module_class=module_class, @@ -47,7 +46,6 @@ def __init__( scaling_config=scaling_config, optimizer_config=optimizer_config, distributed=distributed, - in_test=in_test, ) self._world_size = scaling_config.get("num_workers", 1) diff --git a/rllib/core/testing/testing_trainer.py b/rllib/core/testing/testing_trainer.py new file mode 100644 index 000000000000..f596f8ad0fde --- /dev/null +++ b/rllib/core/testing/testing_trainer.py @@ -0,0 +1,21 @@ +from typing import Mapping, Any +import numpy as np + +from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer +from ray.rllib.utils.nested_dict import NestedDict +from ray.rllib.utils.numpy import convert_to_numpy + +class BaseTestingTrainer(RLTrainer): + + def compile_results(self, batch: NestedDict, fwd_out: Mapping[str, Any], postprocessed_loss: Mapping[str, Any], post_processed_gradients: Mapping[str, Any]) -> Mapping[str, Any]: + results = super().compile_results(batch, fwd_out, postprocessed_loss, post_processed_gradients) + # this is to check if in the multi-gpu case, the weights across workers are + # the same. It is really only needed during testing. + mean_ws = {} + for module_id in self._module.keys(): + m = self._module[module_id] + parameters = convert_to_numpy(self.get_parameters(m)) + mean_ws[module_id] = np.mean([w.mean() for w in parameters]) + results["mean_weight"] = mean_ws + + return results \ No newline at end of file diff --git a/rllib/core/testing/tf/bc_rl_trainer.py b/rllib/core/testing/tf/bc_rl_trainer.py index 232a7dc59662..5bbbce494f08 100644 --- a/rllib/core/testing/tf/bc_rl_trainer.py +++ b/rllib/core/testing/tf/bc_rl_trainer.py @@ -4,9 +4,10 @@ from ray.rllib.core.rl_trainer.tf.tf_rl_trainer import TfRLTrainer from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.sample_batch import MultiAgentBatch +from ray.rllib.core.testing.testing_trainer import BaseTestingTrainer -class BCTfRLTrainer(TfRLTrainer): +class BCTfRLTrainer(TfRLTrainer, BaseTestingTrainer): def compute_loss( self, fwd_out: MultiAgentBatch, batch: MultiAgentBatch ) -> Mapping[str, Any]: diff --git a/rllib/core/testing/torch/bc_rl_trainer.py b/rllib/core/testing/torch/bc_rl_trainer.py index 82db03d0aeae..772dd7cfaa84 100644 --- a/rllib/core/testing/torch/bc_rl_trainer.py +++ b/rllib/core/testing/torch/bc_rl_trainer.py @@ -4,9 +4,10 @@ from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.sample_batch import MultiAgentBatch +from ray.rllib.core.testing.testing_trainer import BaseTestingTrainer -class BCTorchRLTrainer(TorchRLTrainer): +class BCTorchRLTrainer(TorchRLTrainer, BaseTestingTrainer): def compute_loss( self, fwd_out: MultiAgentBatch, batch: MultiAgentBatch ) -> Mapping[str, Any]: From 200b5f754b2d647efb6ffd11945eb9c39ae57bcc Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 09:19:32 -0800 Subject: [PATCH 022/112] clarified todo Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 3cb6f2ba332a..6c418bb71088 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -495,7 +495,8 @@ def get_parameters(self, module: RLModule) -> Sequence[ParamType]: Returns: The parameters of the module. 
""" - # TODO (Kourosh): Make this method a classmethod + # TODO (Kourosh): Make this method a classmethod. This function's purpose is to + # get the parameters of a module based on what the underlying framework is. @abc.abstractmethod def get_optimizer_obj( From f747e50cc1244d6b3755141c593d9f30b8e4dc4b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 10:14:01 -0800 Subject: [PATCH 023/112] comments Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 25 +++++++++---------- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 11 ++++---- .../core/rl_trainer/torch/torch_rl_trainer.py | 3 ++- rllib/core/testing/testing_trainer.py | 16 +++++++++--- 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 6c418bb71088..e02676bcb365 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -177,7 +177,7 @@ def compute_loss( # should find a way to allow them to specify single-agent losses as well, # without having to think about one extra layer of hierarchy for module ids. - def on_after_compute_gradients( + def postprocess_gradients( self, gradients_dict: Mapping[str, Any] ) -> Mapping[str, Any]: """Called after gradients have been computed. @@ -189,8 +189,8 @@ def on_after_compute_gradients( fwd_out = forward_train(batch) loss = compute_loss(batch, fwd_out) gradients = compute_gradients(loss) - ---> post_processed_gradients = on_after_compute_gradients(gradients) - apply_gradients(post_processed_gradients) + ---> postprocessed_gradients = postprocess_gradients(gradients) + apply_gradients(postprocessed_gradients) Returns: Mapping[str, Any]: A dictionary of gradients. @@ -202,7 +202,7 @@ def compile_results( batch: NestedDict, fwd_out: Mapping[str, Any], postprocessed_loss: Mapping[str, Any], - post_processed_gradients: Mapping[str, Any], + postprocessed_gradients: Mapping[str, Any], ) -> Mapping[str, Any]: """Compile results from the update. @@ -210,7 +210,7 @@ def compile_results( batch: The batch that was used for the update. fwd_out: The output of the forward train pass. postprocessed_loss: The loss after postprocessing. - post_processed_gradients: The gradients after postprocessing. + postprocessed_gradients: The gradients after postprocessing. Returns: A dictionary of results. @@ -221,14 +221,13 @@ def compile_results( # generalization of this base-class implementation. 
loss_numpy = convert_to_numpy(postprocessed_loss) mean_grads = [ - np.mean(grad) - for grad in convert_to_numpy(post_processed_gradients.values()) + np.mean(grad) for grad in convert_to_numpy(postprocessed_gradients.values()) ] ret = { "loss": loss_numpy, "mean_gradient": np.mean(mean_grads), } - + return ret def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: @@ -252,9 +251,9 @@ def _update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: fwd_out = self._module.forward_train(batch) loss = self.compute_loss(fwd_out=fwd_out, batch=batch) gradients = self.compute_gradients(loss) - post_processed_gradients = self.on_after_compute_gradients(gradients) - self.apply_gradients(post_processed_gradients) - return self.compile_results(batch, fwd_out, loss, post_processed_gradients) + postprocessed_gradients = self.postprocess_gradients(gradients) + self.apply_gradients(postprocessed_gradients) + return self.compile_results(batch, fwd_out, loss, postprocessed_gradients) @abc.abstractmethod def _convert_batch_type(self, batch: MultiAgentBatch) -> NestedDict[TensorType]: @@ -495,8 +494,8 @@ def get_parameters(self, module: RLModule) -> Sequence[ParamType]: Returns: The parameters of the module. """ - # TODO (Kourosh): Make this method a classmethod. This function's purpose is to - # get the parameters of a module based on what the underlying framework is. + # TODO (Kourosh): Make this method a classmethod. This function's purpose is to + # get the parameters of a module based on what the underlying framework is. @abc.abstractmethod def get_optimizer_obj( diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 06db5f94d6c1..93c24d658947 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -114,13 +114,14 @@ def _do_update_fn(self, batch: MultiAgentBatch) -> Mapping[str, Any]: if isinstance(loss, tf.Tensor): loss = {"total_loss": loss} gradients = self.compute_gradients(loss, tape) - gradients = self.on_after_compute_gradients(gradients) + gradients = self.postprocess_gradients(gradients) self.apply_gradients(gradients) - return {"loss": loss, "fwd_out": fwd_out, "post_processed_gradients": gradients} + return {"loss": loss, "fwd_out": fwd_out, "postprocessed_gradients": gradients} @override(RLTrainer) def configure_optimizers(self) -> ParamOptimizerPairs: - lr = self.optimizer_config.get("lr", 1e-3) + # TODO (Kourosh): convert optimizer_config to dataclass later. 
+ lr = self.optimizer_config["lr"] return [ ( self._module[key].trainable_variables, @@ -138,8 +139,8 @@ def update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: update_outs = self._update_fn(batch) loss = update_outs["loss"] fwd_out = update_outs["fwd_out"] - post_processed_gradients = update_outs["post_processed_gradients"] - results = self.compile_results(batch, fwd_out, loss, post_processed_gradients) + postprocessed_gradients = update_outs["postprocessed_gradients"] + results = self.compile_results(batch, fwd_out, loss, postprocessed_gradients) return results @override(RLTrainer) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 08d5af3af43b..8d8d53471f43 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -60,7 +60,8 @@ def module(self) -> MultiAgentRLModule: @override(RLTrainer) def configure_optimizers(self) -> ParamOptimizerPairs: - lr = self.optimizer_config.get("lr", 1e-3) + # TODO (Kourosh): convert optimizer_config to dataclass later. + lr = self.optimizer_config["lr"] return [ ( self.get_parameters(self._module[key]), diff --git a/rllib/core/testing/testing_trainer.py b/rllib/core/testing/testing_trainer.py index f596f8ad0fde..712b73859aae 100644 --- a/rllib/core/testing/testing_trainer.py +++ b/rllib/core/testing/testing_trainer.py @@ -5,10 +5,18 @@ from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy -class BaseTestingTrainer(RLTrainer): - def compile_results(self, batch: NestedDict, fwd_out: Mapping[str, Any], postprocessed_loss: Mapping[str, Any], post_processed_gradients: Mapping[str, Any]) -> Mapping[str, Any]: - results = super().compile_results(batch, fwd_out, postprocessed_loss, post_processed_gradients) +class BaseTestingTrainer(RLTrainer): + def compile_results( + self, + batch: NestedDict, + fwd_out: Mapping[str, Any], + postprocessed_loss: Mapping[str, Any], + postprocessed_gradients: Mapping[str, Any], + ) -> Mapping[str, Any]: + results = super().compile_results( + batch, fwd_out, postprocessed_loss, postprocessed_gradients + ) # this is to check if in the multi-gpu case, the weights across workers are # the same. It is really only needed during testing. 
mean_ws = {} @@ -18,4 +26,4 @@ def compile_results(self, batch: NestedDict, fwd_out: Mapping[str, Any], postpro mean_ws[module_id] = np.mean([w.mean() for w in parameters]) results["mean_weight"] = mean_ws - return results \ No newline at end of file + return results From 7ce81f0a251207a7f1cfdbb79eb93d2af5921858 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 10:16:31 -0800 Subject: [PATCH 024/112] renamed make_distributed to make_distributed_module Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 4 ++-- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 2 +- rllib/core/rl_trainer/torch/torch_rl_trainer.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index e02676bcb365..b2185b6282d4 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -430,7 +430,7 @@ def _make_module(self) -> MultiAgentRLModule: def build(self) -> None: """Initialize the model.""" if self.distributed: - self._module = self._make_distributed() + self._module = self._make_distributed_module() else: self._module = self._make_module() @@ -453,7 +453,7 @@ def do_distributed_update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: """ raise NotImplementedError - def _make_distributed(self) -> MultiAgentRLModule: + def _make_distributed_module(self) -> MultiAgentRLModule: """Initialize this trainer in a distributed training setting. This method should be overriden in the framework specific trainer. It is diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 93c24d658947..a23aaf785f43 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -158,7 +158,7 @@ def apply_gradients(self, gradients: Dict[ParamRef, TensorType]) -> None: optim.apply_gradients(zip(gradient_list, variable_list)) @override(RLTrainer) - def _make_distributed(self) -> MultiAgentRLModule: + def _make_distributed_module(self) -> MultiAgentRLModule: # TODO (Kourosh): Does strategy has to be an attribute here? if so it's very # hidden to the user of this class that there is such an attribute. 
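The torch counterpart of this hook ends up, after the DDP cleanup a couple of commits further down in this series, looking roughly like the condensed sketch below (not a drop-in implementation; the real code also handles the case where the whole multi-agent module is itself an nn.Module):

    def _make_distributed_module(self):
        # Build the multi-agent module as usual, move each sub-module to this
        # trainer's device, then swap it for a DDP-wrapped copy via add_module().
        module = self._make_module()
        for key in module.keys():
            module[key].to(self._device)
            module.add_module(key, TorchDDPRLModule(module[key]), override=True)
        return module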
diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 8d8d53471f43..e30aa2cb6a68 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -75,6 +75,7 @@ def compute_gradients( self, loss: Union[TensorType, Mapping[str, Any]] ) -> ParamDictType: for optim in self._optim_to_param: + # set_to_none is a faster way to zero out the gradients optim.zero_grad(set_to_none=True) loss[self.TOTAL_LOSS_KEY].backward() grads = {pid: p.grad for pid, p in self._params.items()} @@ -108,7 +109,7 @@ def build(self) -> None: super().build() @override(RLTrainer) - def _make_distributed(self) -> MultiAgentRLModule: + def _make_distributed_module(self) -> MultiAgentRLModule: module = self._make_module() # if the module is a MultiAgentRLModule and nn.Module we can simply assume From b2ddd2dbeb841a2946bb33125c2ad24e599611ac Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 10:32:26 -0800 Subject: [PATCH 025/112] fixed test torch rl_trainer lint Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/tests/test_torch_rl_trainer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py index d321f17cced0..682e76589b7a 100644 --- a/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py @@ -13,7 +13,7 @@ from ray.rllib.utils.numpy import convert_to_numpy -def get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: +def _get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: env = gym.make("CartPole-v1") scaling_config = {} or scaling_config distributed = False @@ -51,7 +51,7 @@ def tearDown(cls) -> None: def test_end_to_end_update(self): - trainer = get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config={"num_workers": 2}) reader = get_cartpole_dataset_reader(batch_size=512) min_loss = float("inf") @@ -74,7 +74,7 @@ def test_compute_gradients(self): Tests that if we sum all the trainable variables the gradient of output w.r.t. the weights is all ones. """ - trainer = get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config={"num_workers": 2}) params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) loss = {"total_loss": sum([param.sum() for param in params])} @@ -93,7 +93,7 @@ def test_apply_gradients(self): standard SGD/Adam update rule. """ - trainer = get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config={"num_workers": 2}) # calculated the expected new params based on gradients of all ones. params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) @@ -117,7 +117,7 @@ def test_add_remove_module(self): all variables the updated parameters follow the SGD update rule. 
""" env = gym.make("CartPole-v1") - trainer = get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config={"num_workers": 2}) # add a test module with SGD optimizer with a known lr lr = 1e-4 From 5bc625cb2e532ea34c368be6e299a022dc1b06eb Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 10:47:59 -0800 Subject: [PATCH 026/112] fixed marl_module stuff Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_module/marl_module.py | 16 ---------------- rllib/core/rl_module/torch/__init__.py | 4 ++-- rllib/core/rl_trainer/torch/torch_rl_trainer.py | 4 ++-- 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/rllib/core/rl_module/marl_module.py b/rllib/core/rl_module/marl_module.py index 3c61e03b8703..70c4c65b3524 100644 --- a/rllib/core/rl_module/marl_module.py +++ b/rllib/core/rl_module/marl_module.py @@ -242,22 +242,6 @@ def __getitem__(self, module_id: ModuleID) -> RLModule: self._check_module_exists(module_id) return self._rl_modules[module_id] - def __setitem__(self, module_id: ModuleID, module: RLModule) -> None: - """Modifies an existing module and assign it to the new module object. - - Args: - module_id: The module ID to add. - module: The module to add. - """ - try: - self._check_module_exists(module_id) - except ValueError: - raise ValueError( - f"Module ID {module_id} does not exist. Use add_module() to add a " - "new module." - ) - self._rl_modules[module_id] = module - @override(RLModule) def output_specs_train(self) -> SpecDict: return self._get_specs_for_modules("output_specs_train") diff --git a/rllib/core/rl_module/torch/__init__.py b/rllib/core/rl_module/torch/__init__.py index 1801069e47dc..217c5a5abc8a 100644 --- a/rllib/core/rl_module/torch/__init__.py +++ b/rllib/core/rl_module/torch/__init__.py @@ -1,3 +1,3 @@ -from .torch_rl_module import TorchDDPRLModule +from .torch_rl_module import TorchRLModule -__all__ = ["TorchDDPRLModule"] +__all__ = ["TorchRLModule"] diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index e30aa2cb6a68..998d6ae1c6e9 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -124,7 +124,7 @@ def _make_distributed_module(self) -> MultiAgentRLModule: else: for key in module.keys(): module[key].to(self._device) - module[key] = TorchDDPRLModule(module[key]) + module.add_module(key, TorchDDPRLModule(module[key]), override=True) return module @@ -182,4 +182,4 @@ def add_module( # we need to ddpify the module that was just added to the pool self._module[module_id].to(self._device) if self.distributed: - self._module[module_id] = TorchDDPRLModule(self._module[module_id]) + self._module.add_module(module_id, TorchDDPRLModule(self._module[module_id]), override=True) From 3302db7359a62266e6c702dfe4ebabceeec2f72b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 15:24:55 -0800 Subject: [PATCH 027/112] fixed the import issue Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_module/torch/torch_rl_module.py | 11 ++--------- rllib/utils/framework.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/rllib/core/rl_module/torch/torch_rl_module.py b/rllib/core/rl_module/torch/torch_rl_module.py index 8451800af393..fad5a29581c3 100644 --- a/rllib/core/rl_module/torch/torch_rl_module.py +++ b/rllib/core/rl_module/torch/torch_rl_module.py @@ -4,13 +4,6 @@ from ray.rllib.core.rl_module import RLModule torch, nn = try_import_torch() 
-if torch: - from torch.nn.parallel import DistributedDataParallel as DDP -else: - raise RuntimeError( - "Torch is not installed. Please install torch or do pip install ray[rllib]." - ) - class TorchRLModule(nn.Module, RLModule): def __init__(self, *args, **kwargs) -> None: @@ -47,9 +40,9 @@ def is_distributed(self) -> bool: return False -class TorchDDPRLModule(DDP, RLModule): +class TorchDDPRLModule(nn.parallel.DistributedDataParallel, RLModule): def __init__(self, *args, **kwargs) -> None: - DDP.__init__(self, *args, **kwargs) + nn.parallel.DistributedDataParallel.__init__(self, *args, **kwargs) # we do not want to call RLModule.__init__ here because it will all we need is # the interface of that base-class not the actual implementation. diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index a52293d0721b..d85b62fe1ff5 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -157,13 +157,19 @@ class _NNStub: def __init__(self, *a, **kw): # Fake nn.functional module within torch.nn. self.functional = None - self.Module = _ModuleStub + self.Module = _FakeClassStub + self.parallel = _ParallelStub() -# Fake class for torch.nn.Module to allow it to be inherited from. -class _ModuleStub: +# Fake class for e.g. torch.nn.Module to allow it to be inherited from. +class _FakeClassStub: def __init__(self, *a, **kw): - raise ImportError("Could not import `torch`.") + raise ImportError("Could not import `torch`. Try pip install torch.") + +class _ParallelStub: + def __init__(self, *a, **kw): + self.DataParallel = _FakeClassStub + self.DistributedDataParallel = _FakeClassStub @PublicAPI From 5455e2903662dad8b2c0ae158af5e7995d40488f Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 15:25:27 -0800 Subject: [PATCH 028/112] lint Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_module/torch/torch_rl_module.py | 1 + rllib/core/rl_trainer/torch/torch_rl_trainer.py | 4 +++- rllib/utils/framework.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/rllib/core/rl_module/torch/torch_rl_module.py b/rllib/core/rl_module/torch/torch_rl_module.py index fad5a29581c3..39f15c79cc51 100644 --- a/rllib/core/rl_module/torch/torch_rl_module.py +++ b/rllib/core/rl_module/torch/torch_rl_module.py @@ -5,6 +5,7 @@ torch, nn = try_import_torch() + class TorchRLModule(nn.Module, RLModule): def __init__(self, *args, **kwargs) -> None: nn.Module.__init__(self) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 998d6ae1c6e9..38cad6d7602b 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -182,4 +182,6 @@ def add_module( # we need to ddpify the module that was just added to the pool self._module[module_id].to(self._device) if self.distributed: - self._module.add_module(module_id, TorchDDPRLModule(self._module[module_id]), override=True) + self._module.add_module( + module_id, TorchDDPRLModule(self._module[module_id]), override=True + ) diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index d85b62fe1ff5..7ae4a4c5ddfc 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -166,6 +166,7 @@ class _FakeClassStub: def __init__(self, *a, **kw): raise ImportError("Could not import `torch`. 
Try pip install torch.") + class _ParallelStub: def __init__(self, *a, **kw): self.DataParallel = _FakeClassStub From b9159a8f45f511f20619bf58b00d2fd3103e8efe Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jan 2023 17:38:15 -0800 Subject: [PATCH 029/112] fixed lint Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/tests/torch/__init__.py | 0 rllib/core/rl_trainer/tests/{ => torch}/test_torch_rl_trainer.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 rllib/core/rl_trainer/tests/torch/__init__.py rename rllib/core/rl_trainer/tests/{ => torch}/test_torch_rl_trainer.py (100%) diff --git a/rllib/core/rl_trainer/tests/torch/__init__.py b/rllib/core/rl_trainer/tests/torch/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/core/rl_trainer/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py similarity index 100% rename from rllib/core/rl_trainer/tests/test_torch_rl_trainer.py rename to rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py From 2aec198390fe54409da42f177ac979c30e2914c4 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 11:00:24 -0800 Subject: [PATCH 030/112] test trainer runner updated Signed-off-by: Kourosh Hakhamaneshi --- .../rl_trainer/tests/test_trainer_runner.py | 79 +--------------- .../tests/test_trainer_runner_local.py | 7 +- rllib/core/testing/__init__.py | 0 rllib/core/testing/utils.py | 92 +++++++++++++++++++ 4 files changed, 99 insertions(+), 79 deletions(-) create mode 100644 rllib/core/testing/__init__.py create mode 100644 rllib/core/testing/utils.py diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 45b48f476629..d9e85896f49f 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -1,73 +1,11 @@ import gymnasium as gym import unittest - -import tensorflow as tf -import torch import ray import time -from typing import Type, Union - -from ray.rllib.core.rl_module import RLModule -from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer -from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import get_cartpole_dataset_reader - -Optimizer = Union[tf.keras.optimizers.Optimizer, torch.optim.Optimizer] - - -def _get_trainer_class(framework: str) -> Type[RLTrainer]: - if framework == "tf": - from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer - - return BCTfRLTrainer - elif framework == "torch": - from ray.rllib.core.testing.torch.bc_rl_trainer import BCTorchRLTrainer - - return BCTorchRLTrainer - else: - raise ValueError(f"Unsupported framework: {framework}") - - -def _get_module_class(framework: str) -> Type[RLModule]: - if framework == "tf": - from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule - - return DiscreteBCTFModule - elif framework == "torch": - from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule - - return DiscreteBCTorchModule - else: - raise ValueError(f"Unsupported framework: {framework}") - - -def _get_optimizer_default_class(framework: str) -> Type[Optimizer]: - if framework == "tf": - return tf.keras.optimizers.Adam - elif framework == "torch": - return torch.optim.Adam - else: - raise ValueError(f"Unsupported framework: {framework}") - - -def _get_trainer_runner( - framework: str, env: 
gym.Env, compute_config: dict -) -> TrainerRunner: - trainer_class = _get_trainer_class(framework) - trainer_cfg = dict( - module_class=_get_module_class(framework), - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, - optimizer_config={"lr": 0.1}, - ) - runner = TrainerRunner(trainer_class, trainer_cfg, compute_config=compute_config) - - return runner +from ray.rllib.core.testing.utils import get_trainer_runner, add_module_to_runner_or_trainer class TestTrainerRunner(unittest.TestCase): @@ -91,7 +29,7 @@ def test_update_multigpu(self): ray.init(ignore_reinit_error=True) print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") - runner = _get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) + runner = get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) reader = get_cartpole_dataset_reader(batch_size=500) min_loss = float("inf") @@ -123,7 +61,7 @@ def test_add_remove_module(self): ray.init(ignore_reinit_error=True) print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") - runner = _get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) + runner = get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) reader = get_cartpole_dataset_reader(batch_size=500) batch = reader.next() @@ -133,16 +71,7 @@ def test_add_remove_module(self): new_module_id = "test_module" # add a test_module - runner.add_module( - module_id=new_module_id, - module_cls=_get_module_class(fw), - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, - optimizer_cls=_get_optimizer_default_class(fw), - ) + add_module_to_runner_or_trainer(fw, env, new_module_id, runner) # do training that includes the test_module results = runner.update( diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py index 224bbde403a8..6f6158dbcb4d 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py @@ -5,11 +5,11 @@ import ray from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner -from ray.rllib.core.rl_trainer.tests.test_trainer_runner import add_module_helper from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader +from ray.rllib.core.testing.utils import add_module_to_runner_or_trainer tf1, tf, tfv = try_import_tf() @@ -59,9 +59,8 @@ def test_trainer_runner_no_gpus(self): new_module_id = "test_module" - # add a test_module - add_module_helper(env, new_module_id, runner) - add_module_helper(env, new_module_id, local_trainer) + add_module_to_runner_or_trainer("tf", env, new_module_id, runner) + add_module_to_runner_or_trainer("tf", env, new_module_id, local_trainer) # make the state of the trainer and the local runner identical local_trainer.set_state(runner.get_state()[0]) diff --git a/rllib/core/testing/__init__.py b/rllib/core/testing/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py new file mode 100644 index 000000000000..54ad4e73c875 --- /dev/null +++ b/rllib/core/testing/utils.py @@ -0,0 +1,92 @@ +from typing import Type, Union, TYPE_CHECKING + 
+import torch +import tf + +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner + +if TYPE_CHECKING: + import gymnasium as gym + + from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer + from ray.rllib.core.rl_module import RLModule + + +Optimizer = Union[tf.keras.optimizers.Optimizer, torch.optim.Optimizer] + + +@DeveloperAPI +def get_trainer_class(framework: str) -> Type["RLTrainer"]: + if framework == "tf": + from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer + + return BCTfRLTrainer + elif framework == "torch": + from ray.rllib.core.testing.torch.bc_rl_trainer import BCTorchRLTrainer + + return BCTorchRLTrainer + else: + raise ValueError(f"Unsupported framework: {framework}") + + +@DeveloperAPI +def get_module_class(framework: str) -> Type["RLModule"]: + if framework == "tf": + from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule + + return DiscreteBCTFModule + elif framework == "torch": + from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule + + return DiscreteBCTorchModule + else: + raise ValueError(f"Unsupported framework: {framework}") + + +@DeveloperAPI +def get_optimizer_default_class(framework: str) -> Type[Optimizer]: + if framework == "tf": + return tf.keras.optimizers.Adam + elif framework == "torch": + return torch.optim.Adam + else: + raise ValueError(f"Unsupported framework: {framework}") + + +@DeveloperAPI +def get_trainer_runner( + framework: str, env: "gym.Env", compute_config: dict +) -> TrainerRunner: + trainer_class = get_trainer_class(framework) + trainer_cfg = dict( + module_class=get_module_class(framework), + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_config={"lr": 0.1}, + ) + runner = TrainerRunner(trainer_class, trainer_cfg, compute_config=compute_config) + + return runner + + +@DeveloperAPI +def add_module_to_runner_or_trainer( + framework: str, + env: "gym.Env", + module_id: str, + runner_or_trainer: Union[TrainerRunner, "RLTrainer"], +): + runner_or_trainer.add_module( + module_id=module_id, + module_cls=get_module_class(framework), + module_kwargs={ + "observation_space": env.observation_space, + "action_space": env.action_space, + "model_config": {"hidden_dim": 32}, + }, + optimizer_cls=get_optimizer_default_class(framework), + ) From 873cdd5bcd917b3246b7b4c03f56578d427f36cf Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 12:22:10 -0800 Subject: [PATCH 031/112] fixed the scaling config and in_test issues introduced after the merge. 
Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 3 +++ rllib/core/rl_trainer/tests/test_trainer_runner.py | 5 ++++- .../rl_trainer/tests/test_trainer_runner_local.py | 1 - .../rl_trainer/tests/torch/test_torch_rl_trainer.py | 12 +++++++----- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 5 ++++- rllib/core/rl_trainer/torch/torch_rl_trainer.py | 13 +++++++++---- rllib/core/rl_trainer/trainer_runner.py | 5 +++-- rllib/core/testing/utils.py | 2 +- 8 files changed, 31 insertions(+), 15 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 13914c2ffad6..543a6b403ba2 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -24,6 +24,8 @@ from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import TensorType +from ray.air.config import ScalingConfig + torch, _ = try_import_torch() tf1, tf, tfv = try_import_tf() @@ -103,6 +105,7 @@ def __init__( module_kwargs: Mapping[str, Any], optimizer_config: Mapping[str, Any], distributed: bool = False, + scaling_config: Optional[ScalingConfig] = None, ): # TODO (Kourosh): convert scaling and optimizer configs to dataclasses self.module_class = module_class diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index d9e85896f49f..5743c891b4c4 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -5,7 +5,10 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import get_cartpole_dataset_reader -from ray.rllib.core.testing.utils import get_trainer_runner, add_module_to_runner_or_trainer +from ray.rllib.core.testing.utils import ( + get_trainer_runner, + add_module_to_runner_or_trainer, +) class TestTrainerRunner(unittest.TestCase): diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py index 6f6158dbcb4d..81b03b3a1ab4 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py @@ -40,7 +40,6 @@ def test_trainer_runner_no_gpus(self): "model_config": {"hidden_dim": 32}, }, optimizer_config={"lr": 1e-3}, - in_test=True, ) runner = TrainerRunner( trainer_class, trainer_cfg, compute_config=dict(num_gpus=0) diff --git a/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py b/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py index 682e76589b7a..04835b86dbf5 100644 --- a/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py @@ -12,10 +12,12 @@ from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.utils.numpy import convert_to_numpy +from ray.air.config import ScalingConfig + def _get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: env = gym.make("CartPole-v1") - scaling_config = {} or scaling_config + scaling_config = scaling_config or ScalingConfig() distributed = False # TODO: Another way to make RLTrainer would be to construct the module first @@ -51,7 +53,7 @@ def tearDown(cls) -> None: def test_end_to_end_update(self): - trainer = _get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) reader = get_cartpole_dataset_reader(batch_size=512) min_loss = float("inf") @@ -74,7 +76,7 @@ def 
test_compute_gradients(self): Tests that if we sum all the trainable variables the gradient of output w.r.t. the weights is all ones. """ - trainer = _get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) loss = {"total_loss": sum([param.sum() for param in params])} @@ -93,7 +95,7 @@ def test_apply_gradients(self): standard SGD/Adam update rule. """ - trainer = _get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) # calculated the expected new params based on gradients of all ones. params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) @@ -117,7 +119,7 @@ def test_add_remove_module(self): all variables the updated parameters follow the SGD update rule. """ env = gym.make("CartPole-v1") - trainer = _get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) # add a test module with SGD optimizer with a known lr lr = 1e-4 diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 2122625002c7..5d4c0f9c07fd 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -27,6 +27,8 @@ from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict +from ray.air.config import ScalingConfig + tf1, tf, tfv = try_import_tf() tf1.enable_eager_execution() @@ -41,7 +43,6 @@ class TfRLTrainer(RLTrainer): module_kwargs: The kwargs for the (MA)RLModule. optimizer_config: The config for the optimizer. distributed: Whether this trainer is distributed or not. - in_test: Whether to enable additional logging behavior for testing purposes. enable_tf_function: Whether to enable tf.function tracing for the update function. @@ -87,6 +88,7 @@ def __init__( module_kwargs: Mapping[str, Any], optimizer_config: Mapping[str, Any], distributed: bool = False, + scaling_config: Optional[ScalingConfig] = None, enable_tf_function: bool = True, ): super().__init__( @@ -94,6 +96,7 @@ def __init__( module_kwargs=module_kwargs, optimizer_config=optimizer_config, distributed=distributed, + scaling_config=scaling_config, ) self._enable_tf_function = enable_tf_function diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 38cad6d7602b..4300f500d48f 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -11,6 +11,7 @@ ) import torch +from ray.air.config import ScalingConfig from ray.train.torch.train_loop_utils import _TorchAccelerator from ray.rllib.core.rl_module.rl_module import RLModule, ModuleID @@ -36,20 +37,24 @@ def __init__( self, module_class: Union[Type[RLModule], Type[MultiAgentRLModule]], module_kwargs: Mapping[str, Any], - scaling_config: Mapping[str, Any], optimizer_config: Mapping[str, Any], distributed: bool = False, + scaling_config: Optional[Mapping[str, Any]] = None, ): super().__init__( module_class=module_class, module_kwargs=module_kwargs, - scaling_config=scaling_config, optimizer_config=optimizer_config, distributed=distributed, + scaling_config=scaling_config, ) - self._world_size = scaling_config.get("num_workers", 1) - self._use_gpu = scaling_config.get("use_gpu", False) + # TODO (Kourosh): Scaling config is required for torch trainer to do proper DDP + # wraping setup but not so much required for tf. 
we need to + scaling_config = scaling_config or ScalingConfig() + self._world_size = scaling_config.num_workers or 1 + self._use_gpu = scaling_config.use_gpu + # These attributes are set in the `build` method. self._device = None diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 4db6fbc1cade..c220dc99a1d6 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -77,9 +77,10 @@ def __init__( max_retries=0, ) - # TODO(avnishn, kourosh): let's not pass this into the config which will - # cause information leakage into the RLTrainer about other workers. + # TODO(avnishn, kourosh): Should we pass in scaling config into the + # trainer? trainer_config["distributed"] = self._distributed = bool(num_gpus > 1) + trainer_config["scaling_config"] = scaling_config self.backend_executor.start( train_cls=trainer_class, train_cls_kwargs=trainer_config ) diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 54ad4e73c875..76f86c835c3b 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -1,7 +1,7 @@ from typing import Type, Union, TYPE_CHECKING import torch -import tf +import tensorflow as tf from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner From 13e19aa1b28bacf8e6917188b526a820a901a7bb Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 12:22:10 -0800 Subject: [PATCH 032/112] fixed the scaling config and in_test issues introduced after the merge. Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 6 +++++- .../core/rl_trainer/tests/test_trainer_runner.py | 5 ++++- .../tests/test_trainer_runner_local.py | 1 - .../tests/torch/test_torch_rl_trainer.py | 12 +++++++----- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 7 +++++-- rllib/core/rl_trainer/torch/torch_rl_trainer.py | 16 ++++++++++++---- rllib/core/rl_trainer/trainer_runner.py | 12 ++++++------ rllib/core/testing/utils.py | 2 +- 8 files changed, 40 insertions(+), 21 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 13914c2ffad6..085416ca940c 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -24,6 +24,8 @@ from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import TensorType +from ray.air.config import ScalingConfig + torch, _ = try_import_torch() tf1, tf, tfv = try_import_tf() @@ -95,7 +97,8 @@ class RLTrainer: """ - TOTAL_LOSS_KEY = "total_loss" + framework: str = None + TOTAL_LOSS_KEY: str = "total_loss" def __init__( self, @@ -103,6 +106,7 @@ def __init__( module_kwargs: Mapping[str, Any], optimizer_config: Mapping[str, Any], distributed: bool = False, + scaling_config: Optional[ScalingConfig] = None, ): # TODO (Kourosh): convert scaling and optimizer configs to dataclasses self.module_class = module_class diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index d9e85896f49f..5743c891b4c4 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -5,7 +5,10 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import get_cartpole_dataset_reader -from ray.rllib.core.testing.utils import get_trainer_runner, add_module_to_runner_or_trainer +from ray.rllib.core.testing.utils import ( + 
get_trainer_runner, + add_module_to_runner_or_trainer, +) class TestTrainerRunner(unittest.TestCase): diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py index 6f6158dbcb4d..81b03b3a1ab4 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py @@ -40,7 +40,6 @@ def test_trainer_runner_no_gpus(self): "model_config": {"hidden_dim": 32}, }, optimizer_config={"lr": 1e-3}, - in_test=True, ) runner = TrainerRunner( trainer_class, trainer_cfg, compute_config=dict(num_gpus=0) diff --git a/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py b/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py index 682e76589b7a..04835b86dbf5 100644 --- a/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py @@ -12,10 +12,12 @@ from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.utils.numpy import convert_to_numpy +from ray.air.config import ScalingConfig + def _get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: env = gym.make("CartPole-v1") - scaling_config = {} or scaling_config + scaling_config = scaling_config or ScalingConfig() distributed = False # TODO: Another way to make RLTrainer would be to construct the module first @@ -51,7 +53,7 @@ def tearDown(cls) -> None: def test_end_to_end_update(self): - trainer = _get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) reader = get_cartpole_dataset_reader(batch_size=512) min_loss = float("inf") @@ -74,7 +76,7 @@ def test_compute_gradients(self): Tests that if we sum all the trainable variables the gradient of output w.r.t. the weights is all ones. """ - trainer = _get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) loss = {"total_loss": sum([param.sum() for param in params])} @@ -93,7 +95,7 @@ def test_apply_gradients(self): standard SGD/Adam update rule. """ - trainer = _get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) # calculated the expected new params based on gradients of all ones. params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) @@ -117,7 +119,7 @@ def test_add_remove_module(self): all variables the updated parameters follow the SGD update rule. """ env = gym.make("CartPole-v1") - trainer = _get_trainer(scaling_config={"num_workers": 2}) + trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) # add a test module with SGD optimizer with a known lr lr = 1e-4 diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 2122625002c7..031296ae85bd 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -27,6 +27,8 @@ from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict +from ray.air.config import ScalingConfig + tf1, tf, tfv = try_import_tf() tf1.enable_eager_execution() @@ -41,7 +43,6 @@ class TfRLTrainer(RLTrainer): module_kwargs: The kwargs for the (MA)RLModule. optimizer_config: The config for the optimizer. distributed: Whether this trainer is distributed or not. - in_test: Whether to enable additional logging behavior for testing purposes. 
enable_tf_function: Whether to enable tf.function tracing for the update function. @@ -79,7 +80,7 @@ class TfRLTrainer(RLTrainer): """ - TOTAL_LOSS_KEY = "total_loss" + framework: str = "tf" def __init__( self, @@ -87,6 +88,7 @@ def __init__( module_kwargs: Mapping[str, Any], optimizer_config: Mapping[str, Any], distributed: bool = False, + scaling_config: Optional[ScalingConfig] = None, enable_tf_function: bool = True, ): super().__init__( @@ -94,6 +96,7 @@ def __init__( module_kwargs=module_kwargs, optimizer_config=optimizer_config, distributed=distributed, + scaling_config=scaling_config, ) self._enable_tf_function = enable_tf_function diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 38cad6d7602b..695b29b2891e 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -11,6 +11,7 @@ ) import torch +from ray.air.config import ScalingConfig from ray.train.torch.train_loop_utils import _TorchAccelerator from ray.rllib.core.rl_module.rl_module import RLModule, ModuleID @@ -32,24 +33,31 @@ class TorchRLTrainer(RLTrainer): + + framework: str = "torch" + def __init__( self, module_class: Union[Type[RLModule], Type[MultiAgentRLModule]], module_kwargs: Mapping[str, Any], - scaling_config: Mapping[str, Any], optimizer_config: Mapping[str, Any], distributed: bool = False, + scaling_config: Optional[Mapping[str, Any]] = None, ): super().__init__( module_class=module_class, module_kwargs=module_kwargs, - scaling_config=scaling_config, optimizer_config=optimizer_config, distributed=distributed, + scaling_config=scaling_config, ) - self._world_size = scaling_config.get("num_workers", 1) - self._use_gpu = scaling_config.get("use_gpu", False) + # TODO (Kourosh): Scaling config is required for torch trainer to do proper DDP + # wraping setup but not so much required for tf. we need to + scaling_config = scaling_config or ScalingConfig() + self._world_size = scaling_config.num_workers or 1 + self._use_gpu = scaling_config.use_gpu + # These attributes are set in the `build` method. self._device = None diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 4db6fbc1cade..f19cb597fea3 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -9,8 +9,7 @@ ParamOptimizerPairs, Optimizer, ) -from ray.rllib.core.rl_trainer.tf.tf_rl_trainer import TfRLTrainer -from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer + from ray.rllib.policy.sample_batch import MultiAgentBatch @@ -58,11 +57,11 @@ def __init__( use_gpu=(not use_fake_gpus), ) - if issubclass(trainer_class, TorchRLTrainer): + if trainer_class.framework == "torch": from ray.train.torch import TorchConfig backend_config = TorchConfig() - elif issubclass(trainer_class, TfRLTrainer): + elif trainer_class.framework == "tf": from ray.train.tensorflow import TensorflowConfig backend_config = TensorflowConfig() @@ -77,9 +76,10 @@ def __init__( max_retries=0, ) - # TODO(avnishn, kourosh): let's not pass this into the config which will - # cause information leakage into the RLTrainer about other workers. + # TODO(avnishn, kourosh): Should we pass in scaling config into the + # trainer? 
trainer_config["distributed"] = self._distributed = bool(num_gpus > 1) + trainer_config["scaling_config"] = scaling_config self.backend_executor.start( train_cls=trainer_class, train_cls_kwargs=trainer_config ) diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 54ad4e73c875..76f86c835c3b 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -1,7 +1,7 @@ from typing import Type, Union, TYPE_CHECKING import torch -import tf +import tensorflow as tf from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner From ca3e22531f78f7da7bc0b3f087bb7c5855f729cf Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 16:17:39 -0800 Subject: [PATCH 033/112] wip Signed-off-by: Kourosh Hakhamaneshi --- .../ppo/torch/ppo_torch_rl_trainer.py | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py new file mode 100644 index 000000000000..12136aae4617 --- /dev/null +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -0,0 +1,136 @@ + +from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.torch_utils import warn_if_infinite_kl_divergence + +class PPOTorchRLTrainer(TorchRLTrainer): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO (Kourosh): Move these failures to config.validate() or support them. + if self.config.entropy_coeff_schedule: + raise ValueError("entropy_coeff_schedule is not supported in RLTrainer yet") + + if self.config.lr_schedule: + raise ValueError("lr_schedule is not supported in RLTrainer yet") + + # TODO (Kourosh): We can still use mix-ins in the new design. Do we want that? + # Most likely not. + self.kl_coeff = self.config.kl_coeff + self.kl_target = self.config.kl_target + + def compute_loss(self, *, fwd_out: MultiAgentBatch, batch: MultiAgentBatch) -> Union[TensorType, Mapping[str, Any]]: + + # TODO (Kourosh): come back to RNNs later + # TODO (Kourosh): This is boiler plate code. Can we minimize this? + """ + loss_dict = {} + loss_total = None + for module_id in fwd_out: + + loss = ... + + + if loss_total is None: + loss_total = loss + else: + loss_total += loss + + + loss_dict[self.TOTAL_LOSS_KEY] = loss_total + """ + + loss_dict = {} + loss_total = None + for module_id in fwd_out: + module_batch = batch[module_id] + module_fwd_out = fwd_out[module_id] + + curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] + action_dist_class = type(module_fwd_out[SampleBatch.ACTION_DIST]) + prev_action_dist = action_dist_class( + **module_batch[SampleBatch.ACTION_DIST_INPUTS] + ) + + logp_ratio = torch.exp( + module_fwd_out[SampleBatch.ACTION_LOGP] - module_batch[SampleBatch.ACTION_LOGP] + ) + + # Only calculate kl loss if necessary (kl-coeff > 0.0). + if self.config.kl_coeff > 0.0: + action_kl = prev_action_dist.kl(curr_action_dist) + mean_kl_loss = torch.mean(action_kl) + # TODO smorad: should we do anything besides warn? 
Could discard KL term + # for this update + warn_if_infinite_kl_divergence(self, mean_kl_loss) + else: + mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device) + + curr_entropy = module_fwd_out["entropy"] + mean_entropy = torch.mean(curr_entropy) + + surrogate_loss = torch.min( + module_batch[Postprocessing.ADVANTAGES] * logp_ratio, + module_batch[Postprocessing.ADVANTAGES] + * torch.clamp( + logp_ratio, 1 - self.config.clip_param, 1 + self.config.clip_param + ), + ) + + + # Compute a value function loss. + if self.config.use_critic: + value_fn_out = module_fwd_out[SampleBatch.VF_PREDS] + vf_loss = torch.pow( + value_fn_out - module_batch[Postprocessing.VALUE_TARGETS], 2.0 + ) + vf_loss_clipped = torch.clamp(vf_loss, 0, self.config.vf_clip_param) + mean_vf_loss = torch.mean(vf_loss_clipped) + # Ignore the value function. + else: + value_fn_out = torch.tensor(0.0).to(surrogate_loss.device) + vf_loss_clipped = mean_vf_loss = torch.tensor(0.0).to(surrogate_loss.device) + + + total_loss = torch.mean( + -surrogate_loss + + self.config.vf_loss_coeff * vf_loss_clipped + - self.config.entropy_coeff * curr_entropy + ) + + + # Add mean_kl_loss (already processed through `reduce_mean_valid`), + # if necessary. + if self.config.kl_coeff > 0.0: + total_loss += self.config. * mean_kl_loss + + loss = ... + + + if loss_total is None: + loss_total = loss + else: + loss_total += loss + + + loss_dict[self.TOTAL_LOSS_KEY] = loss_total + + return loss_dict + + + def additional_update(self, *, sampled_kl_values) -> Mapping[str, Any]: + + results = {} + for module_id in sampled_kl_values: + sampled_kl = sampled_kl_values[module_id] + if sampled_kl > 2.0 * self.kl_target: + # TODO (Kourosh) why not 2? + self.kl_coeff *= 1.5 + elif sampled_kl < 0.5 * self.kl_target: + self.kl_coeff *= 0.5 + + results[module_id] = {"kl_coeff": self.kl_coeff} + + return results + From 3234aaf097880ac625bcf342b6223d66a497c958 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 16:44:28 -0800 Subject: [PATCH 034/112] wip Signed-off-by: Kourosh Hakhamaneshi --- .../ppo/torch/ppo_torch_rl_trainer.py | 190 +++++++++--------- 1 file changed, 96 insertions(+), 94 deletions(-) diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index 12136aae4617..e0e7c8a54ee8 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -1,136 +1,138 @@ +from typing import Mapping, Any, Union +from collections import defaultdict from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.torch_utils import warn_if_infinite_kl_divergence +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.torch_utils import ( + warn_if_infinite_kl_divergence, + explained_variance, +) +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() -class PPOTorchRLTrainer(TorchRLTrainer): +class PPOTorchRLTrainer(TorchRLTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # TODO (Kourosh): Move these failures to config.validate() or support them. 
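The loss assembled in this file is the standard clipped-surrogate PPO objective; in the notation of the code below (advantages A_t, probability ratio r_t, value targets V^targ, with clip_param as epsilon and vf_loss_coeff, entropy_coeff, kl_coeff as the weighting terms), it is approximately:

    r_t = \exp\big(\log \pi_\theta(a_t \mid s_t) - \log \pi_{\theta_{old}}(a_t \mid s_t)\big)
    L^{CLIP} = \mathbb{E}_t\big[\min\big(r_t A_t,\ \mathrm{clip}(r_t, 1-\epsilon, 1+\epsilon)\, A_t\big)\big]
    L^{VF} = \mathbb{E}_t\big[\mathrm{clip}\big((V_\theta(s_t) - V^{targ}_t)^2,\ 0,\ \text{vf\_clip\_param}\big)\big]
    L^{total} = -L^{CLIP} + c_{vf}\, L^{VF} - c_{ent}\, \mathbb{E}_t\big[H[\pi_\theta]\big] + c_{kl}\, \mathrm{KL}\big(\pi_{\theta_{old}} \,\|\, \pi_\theta\big)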
- if self.config.entropy_coeff_schedule: + if self.config.entropy_coeff_schedule: raise ValueError("entropy_coeff_schedule is not supported in RLTrainer yet") - + if self.config.lr_schedule: raise ValueError("lr_schedule is not supported in RLTrainer yet") - - # TODO (Kourosh): We can still use mix-ins in the new design. Do we want that? + + # TODO (Kourosh): We can still use mix-ins in the new design. Do we want that? # Most likely not. self.kl_coeff = self.config.kl_coeff self.kl_target = self.config.kl_target - def compute_loss(self, *, fwd_out: MultiAgentBatch, batch: MultiAgentBatch) -> Union[TensorType, Mapping[str, Any]]: - - # TODO (Kourosh): come back to RNNs later - # TODO (Kourosh): This is boiler plate code. Can we minimize this? - """ - loss_dict = {} - loss_total = None - for module_id in fwd_out: - - loss = ... - - - if loss_total is None: - loss_total = loss - else: - loss_total += loss + def compute_loss( + self, *, fwd_out: MultiAgentBatch, batch: MultiAgentBatch + ) -> Union[TensorType, Mapping[str, Any]]: - - loss_dict[self.TOTAL_LOSS_KEY] = loss_total - """ + # TODO (Kourosh): This is boiler plate code. Can we minimize this? - loss_dict = {} loss_total = None + results_all_agents = defaultdict(dict) for module_id in fwd_out: module_batch = batch[module_id] module_fwd_out = fwd_out[module_id] - curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] - action_dist_class = type(module_fwd_out[SampleBatch.ACTION_DIST]) - prev_action_dist = action_dist_class( - **module_batch[SampleBatch.ACTION_DIST_INPUTS] - ) - - logp_ratio = torch.exp( - module_fwd_out[SampleBatch.ACTION_LOGP] - module_batch[SampleBatch.ACTION_LOGP] - ) - - # Only calculate kl loss if necessary (kl-coeff > 0.0). - if self.config.kl_coeff > 0.0: - action_kl = prev_action_dist.kl(curr_action_dist) - mean_kl_loss = torch.mean(action_kl) - # TODO smorad: should we do anything besides warn? Could discard KL term - # for this update - warn_if_infinite_kl_divergence(self, mean_kl_loss) - else: - mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device) - - curr_entropy = module_fwd_out["entropy"] - mean_entropy = torch.mean(curr_entropy) - - surrogate_loss = torch.min( - module_batch[Postprocessing.ADVANTAGES] * logp_ratio, - module_batch[Postprocessing.ADVANTAGES] - * torch.clamp( - logp_ratio, 1 - self.config.clip_param, 1 + self.config.clip_param - ), - ) - - - # Compute a value function loss. - if self.config.use_critic: - value_fn_out = module_fwd_out[SampleBatch.VF_PREDS] - vf_loss = torch.pow( - value_fn_out - module_batch[Postprocessing.VALUE_TARGETS], 2.0 - ) - vf_loss_clipped = torch.clamp(vf_loss, 0, self.config.vf_clip_param) - mean_vf_loss = torch.mean(vf_loss_clipped) - # Ignore the value function. - else: - value_fn_out = torch.tensor(0.0).to(surrogate_loss.device) - vf_loss_clipped = mean_vf_loss = torch.tensor(0.0).to(surrogate_loss.device) - - - total_loss = torch.mean( - -surrogate_loss - + self.config.vf_loss_coeff * vf_loss_clipped - - self.config.entropy_coeff * curr_entropy - ) - + module_results = self._compute_loss(module_batch, module_fwd_out) + results_all_agents[module_id] = module_results + loss = module_results[self.TOTAL_LOSS_KEY] - # Add mean_kl_loss (already processed through `reduce_mean_valid`), - # if necessary. - if self.config.kl_coeff > 0.0: - total_loss += self.config. * mean_kl_loss - - loss = ... 
- - if loss_total is None: loss_total = loss else: loss_total += loss - - loss_dict[self.TOTAL_LOSS_KEY] = loss_total + results_all_agents[self.TOTAL_LOSS_KEY] = loss_total + + return results_all_agents - return loss_dict + def _compute_loss( + self, batch: SampleBatch, fwd_out: Mapping[str, TensorType] + ) -> TensorType: + # TODO (Kourosh): come back to RNNs later + curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] + action_dist_class = type(fwd_out[SampleBatch.ACTION_DIST]) + prev_action_dist = action_dist_class(**batch[SampleBatch.ACTION_DIST_INPUTS]) + + logp_ratio = torch.exp( + fwd_out[SampleBatch.ACTION_LOGP] - batch[SampleBatch.ACTION_LOGP] + ) + + # Only calculate kl loss if necessary (kl-coeff > 0.0). + if self.config.kl_coeff > 0.0: + action_kl = prev_action_dist.kl(curr_action_dist) + mean_kl_loss = torch.mean(action_kl) + # TODO smorad: should we do anything besides warn? Could discard KL term + # for this update + warn_if_infinite_kl_divergence(self, mean_kl_loss) + else: + mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device) + + curr_entropy = fwd_out["entropy"] + mean_entropy = torch.mean(curr_entropy) + + surrogate_loss = torch.min( + batch[Postprocessing.ADVANTAGES] * logp_ratio, + batch[Postprocessing.ADVANTAGES] + * torch.clamp( + logp_ratio, 1 - self.config.clip_param, 1 + self.config.clip_param + ), + ) + + # Compute a value function loss. + if self.config.use_critic: + value_fn_out = fwd_out[SampleBatch.VF_PREDS] + vf_loss = torch.pow(value_fn_out - batch[Postprocessing.VALUE_TARGETS], 2.0) + vf_loss_clipped = torch.clamp(vf_loss, 0, self.config.vf_clip_param) + mean_vf_loss = torch.mean(vf_loss_clipped) + # Ignore the value function. + else: + value_fn_out = torch.tensor(0.0).to(surrogate_loss.device) + vf_loss_clipped = mean_vf_loss = torch.tensor(0.0).to(surrogate_loss.device) + + total_loss = torch.mean( + -surrogate_loss + + self.config.vf_loss_coeff * vf_loss_clipped + - self.config.entropy_coeff * curr_entropy + ) + + # Add mean_kl_loss (already processed through `reduce_mean_valid`), + # if necessary. + if self.config.kl_coeff > 0.0: + total_loss += self.config.kl_coeff * mean_kl_loss + + return { + self.TOTAL_LOSS_KEY: total_loss, + "mean_policy_loss": -torch.mean(surrogate_loss), + "mean_vf_loss": mean_vf_loss, + "vf_explained_var": explained_variance( + batch[Postprocessing.VALUE_TARGETS], value_fn_out + ), + "mean_entropy": mean_entropy, + "mean_kl_loss": mean_kl_loss, + } def additional_update(self, *, sampled_kl_values) -> Mapping[str, Any]: - + results = {} for module_id in sampled_kl_values: sampled_kl = sampled_kl_values[module_id] if sampled_kl > 2.0 * self.kl_target: - # TODO (Kourosh) why not 2? + # TODO (Kourosh) why not 2? 
self.kl_coeff *= 1.5 elif sampled_kl < 0.5 * self.kl_target: self.kl_coeff *= 0.5 - + results[module_id] = {"kl_coeff": self.kl_coeff} return results - From 15b99ee3bd0230d5b2bb2abb015cbf0bc9331419 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 16:57:05 -0800 Subject: [PATCH 035/112] wip Signed-off-by: Kourosh Hakhamaneshi --- .../ppo/torch/ppo_torch_rl_trainer.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index e0e7c8a54ee8..4221d6578a67 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -22,11 +22,16 @@ def __init__(self, *args, **kwargs): if self.config.entropy_coeff_schedule: raise ValueError("entropy_coeff_schedule is not supported in RLTrainer yet") + # TODO (Kourosh): Create a way on the base class for users to define arbitrary + # schedulers for learning rates. + self.lr_scheduler = None if self.config.lr_schedule: raise ValueError("lr_schedule is not supported in RLTrainer yet") # TODO (Kourosh): We can still use mix-ins in the new design. Do we want that? - # Most likely not. + # Most likely not. I rather be specific about everything. kl_coeff is a + # none-gradient based update which we can define here and add as update with + # additional_update() method. self.kl_coeff = self.config.kl_coeff self.kl_target = self.config.kl_target @@ -42,7 +47,7 @@ def compute_loss( module_batch = batch[module_id] module_fwd_out = fwd_out[module_id] - module_results = self._compute_loss(module_batch, module_fwd_out) + module_results = self._compute_loss(module_id, module_batch, module_fwd_out) results_all_agents[module_id] = module_results loss = module_results[self.TOTAL_LOSS_KEY] @@ -56,8 +61,11 @@ def compute_loss( return results_all_agents def _compute_loss( - self, batch: SampleBatch, fwd_out: Mapping[str, TensorType] + self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> TensorType: + # TODO (Kourosh): We may or may not user module_id. For example if we have an + # agent based learning rate scheduler, we may want to use module_id to get the + # learning rate for that agent. 
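For reference, the clipped-surrogate objective and clipped value loss that the `_compute_loss` body in the hunks above implements can be boiled down to the following self-contained sketch. The dummy tensors, `clip_param`, `vf_clip_param`, and the 0.5 / 0.01 loss coefficients are illustrative placeholders, not values taken from this PR.

    import torch

    # Stand-alone sketch of the PPO loss pieces used above (dummy data).
    clip_param, vf_clip_param = 0.3, 10.0
    advantages = torch.tensor([1.0, -0.5, 2.0])
    logp_ratio = torch.exp(torch.tensor([0.1, -0.2, 0.4]))  # new_logp - old_logp
    value_targets = torch.tensor([1.0, 0.5, 1.5])
    vf_preds = torch.tensor([0.8, 0.9, 1.2])
    entropy = torch.tensor([0.7, 0.6, 0.65])

    # Clipped surrogate: take the worse (min) of the unclipped and clipped terms.
    surrogate = torch.min(
        advantages * logp_ratio,
        advantages * torch.clamp(logp_ratio, 1 - clip_param, 1 + clip_param),
    )
    # Squared value error, clipped at vf_clip_param.
    vf_loss = torch.clamp((vf_preds - value_targets) ** 2, 0, vf_clip_param)
    total_loss = torch.mean(-surrogate + 0.5 * vf_loss - 0.01 * entropy)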
# TODO (Kourosh): come back to RNNs later curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] @@ -122,7 +130,7 @@ def _compute_loss( "mean_kl_loss": mean_kl_loss, } - def additional_update(self, *, sampled_kl_values) -> Mapping[str, Any]: + def additional_update(self, *, sampled_kl_values, timestep: int) -> Mapping[str, Any]: results = {} for module_id in sampled_kl_values: @@ -135,4 +143,11 @@ def additional_update(self, *, sampled_kl_values) -> Mapping[str, Any]: results[module_id] = {"kl_coeff": self.kl_coeff} + + if self.entropy_coeff_scheduler is not None: + self.entropy_coeff_scheduleler.update(timestep) + + if self.lr_scheduler is not None: + self.lr_scheduleler.update(timestep) + return results From 8b9ae921f19facb5326c310cf0664a6e41bbb5e8 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 17:04:55 -0800 Subject: [PATCH 036/112] wip Signed-off-by: Kourosh Hakhamaneshi --- .../ppo/torch/ppo_torch_rl_trainer.py | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index 4221d6578a67..e72a480edf0b 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -1,5 +1,4 @@ from typing import Mapping, Any, Union -from collections import defaultdict from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer from ray.rllib.evaluation.postprocessing import Postprocessing @@ -22,15 +21,15 @@ def __init__(self, *args, **kwargs): if self.config.entropy_coeff_schedule: raise ValueError("entropy_coeff_schedule is not supported in RLTrainer yet") - # TODO (Kourosh): Create a way on the base class for users to define arbitrary + # TODO (Kourosh): Create a way on the base class for users to define arbitrary # schedulers for learning rates. self.lr_scheduler = None if self.config.lr_schedule: raise ValueError("lr_schedule is not supported in RLTrainer yet") # TODO (Kourosh): We can still use mix-ins in the new design. Do we want that? - # Most likely not. I rather be specific about everything. kl_coeff is a - # none-gradient based update which we can define here and add as update with + # Most likely not. I rather be specific about everything. kl_coeff is a + # none-gradient based update which we can define here and add as update with # additional_update() method. self.kl_coeff = self.config.kl_coeff self.kl_target = self.config.kl_target @@ -42,13 +41,15 @@ def compute_loss( # TODO (Kourosh): This is boiler plate code. Can we minimize this? 
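The `entropy_coeff_scheduler` / `lr_scheduler` calls added in the hunk above (note the `scheduleler` typos) presuppose objects with an `update(timestep)` method, which this PR does not implement yet (`lr_scheduler` stays `None`). A purely hypothetical sketch of such an interface, only to make the intended call pattern concrete; nothing below is RLlib API:

    # Purely hypothetical: a minimal piecewise-constant scheduler exposing the
    # update(timestep) call pattern used in the hunk above.
    class PiecewiseConstantScheduler:
        def __init__(self, endpoints):
            # endpoints: [(timestep, value), ...]; later timesteps override earlier ones.
            self.endpoints = sorted(endpoints)
            self.value = self.endpoints[0][1]

        def update(self, timestep: int) -> float:
            for t, v in self.endpoints:
                if timestep >= t:
                    self.value = v
            return self.value

    entropy_coeff_scheduler = PiecewiseConstantScheduler([(0, 0.01), (100_000, 0.001)])
    entropy_coeff_scheduler.update(50_000)   # -> 0.01
    entropy_coeff_scheduler.update(200_000)  # -> 0.001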
loss_total = None - results_all_agents = defaultdict(dict) + results_all_modules = {} for module_id in fwd_out: module_batch = batch[module_id] module_fwd_out = fwd_out[module_id] - module_results = self._compute_loss(module_id, module_batch, module_fwd_out) - results_all_agents[module_id] = module_results + module_results = self._compute_loss_per_module( + module_id, module_batch, module_fwd_out + ) + results_all_modules[module_id] = module_results loss = module_results[self.TOTAL_LOSS_KEY] if loss_total is None: @@ -56,15 +57,15 @@ def compute_loss( else: loss_total += loss - results_all_agents[self.TOTAL_LOSS_KEY] = loss_total + results_all_modules[self.TOTAL_LOSS_KEY] = loss_total - return results_all_agents + return results_all_modules - def _compute_loss( + def _compute_loss_per_module( self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> TensorType: - # TODO (Kourosh): We may or may not user module_id. For example if we have an - # agent based learning rate scheduler, we may want to use module_id to get the + # TODO (Kourosh): We may or may not user module_id. For example if we have an + # agent based learning rate scheduler, we may want to use module_id to get the # learning rate for that agent. # TODO (Kourosh): come back to RNNs later @@ -130,20 +131,32 @@ def _compute_loss( "mean_kl_loss": mean_kl_loss, } - def additional_update(self, *, sampled_kl_values, timestep: int) -> Mapping[str, Any]: + def additional_update(self, *args, **kwargs) -> Mapping[str, Any]: - results = {} - for module_id in sampled_kl_values: - sampled_kl = sampled_kl_values[module_id] - if sampled_kl > 2.0 * self.kl_target: - # TODO (Kourosh) why not 2? - self.kl_coeff *= 1.5 - elif sampled_kl < 0.5 * self.kl_target: - self.kl_coeff *= 0.5 + results_all_modules = {} + for module_id in self._module.keys(): + module_results = self._additional_update_per_module( + module_id, *args, **kwargs + ) + results_all_modules[module_id] = module_results - results[module_id] = {"kl_coeff": self.kl_coeff} + return results_all_modules + def _additional_update_per_module( + self, module_id: str, sampled_kl_values: dict, timestep: int + ) -> Mapping[str, Any]: + sampled_kl = sampled_kl_values[module_id] + if sampled_kl > 2.0 * self.kl_target: + # TODO (Kourosh) why not 2? + self.kl_coeff *= 1.5 + elif sampled_kl < 0.5 * self.kl_target: + self.kl_coeff *= 0.5 + + results = {"kl_coeff": self.kl_coeff} + + # TODO (Kourosh): We may want to index into the schedulers to get the right one + # for this module if self.entropy_coeff_scheduler is not None: self.entropy_coeff_scheduleler.update(timestep) From eb82e670f60752f3c0a87d8aca04af0e7db57fe2 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 17:05:59 -0800 Subject: [PATCH 037/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index e72a480edf0b..630a732f9b2d 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -38,7 +38,7 @@ def compute_loss( self, *, fwd_out: MultiAgentBatch, batch: MultiAgentBatch ) -> Union[TensorType, Mapping[str, Any]]: - # TODO (Kourosh): This is boiler plate code. Can we minimize this? + # TODO (Kourosh): This is boiler plate code. Move it to the base class? 
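In isolation, the adaptive-KL rule that `_additional_update_per_module` applies above reads as follows. The 2.0 / 0.5 thresholds and the 1.5 / 0.5 multipliers mirror the hunk; the module ids, the 0.2 starting coefficient, and the 0.02 target are illustrative:

    # Self-contained sketch of the per-module adaptive-KL update shown above.
    def adapt_kl_coeff(kl_coeff: float, sampled_kl: float, kl_target: float) -> float:
        if sampled_kl > 2.0 * kl_target:
            # Policy moved too far from the old policy -> strengthen the penalty.
            kl_coeff *= 1.5
        elif sampled_kl < 0.5 * kl_target:
            # Policy stayed well within the trust region -> relax the penalty.
            kl_coeff *= 0.5
        return kl_coeff

    # One module overshoots the target, the other undershoots it.
    sampled_kl_values = {"policy_1": 0.08, "policy_2": 0.005}
    kl_coeffs = {
        mid: adapt_kl_coeff(0.2, kl, kl_target=0.02)
        for mid, kl in sampled_kl_values.items()
    }
    # kl_coeffs ~= {"policy_1": 0.3, "policy_2": 0.1}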
loss_total = None results_all_modules = {} @@ -132,6 +132,7 @@ def _compute_loss_per_module( } def additional_update(self, *args, **kwargs) -> Mapping[str, Any]: + # TODO (Kourosh): This is boiler plate code. Move it to the base class? results_all_modules = {} for module_id in self._module.keys(): From dac0d6b6745c3879b4a6fea7ab3e8d9a6b0f8ff4 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 17:26:36 -0800 Subject: [PATCH 038/112] fixed trainer_runner config test Signed-off-by: Kourosh Hakhamaneshi --- .../rl_trainer/tests/test_trainer_runner_config.py | 1 - rllib/core/rl_trainer/trainer_runner_config.py | 10 ---------- rllib/core/testing/utils.py | 10 +++++++--- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_config.py b/rllib/core/rl_trainer/tests/test_trainer_runner_config.py index 134d32ea74f6..16a7ddfa7c5a 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_config.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_config.py @@ -34,7 +34,6 @@ def test_trainer_runner_build(self): trainer_class=BCTfRLTrainer, ) ) - config = config.testing(True) config.build() def test_trainer_runner_build_from_algorithm_config(self): diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index f429759b73e9..f68aa95cd764 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -34,9 +34,6 @@ def __init__(self, cls: Type[TrainerRunner] = None) -> None: self.num_gpus = 0 self.fake_gpus = False - # `self.testing()` - self._in_test = False - def validate(self) -> None: if self.module_class is None and self.module_obj is None: @@ -95,7 +92,6 @@ def build(self) -> TrainerRunner: # TODO (Avnish): add this # "enable_tf_function": self.eager_tracing, "optimizer_config": self.optimizer_config, - "in_test": self._in_test, }, compute_config={ "num_gpus": self.num_gpus, @@ -163,9 +159,3 @@ def resources( self.fake_gpus = fake_gpus return self - - def testing(self, _in_test: Optional[bool] = NotProvided) -> "TrainerRunnerConfig": - if _in_test is not NotProvided: - self._in_test = _in_test - - return self diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 76f86c835c3b..95e24758d62f 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -1,19 +1,19 @@ from typing import Type, Union, TYPE_CHECKING -import torch -import tensorflow as tf from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner if TYPE_CHECKING: import gymnasium as gym + import torch + import tensorflow as tf from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer from ray.rllib.core.rl_module import RLModule -Optimizer = Union[tf.keras.optimizers.Optimizer, torch.optim.Optimizer] +Optimizer = Union["tf.keras.optimizers.Optimizer", "torch.optim.Optimizer"] @DeveloperAPI @@ -47,8 +47,12 @@ def get_module_class(framework: str) -> Type["RLModule"]: @DeveloperAPI def get_optimizer_default_class(framework: str) -> Type[Optimizer]: if framework == "tf": + import tensorflow as tf + return tf.keras.optimizers.Adam elif framework == "torch": + import torch + return torch.optim.Adam else: raise ValueError(f"Unsupported framework: {framework}") From cfdaa047732b7086c39fe971d08ab285da58ddd7 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 18:41:59 -0800 Subject: [PATCH 039/112] removed the stuff that got moved to 
SARLTrainer made easy PR Signed-off-by: Kourosh Hakhamaneshi --- .../ppo/torch/ppo_torch_rl_trainer.py | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index 630a732f9b2d..cd2f7c2e4359 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -34,33 +34,6 @@ def __init__(self, *args, **kwargs): self.kl_coeff = self.config.kl_coeff self.kl_target = self.config.kl_target - def compute_loss( - self, *, fwd_out: MultiAgentBatch, batch: MultiAgentBatch - ) -> Union[TensorType, Mapping[str, Any]]: - - # TODO (Kourosh): This is boiler plate code. Move it to the base class? - - loss_total = None - results_all_modules = {} - for module_id in fwd_out: - module_batch = batch[module_id] - module_fwd_out = fwd_out[module_id] - - module_results = self._compute_loss_per_module( - module_id, module_batch, module_fwd_out - ) - results_all_modules[module_id] = module_results - loss = module_results[self.TOTAL_LOSS_KEY] - - if loss_total is None: - loss_total = loss - else: - loss_total += loss - - results_all_modules[self.TOTAL_LOSS_KEY] = loss_total - - return results_all_modules - def _compute_loss_per_module( self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> TensorType: @@ -131,18 +104,6 @@ def _compute_loss_per_module( "mean_kl_loss": mean_kl_loss, } - def additional_update(self, *args, **kwargs) -> Mapping[str, Any]: - # TODO (Kourosh): This is boiler plate code. Move it to the base class? - - results_all_modules = {} - for module_id in self._module.keys(): - module_results = self._additional_update_per_module( - module_id, *args, **kwargs - ) - results_all_modules[module_id] = module_results - - return results_all_modules - def _additional_update_per_module( self, module_id: str, sampled_kl_values: dict, timestep: int ) -> Mapping[str, Any]: From 7b5938bf23edc089fed1be9b84f71e752d9650d0 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jan 2023 21:41:40 -0800 Subject: [PATCH 040/112] fixed torch import Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/torch/torch_rl_trainer.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 695b29b2891e..4a92eabadaee 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -9,10 +9,6 @@ Optional, Callable, ) -import torch - -from ray.air.config import ScalingConfig -from ray.train.torch.train_loop_utils import _TorchAccelerator from ray.rllib.core.rl_module.rl_module import RLModule, ModuleID from ray.rllib.core.rl_trainer.rl_trainer import ( @@ -28,6 +24,13 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict +from ray.rllib.utils.framework import try_import_torch + +torch, nn = try_import_torch() + +if torch: + from ray.air.config import ScalingConfig + from ray.train.torch.train_loop_utils import _TorchAccelerator logger = logging.getLogger(__name__) From f4cbe5a0e76c6fe50f012e634e92462b1b0734b9 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 20 Jan 2023 08:56:54 -0800 Subject: [PATCH 041/112] removed the override decorator for nn.Module Signed-off-by: Kourosh Hakhamaneshi --- 
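The torch-import fix above follows RLlib's guarded-import convention, which in isolation looks roughly like this (the body under `if torch:` is a placeholder):

    from ray.rllib.utils.framework import try_import_torch

    # try_import_torch() hands back placeholder values when torch is not
    # installed, so this module can still be imported on torch-less machines.
    torch, nn = try_import_torch()

    if torch:
        # Torch-only imports and setup go here, mirroring the diff above
        # (e.g. pulling in Ray Train's torch utilities).
        pass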
rllib/core/rl_module/torch/torch_rl_module.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rllib/core/rl_module/torch/torch_rl_module.py b/rllib/core/rl_module/torch/torch_rl_module.py index 39f15c79cc51..03f27693ce29 100644 --- a/rllib/core/rl_module/torch/torch_rl_module.py +++ b/rllib/core/rl_module/torch/torch_rl_module.py @@ -11,7 +11,6 @@ def __init__(self, *args, **kwargs) -> None: nn.Module.__init__(self) RLModule.__init__(self, *args, **kwargs) - @override(nn.Module) def forward(self, batch: Mapping[str, Any], **kwargs) -> Mapping[str, Any]: """forward pass of the module. From 9645137491d61bdfd409101f213a69b3ac358bb6 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 20 Jan 2023 09:36:58 -0800 Subject: [PATCH 042/112] added unittest (wip) Signed-off-by: Kourosh Hakhamaneshi --- .../ppo/tests/test_ppo_rl_trainer.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py new file mode 100644 index 000000000000..c6f60eed8c1b --- /dev/null +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py @@ -0,0 +1,81 @@ + +import ray +import unittest +import numpy as np + +import ray.rllib.algorithms.ppo as ppo +from ray.rllib.policy.sample_batch import SampleBatch + +from ray.rllib.evaluation.postprocessing import ( + compute_gae_for_sample_batch, +) + +# Fake CartPole episode of n time steps. +FAKE_BATCH = SampleBatch( + { + SampleBatch.OBS: np.array( + [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]], + dtype=np.float32, + ), + SampleBatch.ACTIONS: np.array([0, 1, 1]), + SampleBatch.PREV_ACTIONS: np.array([0, 1, 1]), + SampleBatch.REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32), + SampleBatch.PREV_REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32), + SampleBatch.TERMINATEDS: np.array([False, False, True]), + SampleBatch.TRUNCATEDS: np.array([False, False, False]), + SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32), + SampleBatch.ACTION_DIST_INPUTS: np.array( + [[-2.0, 0.5], [-3.0, -0.3], [-0.1, 2.5]], dtype=np.float32 + ), + SampleBatch.ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32), + SampleBatch.EPS_ID: np.array([0, 0, 0]), + SampleBatch.AGENT_INDEX: np.array([0, 0, 0]), + } +) + + +class TestPPO(unittest.TestCase): + @classmethod + def setUpClass(cls): + ray.init() + + @classmethod + def tearDownClass(cls): + ray.shutdown() + + + def test_loss(self): + config = ( + ppo.PPOConfig() + .environment("CartPole-v1") + .framework("torch") + .rollouts( + num_rollout_workers=0, + ) + .training( + gamma=0.99, + model=dict( + fcnet_hiddens=[10], + fcnet_activation="linear", + vf_share_layers=True, + ), + ) + .rl_module( + _enable_rl_module_api=True, + ) + ) + + trainer = config.build() + policy = trainer.get_policy() + train_batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy()) + + policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) + rl_trainer_loss = ... 
+ breakpoint() + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", __file__])) From eac5223d902a2bbc13b3ba7571f4a50dbe4d8efc Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 20 Jan 2023 09:48:28 -0800 Subject: [PATCH 043/112] fixed import torch in bc_module.py Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/testing/torch/bc_module.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/rllib/core/testing/torch/bc_module.py b/rllib/core/testing/torch/bc_module.py index 986ef9659da8..bdbf8ced9a37 100644 --- a/rllib/core/testing/torch/bc_module.py +++ b/rllib/core/testing/torch/bc_module.py @@ -1,17 +1,16 @@ import gymnasium as gym from typing import Any, Mapping, Union -import torch.nn as nn -import torch -from torch.distributions import Categorical - from ray.rllib.core.rl_module import RLModule from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule from ray.rllib.models.specs.specs_torch import TorchTensorSpec from ray.rllib.models.specs.typing import SpecType from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.nested_dict import NestedDict +torch, nn = try_import_torch() + class DiscreteBCTorchModule(TorchRLModule): def __init__( @@ -68,7 +67,7 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]: @override(RLModule) def _forward_train(self, batch: NestedDict) -> Mapping[str, Any]: action_logits = self.policy(batch["obs"]) - return {"action_dist": Categorical(logits=action_logits)} + return {"action_dist": torch.distributions.Categorical(logits=action_logits)} @classmethod @override(RLModule) From 0baa3cfb485e05f47aaad60e6e6650fb290c9b4b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 20 Jan 2023 11:11:44 -0800 Subject: [PATCH 044/112] fixed the bazel bug where the working directory gets switched to where the unittest is locateed and import torch would import the relative torch module instead of the global torch moduel Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 2 +- rllib/core/rl_trainer/{tests/torch => torch/tests}/__init__.py | 0 .../{tests/torch => torch/tests}/test_torch_rl_trainer.py | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename rllib/core/rl_trainer/{tests/torch => torch/tests}/__init__.py (100%) rename rllib/core/rl_trainer/{tests/torch => torch/tests}/test_torch_rl_trainer.py (100%) diff --git a/rllib/BUILD b/rllib/BUILD index 037f25f60477..f1c52cf91da6 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1858,7 +1858,7 @@ py_test( name = "test_torch_rl_trainer", tags = ["team:rllib", "core"], size = "medium", - srcs = ["core/rl_trainer/tests/torch/test_torch_rl_trainer.py"] + srcs = ["core/rl_trainer/torch/tests/test_torch_rl_trainer.py"] ) # -------------------------------------------------------------------- diff --git a/rllib/core/rl_trainer/tests/torch/__init__.py b/rllib/core/rl_trainer/torch/tests/__init__.py similarity index 100% rename from rllib/core/rl_trainer/tests/torch/__init__.py rename to rllib/core/rl_trainer/torch/tests/__init__.py diff --git a/rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py similarity index 100% rename from rllib/core/rl_trainer/tests/torch/test_torch_rl_trainer.py rename to rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py From c125e208198ab4da558a5c035db497eeb8b2f7fb Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 20 Jan 2023 
12:23:53 -0800 Subject: [PATCH 045/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/ppo/ppo.py | 11 +++ .../ppo/tests/test_ppo_rl_trainer.py | 67 ++++++++++++------- 2 files changed, 54 insertions(+), 24 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 7b19a2f6e46f..21bfcdfe5f0d 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -127,6 +127,17 @@ def get_default_rl_module_class(self) -> Union[Type["RLModule"], str]: else: raise ValueError(f"The framework {self.framework_str} is not supported.") + @override(AlgorithmConfig) + def get_default_rl_trainer_class(self) -> Union[Type["RLTrainer"], str]: + if self.framework_str == "torch": + from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_trainer import ( + PPOTorchRLTrainer + ) + + return PPOTorchRLTrainer + else: + raise ValueError(f"The framework {self.framework_str} is not supported.") + @override(AlgorithmConfig) def training( self, diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py index c6f60eed8c1b..c706c8460050 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py @@ -2,8 +2,11 @@ import ray import unittest import numpy as np +import torch +import tree # pip install dm-tree import ray.rllib.algorithms.ppo as ppo +from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_trainer import PPOTorchRLTrainer from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.evaluation.postprocessing import ( @@ -11,27 +14,25 @@ ) # Fake CartPole episode of n time steps. -FAKE_BATCH = SampleBatch( - { - SampleBatch.OBS: np.array( - [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]], - dtype=np.float32, - ), - SampleBatch.ACTIONS: np.array([0, 1, 1]), - SampleBatch.PREV_ACTIONS: np.array([0, 1, 1]), - SampleBatch.REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32), - SampleBatch.PREV_REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32), - SampleBatch.TERMINATEDS: np.array([False, False, True]), - SampleBatch.TRUNCATEDS: np.array([False, False, False]), - SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32), - SampleBatch.ACTION_DIST_INPUTS: np.array( - [[-2.0, 0.5], [-3.0, -0.3], [-0.1, 2.5]], dtype=np.float32 - ), - SampleBatch.ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32), - SampleBatch.EPS_ID: np.array([0, 0, 0]), - SampleBatch.AGENT_INDEX: np.array([0, 0, 0]), - } -) +FAKE_BATCH = { + SampleBatch.OBS: np.array( + [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]], + dtype=np.float32, + ), + SampleBatch.ACTIONS: np.array([0, 1, 1]), + SampleBatch.PREV_ACTIONS: np.array([0, 1, 1]), + SampleBatch.REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32), + SampleBatch.PREV_REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32), + SampleBatch.TERMINATEDS: np.array([False, False, True]), + SampleBatch.TRUNCATEDS: np.array([False, False, False]), + SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32), + SampleBatch.ACTION_DIST_INPUTS: {"logits": np.array( + [[-2.0, 0.5], [-3.0, -0.3], [-0.1, 2.5]], dtype=np.float32 + )}, + SampleBatch.ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32), + SampleBatch.EPS_ID: np.array([0, 0, 0]), + SampleBatch.AGENT_INDEX: np.array([0, 0, 0]), +} class TestPPO(unittest.TestCase): @@ -45,6 +46,7 @@ def tearDownClass(cls): def test_loss(self): + config = ( ppo.PPOConfig() .environment("CartPole-v1") @@ -57,7 +59,7 @@ def test_loss(self): 
model=dict( fcnet_hiddens=[10], fcnet_activation="linear", - vf_share_layers=True, + vf_share_layers=False, ), ) .rl_module( @@ -67,10 +69,27 @@ def test_loss(self): trainer = config.build() policy = trainer.get_policy() - train_batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy()) + + train_batch = SampleBatch(FAKE_BATCH) + train_batch = compute_gae_for_sample_batch(policy, train_batch) + + # convert to torch tensors with tree.map_structure + train_batch = tree.map_structure( + lambda x: torch.as_tensor(x).float(), train_batch + ) policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) - rl_trainer_loss = ... + + config.training( + _enable_rl_trainer_api=True + ) + config.validate() + config.freeze() + trainer_runner_config = config.get_trainer_runner_config( + policy.observation_space, policy.action_space + ) + trainer_runner = trainer_runner_config.build() + results = trainer_runner.update(train_batch.as_multi_agent()) breakpoint() From c77be0ce7be41248f1efc1b99a70dc0ef04e6f1c Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 20 Jan 2023 17:29:31 -0800 Subject: [PATCH 046/112] fixed the unittest Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py | 13 ++++++++++++- rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py | 13 ++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py index c706c8460050..2f2b09520760 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py @@ -8,6 +8,8 @@ import ray.rllib.algorithms.ppo as ppo from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_trainer import PPOTorchRLTrainer from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.test_utils import check from ray.rllib.evaluation.postprocessing import ( compute_gae_for_sample_batch, @@ -89,8 +91,17 @@ def test_loss(self): policy.observation_space, policy.action_space ) trainer_runner = trainer_runner_config.build() + + # load the policy weights into the trainer runner + state_dict = {"module_state": {"default_policy": policy.get_weights()}} + state_dict = convert_to_torch_tensor(state_dict) + trainer_runner.set_state(state_dict) results = trainer_runner.update(train_batch.as_multi_agent()) - breakpoint() + + trainer_runner_loss = results[0]["loss"]["total_loss"] + + check(trainer_runner_loss, policy_loss) + if __name__ == "__main__": diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index cd2f7c2e4359..6aab20373d89 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -1,3 +1,4 @@ +import logging from typing import Mapping, Any, Union from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer @@ -5,13 +6,13 @@ from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_utils import ( - warn_if_infinite_kl_divergence, explained_variance, ) from ray.rllib.utils.typing import TensorType torch, nn = try_import_torch() +logger = logging.getLogger(__name__) class PPOTorchRLTrainer(TorchRLTrainer): def __init__(self, *args, **kwargs): @@ -37,6 +38,7 @@ def __init__(self, *args, **kwargs): def _compute_loss_per_module( self, module_id: str, 
batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> TensorType: + # TODO (Kourosh): batch type is NestedDict. # TODO (Kourosh): We may or may not user module_id. For example if we have an # agent based learning rate scheduler, we may want to use module_id to get the # learning rate for that agent. @@ -44,7 +46,7 @@ def _compute_loss_per_module( curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] action_dist_class = type(fwd_out[SampleBatch.ACTION_DIST]) - prev_action_dist = action_dist_class(**batch[SampleBatch.ACTION_DIST_INPUTS]) + prev_action_dist = action_dist_class(**batch[SampleBatch.ACTION_DIST_INPUTS].asdict()) logp_ratio = torch.exp( fwd_out[SampleBatch.ACTION_LOGP] - batch[SampleBatch.ACTION_LOGP] @@ -54,9 +56,10 @@ def _compute_loss_per_module( if self.config.kl_coeff > 0.0: action_kl = prev_action_dist.kl(curr_action_dist) mean_kl_loss = torch.mean(action_kl) - # TODO smorad: should we do anything besides warn? Could discard KL term - # for this update - warn_if_infinite_kl_divergence(self, mean_kl_loss) + if mean_kl_loss.isinf(): + logger.warning( + "KL divergence is non-finite, this will likely destabilize your model and the training process. Action(s) in a specific state have near-zero probability. This can happen naturally in deterministic environments where the optimal policy has zero mass for a specific action. To fix this issue, consider setting the coefficient for the KL loss term to zero or increasing policy entropy." + ) else: mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device) From 0e6f511d7604a410f87786136e91a7a77027b026 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 20 Jan 2023 18:45:07 -0800 Subject: [PATCH 047/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 47d50a12db8c..34b071910e30 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -674,14 +674,26 @@ def setup(self, config: AlgorithmConfig) -> None: # Need to add back method_type in case Algorithm is restored from checkpoint method_config["type"] = method_type + + local_worker = self.workers.local_worker() + marl_config = {"modules": {}} + + breakpoint() + for pid, policy in local_worker.policy_map: + marl_config["modules"][pid] = { + "module_class": policy.config["rl_module_class"], + "observation_space": policy.observation_space, + "action_space": policy.action_space, + "model_config": policy.config["model"] + } + + from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule + marl = MultiAgentRLModule.from_multi_agent_config(marl_config) + breakpoint() + self.trainer_runner = None if self.config._enable_rl_trainer_api: - policy = self.get_policy() - observation_space = policy.observation_space - action_space = policy.action_space - trainer_runner_config = self.config.get_trainer_runner_config( - observation_space, action_space - ) + trainer_runner_config = self.config.get_trainer_runner_config() self.trainer_runner = trainer_runner_config.build() # Run `on_algorithm_init` callback after initialization is done. 
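The wip hunk above derives a multi-agent module config from the local worker's policy map. A hedged sketch of that idea as a standalone helper; the attribute and config-key names are copied from the hunk and are assumptions about the eventual API, and a dict-like policy map would normally be iterated with `.items()`:

    # Hedged sketch only: build a MultiAgentRLModule config dict from an existing
    # policy map. Field names mirror the wip hunk above; they are not a stable API.
    def marl_config_from_policy_map(policy_map) -> dict:
        modules = {}
        for pid, policy in policy_map.items():  # .items() yields (id, policy) pairs
            modules[pid] = {
                "module_class": policy.config["rl_module_class"],
                "observation_space": policy.observation_space,
                "action_space": policy.action_space,
                "model_config": policy.config["model"],
            }
        return {"modules": modules}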
From 4bdd949772ad49d15fe25d228485136a6d1915a6 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 23 Jan 2023 19:50:57 -0800 Subject: [PATCH 048/112] added dataclass specs for RLModule and MARLModule for easier construction from RLTrainer side Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_module/marl_module.py | 102 ++++++++++-------- rllib/core/rl_module/rl_module.py | 36 ++++--- .../core/rl_module/tests/test_marl_module.py | 100 ++++++++++------- rllib/core/rl_trainer/rl_trainer.py | 52 +++++---- .../core/rl_trainer/tests/test_rl_trainer.py | 14 +-- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 22 ++-- .../torch/tests/test_torch_rl_trainer.py | 13 +-- .../core/rl_trainer/torch/torch_rl_trainer.py | 22 ++-- 8 files changed, 214 insertions(+), 147 deletions(-) diff --git a/rllib/core/rl_module/marl_module.py b/rllib/core/rl_module/marl_module.py index 1a1f4f6d9648..a6b625a9b382 100644 --- a/rllib/core/rl_module/marl_module.py +++ b/rllib/core/rl_module/marl_module.py @@ -1,14 +1,15 @@ import copy +from dataclasses import dataclass import pprint -from typing import Iterator, Mapping, Any, Union, Dict +from typing import Iterator, Mapping, Any, Union, Dict, Optional, Type from ray.util.annotations import PublicAPI -from ray.rllib.utils.annotations import override +from ray.rllib.utils.annotations import override, ExperimentalAPI from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.models.specs.specs_dict import SpecDict from ray.rllib.policy.sample_batch import MultiAgentBatch -from ray.rllib.core.rl_module import RLModule +from ray.rllib.core.rl_module.rl_module import RLModule, SingleAgentRLModuleSpec # TODO (Kourosh): change this to module_id later to enforce consistency from ray.rllib.utils.policy import validate_policy_id @@ -16,6 +17,23 @@ ModuleID = str +@ExperimentalAPI +@dataclass +class MultiAgentRLModuleSpec: + """A utility spec class to make it constructing RLModules (in multi-agent case) easier. + + Args: + module_class: ... + module_specs: ... + """ + + module_class: Optional[Type["MultiAgentRLModule"]] = None + module_specs: Optional[Dict[ModuleID, SingleAgentRLModuleSpec]] = None + + def build(self) -> "MultiAgentRLModule": + return self.module_class.from_multi_agent_config({"modules": self.module_specs}) + + def _get_module_configs(config: Dict[str, Any]): """Constructs a mapping from module_id to module config. @@ -26,8 +44,9 @@ def _get_module_configs(config: Dict[str, Any]): module_specs = config.pop("modules", {}) for common_spec in config: for module_spec in module_specs.values(): - if common_spec not in module_spec: - module_spec[common_spec] = config[common_spec] + if getattr(module_spec, common_spec) is None: + setattr(module_spec, common_spec, config[common_spec]) + return module_specs @@ -65,9 +84,8 @@ def from_multi_agent_config(cls, config: Mapping[str, Any]) -> "MultiAgentRLModu """Creates a MultiAgentRLModule from a multi-agent config. The input config should contain "modules" key that is a mapping from module_id - to the module spec for each RLModule. The module spec should be a dict with the - following keys: `module_class`, `observation_space`, `action_space`, - `model_config`. If there are multiple modules that do share the same + to the module spec for each RLModule which is a SingleAgentRLModuleSpec object. 
+ If there are multiple modules that do share the same `observation_space`, `action_space`, or `model_config`, you can specify these keys at the top level of the config, and the module spec will inherit the values from the top level config. @@ -78,16 +96,16 @@ def from_multi_agent_config(cls, config: Mapping[str, Any]) -> "MultiAgentRLModu config = { "modules": { - "module_1": { - "module_class": "RLModule1", - "observation_space": gym.spaces.Box(...), - "action_space": gym.spaces.Discrete(...), - "model_config": {hidden_dim: 256} - }, - "module_2": { - "module_class": "RLModule2", - "observation_space": gym.spaces.Box(...), - } + "module_1": SingleAgentRLModuleSpec( + module_class="RLModule1", + observation_space=gym.spaces.Box(...), + action_space=gym.spaces.Discrete(...), + model_config={hidden_dim: 256} + ) + "module_2": SingleAgentRLModuleSpec( + module_class="RLModule2", + observation_space=gym.spaces.Box(...), + ) }, "action_space": gym.spaces.Box(...), "model_config": {hidden_dim: 32} @@ -97,17 +115,17 @@ def from_multi_agent_config(cls, config: Mapping[str, Any]) -> "MultiAgentRLModu config = { "modules": { - "module_1": { - "module_class": "RLModule1", - "observation_space": gym.spaces.Box(...), - "action_space": gym.spaces.Discrete(...), - "model_config": {hidden_dim: 256} - }, - "module_2": { - "module_class": "RLModule2", - "observation_space": gym.spaces.Box(...), - "action_space": gym.spaces.Box(...), # Inherited - "model_config": {hidden_dim: 32} # Inherited + "module_1": SingleAgentRLModuleSpec( + module_class="RLModule1", + observation_space=gym.spaces.Box(...), + action_space=gym.spaces.Discrete(...), + model_config={hidden_dim: 256} + ) + "module_2": SingleAgentRLModuleSpec( + module_class="RLModule2", + observation_space=gym.spaces.Box(...), + action_space=gym.spaces.Box(...), # Inherited + model_config={hidden_dim: 32} # Inherited } }, } @@ -126,8 +144,9 @@ def from_multi_agent_config(cls, config: Mapping[str, Any]) -> "MultiAgentRLModu multiagent_module = cls() for module_id, module_spec in module_configs.items(): - module_cls: RLModule = module_spec.pop("module_class") - module = module_cls.from_model_config(**module_spec) + # module_cls: RLModule = module_spec.pop("module_class") + # module = module_cls.from_model_config(**module_spec) + module = module_spec.build() multiagent_module.add_module(module_id, module) return multiagent_module @@ -136,9 +155,7 @@ def from_multi_agent_config(cls, config: Mapping[str, Any]) -> "MultiAgentRLModu def __check_module_configs(cls, module_configs: Dict[ModuleID, Any]): """Checks the module configs for validity. - The module_configs be a mapping from module_ids to a dict that contains the - following required keys: `module_class`, `observation_space`, `action_space`, - `model_config`. + The module_configs be a mapping from module_ids to SingleAgentRLModuleSpec objects. Args: module_configs: The module configs to check. @@ -146,19 +163,12 @@ def __check_module_configs(cls, module_configs: Dict[ModuleID, Any]): Raises: ValueError: If the module configs are invalid. """ - REQUIRED_KEYS = { - "module_class", - "observation_space", - "action_space", - "model_config", - } for module_id, module_spec in module_configs.items(): - for module_key in REQUIRED_KEYS: - if module_key not in module_spec: - raise ValueError( - f"Module config for module_id {module_id} is missing " - f"required key {module_key}." 
- ) + if not isinstance(module_spec, SingleAgentRLModuleSpec): + raise ValueError( + f"Module config for module_id {module_id} is missing " + f"required key {module_key}." + ) def keys(self) -> Iterator[ModuleID]: """Returns an iteratable of module ids.""" diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index c9c69443eebe..3d60fb62ee58 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -1,7 +1,7 @@ import abc from dataclasses import dataclass import gymnasium as gym -from typing import Mapping, Any, TYPE_CHECKING, Union +from typing import Mapping, Any, TYPE_CHECKING, Union, Optional, Type, Dict if TYPE_CHECKING: from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule @@ -26,23 +26,27 @@ @ExperimentalAPI @dataclass -class RLModuleConfig: - """Configuration for the PPO module. - - # TODO (Kourosh): Whether we need this or not really depends on how the catalog - # design end up being. - - Attributes: - observation_space: The observation space of the environment. - action_space: The action space of the environment. - max_seq_len: Max seq len for training an RNN model. - (TODO (Kourosh) having max_seq_len here seems a bit unnatural, can we rethink - this design?) +class SingleAgentRLModuleSpec: + """A utility spec class to make it constructing RLModules (in single-agent case) easier. + + Args: + module_class: ... + observation_space: ... + action_space: ... + model_config: ... """ - observation_space: gym.Space = None - action_space: gym.Space = None - max_seq_len: int = None + module_class: Optional[Type["RLModule"]] = None + observation_space: Optional["gym.Space"] = None + action_space: Optional["gym.Space"] = None + model_config: Optional[Dict[str, Any]] = None + + def build(self) -> "RLModule": + return self.module_class.from_model_config( + observation_space=self.observation_space, + action_space=self.action_space, + model_config=self.model_config, + ) @ExperimentalAPI diff --git a/rllib/core/rl_module/tests/test_marl_module.py b/rllib/core/rl_module/tests/test_marl_module.py index 3b3c7e5fdb7e..fa7a2f030525 100644 --- a/rllib/core/rl_module/tests/test_marl_module.py +++ b/rllib/core/rl_module/tests/test_marl_module.py @@ -1,11 +1,13 @@ import unittest +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule, _get_module_configs from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule from ray.rllib.env.multi_agent_env import make_multi_agent from ray.rllib.utils.test_utils import check + DEFAULT_POLICY_ID = "default_policy" @@ -39,14 +41,14 @@ def test_from_multi_agent_config(self): multi_agent_dict = { "modules": { - "module1": { - "module_class": DiscreteBCTorchModule, - "model_config": {"hidden_dim": 64}, - }, - "module2": { - "module_class": DiscreteBCTorchModule, - "model_config": {"hidden_dim": 32}, - }, + "module1": SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + model_config={"hidden_dim": 64}, + ), + "module2": SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + model_config={"hidden_dim": 32}, + ), }, "observation_space": env.observation_space, # this is common "action_space": env.action_space, # this is common @@ -160,57 +162,73 @@ def test_get_module_configs(self): config = { "modules": { - "1": {"module_class": "foo", "model_config": "bar"}, - "2": {"module_class": "foo2", "model_config": "bar2"}, + "1": SingleAgentRLModuleSpec( + **{"module_class": "foo", 
"model_config": "bar"} + ), + "2": SingleAgentRLModuleSpec( + **{"module_class": "foo2", "model_config": "bar2"} + ), }, "observation_space": "obs_space", "action_space": "action_space", } expected_config = { - "1": { - "module_class": "foo", - "model_config": "bar", - "observation_space": "obs_space", - "action_space": "action_space", - }, - "2": { - "module_class": "foo2", - "model_config": "bar2", - "observation_space": "obs_space", - "action_space": "action_space", - }, + "1": SingleAgentRLModuleSpec( + **{ + "module_class": "foo", + "model_config": "bar", + "observation_space": "obs_space", + "action_space": "action_space", + } + ), + "2": SingleAgentRLModuleSpec( + **{ + "module_class": "foo2", + "model_config": "bar2", + "observation_space": "obs_space", + "action_space": "action_space", + } + ), } self.assertDictEqual(_get_module_configs(config), expected_config) config = { "modules": { - "1": { - "module_class": "foo", - "model_config": "bar", - "observation_space": "obs_space1", # won't get overwritten - "action_space": "action_space1", # won't get overwritten - }, - "2": {"module_class": "foo2", "model_config": "bar2"}, + "1": SingleAgentRLModuleSpec( + **{ + "module_class": "foo", + "model_config": "bar", + "observation_space": "obs_space1", # won't get overwritten + "action_space": "action_space1", # won't get overwritten + } + ), + "2": SingleAgentRLModuleSpec( + **{"module_class": "foo2", "model_config": "bar2"} + ), }, "observation_space": "obs_space", "action_space": "action_space", } expected_config = { - "1": { - "module_class": "foo", - "model_config": "bar", - "observation_space": "obs_space1", - "action_space": "action_space1", - }, - "2": { - "module_class": "foo2", - "model_config": "bar2", - "observation_space": "obs_space", - "action_space": "action_space", - }, + "1": SingleAgentRLModuleSpec( + **{ + "module_class": "foo", + "model_config": "bar", + "observation_space": "obs_space1", + "action_space": "action_space1", + } + ), + "2": SingleAgentRLModuleSpec( + **{ + "module_class": "foo2", + "model_config": "bar2", + "observation_space": "obs_space", + "action_space": "action_space", + } + ), } self.assertDictEqual(_get_module_configs(config), expected_config) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index b86168790555..7f72a68ef074 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -18,8 +18,15 @@ ) from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.core.rl_module.rl_module import RLModule, ModuleID -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule +from ray.rllib.core.rl_module.rl_module import ( + RLModule, + ModuleID, + SingleAgentRLModuleSpec, +) +from ray.rllib.core.rl_module.marl_module import ( + MultiAgentRLModule, + MultiAgentRLModuleSpec, +) from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy @@ -105,9 +112,12 @@ class RLTrainer: def __init__( self, - module_class: Union[Type[RLModule], Type[MultiAgentRLModule]], - module_kwargs: Mapping[str, Any], - optimizer_config: Mapping[str, Any], + *, + module_spec: Optional[ + Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] + ] = None, + module: Optional[RLModule] = None, + optimizer_config: Mapping[str, Any] = None, distributed: bool = False, scaling_config: Optional["ScalingConfig"] = None, algorithm_config: Optional["AlgorithmConfig"] = None, 
@@ -117,8 +127,18 @@ def __init__( # understand it. If we can find a better way to make subset of the config # available to the trainer, that would be great. # TODO (Kourosh): convert optimizer configs to dataclasses - self.module_class = module_class - self.module_kwargs = module_kwargs + if module_spec is not None and module is not None: + raise ValueError( + "Only one of module spec or module can be provided to RLTrainer." + ) + + if module_spec is None and module is None: + raise ValueError( + "Either module_spec or module should be provided to RLTrainer." + ) + + self.module_spec = module_spec + self.module_obj = module self.optimizer_config = optimizer_config self.distributed = distributed self.scaling_config = scaling_config @@ -420,7 +440,7 @@ def add_module( set_optimizer_fn: Optional[Callable[[RLModule], ParamOptimizerPairs]] = None, optimizer_cls: Optional[Type[Optimizer]] = None, ) -> None: - """Add a module to the trainer. + """Add a module to the underlying MultiAgentRLModule and the trainer. Args: module_id: The id of the module to add. @@ -492,19 +512,11 @@ def _make_module(self) -> MultiAgentRLModule: Returns: The constructed module. """ - - if issubclass(self.module_class, MultiAgentRLModule): - module = self.module_class.from_multi_agent_config(**self.module_kwargs) - elif issubclass(self.module_class, RLModule): - module = self.module_class.from_model_config( - **self.module_kwargs - ).as_multi_agent() + if self.module_obj is not None: + module = self.module_obj else: - raise ValueError( - f"Module class {self.module_class} is not a subclass of " - f"RLModule or MultiAgentRLModule." - ) - + module = self.module_spec.build() + module = module.as_multi_agent() return module def build(self) -> None: diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index db071ad2ed83..4f07ad36c754 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -5,11 +5,13 @@ import ray +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader +from ray.rllib.core.testing.utils import add_module_to_runner_or_trainer def get_trainer(distributed=False) -> RLTrainer: @@ -21,12 +23,12 @@ def get_trainer(distributed=False) -> RLTrainer: # and internally it will serialize and deserialize the module for distributed # construction. 
trainer = BCTfRLTrainer( - module_class=DiscreteBCTFModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, + module_spec=SingleAgentRLModuleSpec( + module_class=DiscreteBCTFModule, + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"hidden_dim": 32}, + ), optimizer_config={"lr": 1e-3}, distributed=distributed, ) diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 442932bc00fb..802fb6af4e37 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -14,14 +14,21 @@ from ray.rllib.core.rl_trainer.rl_trainer import ( RLTrainer, - MultiAgentRLModule, ParamOptimizerPairs, ParamRef, Optimizer, ParamType, ParamDictType, ) -from ray.rllib.core.rl_module.rl_module import RLModule, ModuleID +from ray.rllib.core.rl_module.rl_module import ( + RLModule, + ModuleID, + SingleAgentRLModuleSpec, +) +from ray.rllib.core.rl_module.marl_module import ( + MultiAgentRLModule, + MultiAgentRLModuleSpec, +) from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf @@ -86,8 +93,11 @@ class TfRLTrainer(RLTrainer): def __init__( self, - module_class: Union[Type[RLModule], Type[MultiAgentRLModule]], - module_kwargs: Mapping[str, Any], + *, + module_spec: Optional[ + Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] + ] = None, + module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any], distributed: bool = False, enable_tf_function: bool = True, @@ -95,8 +105,8 @@ def __init__( algorithm_config: Optional["AlgorithmConfig"] = None, ): super().__init__( - module_class=module_class, - module_kwargs=module_kwargs, + module_spec=module_spec, + module=module, optimizer_config=optimizer_config, distributed=distributed, scaling_config=scaling_config, diff --git a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py index 04835b86dbf5..d82838b8f06c 100644 --- a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py @@ -5,6 +5,7 @@ import ray +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule from ray.rllib.core.testing.torch.bc_rl_trainer import BCTorchRLTrainer @@ -26,12 +27,12 @@ def _get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: # and internally it will serialize and deserialize the module for distributed # construction. 
trainer = BCTorchRLTrainer( - module_class=DiscreteBCTorchModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, + module_spec=SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"hidden_dim": 32}, + ), scaling_config=scaling_config, optimizer_config={"lr": 1e-3}, distributed=distributed, diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 43bf72c73121..d6b0bf1d7d28 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -11,10 +11,17 @@ TYPE_CHECKING, ) -from ray.rllib.core.rl_module.rl_module import RLModule, ModuleID +from ray.rllib.core.rl_module.rl_module import ( + RLModule, + ModuleID, + SingleAgentRLModuleSpec, +) +from ray.rllib.core.rl_module.marl_module import ( + MultiAgentRLModule, + MultiAgentRLModuleSpec, +) from ray.rllib.core.rl_trainer.rl_trainer import ( RLTrainer, - MultiAgentRLModule, ParamOptimizerPairs, Optimizer, ParamType, @@ -45,16 +52,19 @@ class TorchRLTrainer(RLTrainer): def __init__( self, - module_class: Union[Type[RLModule], Type[MultiAgentRLModule]], - module_kwargs: Mapping[str, Any], + *, + module_spec: Optional[ + Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] + ] = None, + module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any], distributed: bool = False, scaling_config: Optional["ScalingConfig"] = None, algorithm_config: Optional["AlgorithmConfig"] = None, ): super().__init__( - module_class=module_class, - module_kwargs=module_kwargs, + module_spec=module_spec, + module=module, optimizer_config=optimizer_config, distributed=distributed, scaling_config=scaling_config, From 58bbe82452afb797bdf9e90549b6b137e3600a9e Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 23 Jan 2023 20:15:55 -0800 Subject: [PATCH 049/112] test trainer runner local passed Signed-off-by: Kourosh Hakhamaneshi --- .../tests/test_trainer_runner_local.py | 76 +++++++++---------- rllib/core/testing/utils.py | 49 ++++++++++-- 2 files changed, 76 insertions(+), 49 deletions(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py index 81b03b3a1ab4..3058f3c86f35 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py @@ -9,7 +9,11 @@ from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader -from ray.rllib.core.testing.utils import add_module_to_runner_or_trainer +from ray.rllib.core.testing.utils import ( + add_module_to_runner_or_trainer, + get_trainer_runner, + get_rl_trainer, +) tf1, tf, tfv = try_import_tf() @@ -31,47 +35,35 @@ def tearDown(cls) -> None: def test_trainer_runner_no_gpus(self): env = gym.make("CartPole-v1") - trainer_class = BCTfRLTrainer - trainer_cfg = dict( - module_class=DiscreteBCTFModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, - optimizer_config={"lr": 1e-3}, - ) - runner = TrainerRunner( - trainer_class, trainer_cfg, compute_config=dict(num_gpus=0) - ) - - local_trainer = 
trainer_class(**trainer_cfg) - local_trainer.build() - - # make the state of the trainer and the local runner identical - local_trainer.set_state(runner.get_state()[0]) - - reader = get_cartpole_dataset_reader(batch_size=500) - batch = reader.next() - batch = batch.as_multi_agent() - check(local_trainer.update(batch), runner.update(batch)[0]) - - new_module_id = "test_module" - - add_module_to_runner_or_trainer("tf", env, new_module_id, runner) - add_module_to_runner_or_trainer("tf", env, new_module_id, local_trainer) - - # make the state of the trainer and the local runner identical - local_trainer.set_state(runner.get_state()[0]) - - # do another update - batch = reader.next() - ma_batch = MultiAgentBatch( - {new_module_id: batch, DEFAULT_POLICY_ID: batch}, env_steps=batch.count - ) - check(local_trainer.update(ma_batch), runner.update(ma_batch)[0]) - - check(local_trainer.get_state(), runner.get_state()[0]) + for fw in ["tf", "torch"]: + runner = get_trainer_runner(fw, env, compute_config=dict(num_gpus=0)) + local_trainer = get_rl_trainer(fw, env) + local_trainer.build() + + # make the state of the trainer and the local runner identical + local_trainer.set_state(runner.get_state()[0]) + + reader = get_cartpole_dataset_reader(batch_size=500) + batch = reader.next() + batch = batch.as_multi_agent() + check(local_trainer.update(batch), runner.update(batch)[0]) + + new_module_id = "test_module" + + add_module_to_runner_or_trainer(fw, env, new_module_id, runner) + add_module_to_runner_or_trainer(fw, env, new_module_id, local_trainer) + + # make the state of the trainer and the local runner identical + local_trainer.set_state(runner.get_state()[0]) + + # do another update + batch = reader.next() + ma_batch = MultiAgentBatch( + {new_module_id: batch, DEFAULT_POLICY_ID: batch}, env_steps=batch.count + ) + check(local_trainer.update(ma_batch), runner.update(ma_batch)[0]) + + check(local_trainer.get_state(), runner.get_state()[0]) if __name__ == "__main__": diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 95e24758d62f..4493665e0091 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -1,9 +1,13 @@ from typing import Type, Union, TYPE_CHECKING +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner +from rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from rllib.core.rl_module.tests.test_marl_module import DEFAULT_POLICY_ID + if TYPE_CHECKING: import gymnasium as gym import torch @@ -44,6 +48,25 @@ def get_module_class(framework: str) -> Type["RLModule"]: raise ValueError(f"Unsupported framework: {framework}") +@DeveloperAPI +def get_module_spec(framework: str, env: "gym.Env", is_multi_agent: bool = False): + + spec = SingleAgentRLModuleSpec( + module_class=get_module_class(framework), + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"hidden_dim": 32}, + ) + + if is_multi_agent: + # TODO (Kourosh): Make this more multi-agent for example with policy ids "1", and "2". 
+        return MultiAgentRLModuleSpec(
+            module_class=MultiAgentRLModule, module_specs={DEFAULT_POLICY_ID: spec}
+        )
+    else:
+        return spec
+
+
 @DeveloperAPI
 def get_optimizer_default_class(framework: str) -> Type[Optimizer]:
     if framework == "tf":
@@ -58,18 +81,30 @@ def get_optimizer_default_class(framework: str) -> Type[Optimizer]:
         raise ValueError(f"Unsupported framework: {framework}")
 
 
+@DeveloperAPI
+def get_rl_trainer(
+    framework: str,
+    env: "gym.Env",
+    is_multi_agent: bool = False,
+) -> "RLTrainer":
+
+    _cls = get_trainer_class(framework)
+    spec = get_module_spec(framework=framework, env=env, is_multi_agent=is_multi_agent)
+    return _cls(module_spec=spec, optimizer_config={"lr": 0.1})
+
+
 @DeveloperAPI
 def get_trainer_runner(
-    framework: str, env: "gym.Env", compute_config: dict
+    framework: str,
+    env: "gym.Env",
+    compute_config: dict,
+    is_multi_agent: bool = False,
 ) -> TrainerRunner:
     trainer_class = get_trainer_class(framework)
     trainer_cfg = dict(
-        module_class=get_module_class(framework),
-        module_kwargs={
-            "observation_space": env.observation_space,
-            "action_space": env.action_space,
-            "model_config": {"hidden_dim": 32},
-        },
+        module_spec=get_module_spec(
+            framework=framework, env=env, is_multi_agent=is_multi_agent
+        ),
         optimizer_config={"lr": 0.1},
     )
     runner = TrainerRunner(trainer_class, trainer_cfg, compute_config=compute_config)

From 93b27ecd62800192c98ebe244c38b32948f10dc2 Mon Sep 17 00:00:00 2001
From: Kourosh Hakhamaneshi
Date: Mon, 23 Jan 2023 20:39:41 -0800
Subject: [PATCH 050/112] add_module() API is now updated to accept a module_spec instead of module_class and module_kwargs

Signed-off-by: Kourosh Hakhamaneshi
---
 rllib/core/rl_trainer/rl_trainer.py            |  5 ++---
 rllib/core/rl_trainer/tests/test_rl_trainer.py | 13 ++++++-------
 rllib/core/rl_trainer/tf/tf_rl_trainer.py      |  9 +++------
 .../torch/tests/test_torch_rl_trainer.py       | 13 ++++++-------
 .../core/rl_trainer/torch/torch_rl_trainer.py  |  6 ++----
 rllib/core/rl_trainer/trainer_runner.py        | 18 +++++++++---------
 rllib/core/testing/utils.py                    |  7 +------
 7 files changed, 29 insertions(+), 42 deletions(-)

diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py
index 7f72a68ef074..8969cab69a1e 100644
--- a/rllib/core/rl_trainer/rl_trainer.py
+++ b/rllib/core/rl_trainer/rl_trainer.py
@@ -435,8 +435,7 @@ def add_module(
         self,
         *,
         module_id: ModuleID,
-        module_cls: Type[RLModule],
-        module_kwargs: Mapping[str, Any],
+        module_spec: SingleAgentRLModuleSpec,
         set_optimizer_fn: Optional[Callable[[RLModule], ParamOptimizerPairs]] = None,
         optimizer_cls: Optional[Type[Optimizer]] = None,
     ) -> None:
@@ -455,7 +454,7 @@ def add_module(
                 should be provided.
""" self.__check_if_build_called() - module = module_cls.from_model_config(**module_kwargs) + module = module_spec.build() # construct a default set_optimizer_fn if not provided if set_optimizer_fn is None: diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index 4f07ad36c754..f6c6c954c554 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -127,13 +127,12 @@ def set_optimizer_fn(module): trainer.add_module( module_id="test", - module_cls=DiscreteBCTFModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - # the hidden size is different than the default module - "model_config": {"hidden_dim": 16}, - }, + module_spec=SingleAgentRLModuleSpec( + module_class=DiscreteBCTFModule, + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"hidden_dim": 16}, + ), set_optimizer_fn=set_optimizer_fn, ) diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 802fb6af4e37..44a8f6d21581 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -205,8 +205,7 @@ def add_module( self, *, module_id: ModuleID, - module_cls: Type[RLModule], - module_kwargs: Mapping[str, Any], + module_spec: SingleAgentRLModuleSpec, set_optimizer_fn: Optional[Callable[[RLModule], ParamOptimizerPairs]] = None, optimizer_cls: Optional[Type[Optimizer]] = None, ) -> None: @@ -214,16 +213,14 @@ def add_module( with self.strategy.scope(): super().add_module( module_id=module_id, - module_cls=module_cls, - module_kwargs=module_kwargs, + module_spec=module_spec, set_optimizer_fn=set_optimizer_fn, optimizer_cls=optimizer_cls, ) else: super().add_module( module_id=module_id, - module_cls=module_cls, - module_kwargs=module_kwargs, + module_spec=module_spec, set_optimizer_fn=set_optimizer_fn, optimizer_cls=optimizer_cls, ) diff --git a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py index d82838b8f06c..40806048b6dc 100644 --- a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py @@ -130,13 +130,12 @@ def set_optimizer_fn(module): trainer.add_module( module_id="test", - module_cls=DiscreteBCTorchModule, - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - # the hidden size is different than the default module - "model_config": {"hidden_dim": 16}, - }, + module_spec=SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"hidden_dim": 16}, + ), set_optimizer_fn=set_optimizer_fn, ) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index d6b0bf1d7d28..b8313b47a584 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -193,15 +193,13 @@ def add_module( self, *, module_id: ModuleID, - module_cls: Type[RLModule], - module_kwargs: Mapping[str, Any], + module_spec: SingleAgentRLModuleSpec, set_optimizer_fn: Optional[Callable[[RLModule], ParamOptimizerPairs]] = None, optimizer_cls: Optional[Type[Optimizer]] = None, ) -> None: super().add_module( module_id=module_id, - module_cls=module_cls, - module_kwargs=module_kwargs, + 
module_spec=module_spec, set_optimizer_fn=set_optimizer_fn, optimizer_cls=optimizer_cls, ) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 59fba884ab7d..c6fcda3deacf 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -3,7 +3,11 @@ import ray -from ray.rllib.core.rl_module.rl_module import RLModule, ModuleID +from ray.rllib.core.rl_module.rl_module import ( + RLModule, + ModuleID, + SingleAgentRLModuleSpec, +) from ray.rllib.core.rl_trainer.rl_trainer import ( RLTrainer, ParamOptimizerPairs, @@ -165,8 +169,7 @@ def add_module( self, *, module_id: ModuleID, - module_cls: Type[RLModule], - module_kwargs: Mapping[str, Any], + module_spec: SingleAgentRLModuleSpec, set_optimizer_fn: Optional[Callable[[RLModule], ParamOptimizerPairs]] = None, optimizer_cls: Optional[Type[Optimizer]] = None, ) -> None: @@ -174,8 +177,7 @@ def add_module( Args: module_id: The id of the module to add. - module_cls: The module class to add. - module_kwargs: The config for the module. + module_spec: #TODO (Kourosh) fill in here. set_optimizer_fn: A function that takes in the module and returns a list of (param, optimizer) pairs. Each element in the tuple describes a parameter group that share the same optimizer object, if None, the @@ -189,8 +191,7 @@ def add_module( for worker in self._workers: ref = worker.add_module.remote( module_id=module_id, - module_cls=module_cls, - module_kwargs=module_kwargs, + module_spec=module_spec, set_optimizer_fn=set_optimizer_fn, optimizer_cls=optimizer_cls, ) @@ -199,8 +200,7 @@ def add_module( else: self._trainer.add_module( module_id=module_id, - module_cls=module_cls, - module_kwargs=module_kwargs, + module_spec=module_spec, set_optimizer_fn=set_optimizer_fn, optimizer_cls=optimizer_cls, ) diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 4493665e0091..6530f04680ef 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -121,11 +121,6 @@ def add_module_to_runner_or_trainer( ): runner_or_trainer.add_module( module_id=module_id, - module_cls=get_module_class(framework), - module_kwargs={ - "observation_space": env.observation_space, - "action_space": env.action_space, - "model_config": {"hidden_dim": 32}, - }, + module_spec=get_module_spec(framework, env, is_multi_agent=False), optimizer_cls=get_optimizer_default_class(framework), ) From 7aaee1857158a840c6097abefc8ad7886d7769bc Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 23 Jan 2023 22:46:57 -0800 Subject: [PATCH 051/112] get_trainer_runner_config() now gets an optional ModuleSpec object Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm.py | 27 +++-- rllib/algorithms/algorithm_config.py | 31 ++++-- .../tests/test_trainer_runner_config.py | 16 +-- .../core/rl_trainer/trainer_runner_config.py | 99 +++++++------------ 4 files changed, 77 insertions(+), 96 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 34b071910e30..1c3802cbf116 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -674,22 +674,21 @@ def setup(self, config: AlgorithmConfig) -> None: # Need to add back method_type in case Algorithm is restored from checkpoint method_config["type"] = method_type + if self.config._enable_rl_module_api: + local_worker = self.workers.local_worker() + marl_config = {"modules": {}} + + for pid, policy in local_worker.policy_map.items(): + marl_config["modules"][pid] = { + 
"module_class": policy.config["rl_module_class"], + "observation_space": policy.observation_space, + "action_space": policy.action_space, + "model_config": policy.config["model"], + } - local_worker = self.workers.local_worker() - marl_config = {"modules": {}} - - breakpoint() - for pid, policy in local_worker.policy_map: - marl_config["modules"][pid] = { - "module_class": policy.config["rl_module_class"], - "observation_space": policy.observation_space, - "action_space": policy.action_space, - "model_config": policy.config["model"] - } + from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule - from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule - marl = MultiAgentRLModule.from_multi_agent_config(marl_config) - breakpoint() + marl = MultiAgentRLModule.from_multi_agent_config(marl_config) self.trainer_runner = None if self.config._enable_rl_trainer_api: diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index cd720babb0ca..762d2244d805 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -16,7 +16,11 @@ import ray from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.core.rl_trainer import TrainerRunnerConfig +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.core.rl_trainer.trainer_runner_config import ( + TrainerRunnerConfig, + ModuleSpec, +) from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.env.env_context import EnvContext from ray.rllib.env.multi_agent_env import MultiAgentEnv @@ -2607,9 +2611,25 @@ def get_default_rl_trainer_class(self) -> Union[Type["RLTrainer"], str]: raise NotImplementedError def get_trainer_runner_config( - self, observation_space: Space, action_space: Space + self, module_spec: Optional[ModuleSpec] = None ) -> TrainerRunnerConfig: + if module_spec is None: + module_spec = SingleAgentRLModuleSpec() + + if isinstance(module_spec, SingleAgentRLModuleSpec): + if module_spec.module_class is None: + module_spec.module_class = self.rl_module_class + + if module_spec.observation_space is None: + module_spec.observation_space = self.observation_space + + if module_spec.action_space is None: + module_spec.action_space = self.action_space + + if module_spec.model_config is None: + module_spec.model_config = self.model + if not self._is_frozen: raise ValueError( "Cannot call `get_trainer_runner_config()` on an unfrozen " @@ -2618,12 +2638,7 @@ def get_trainer_runner_config( config = ( TrainerRunnerConfig() - .module( - module_class=self.rl_module_class, - observation_space=observation_space, - action_space=action_space, - model_config=self.model, - ) + .module(module_spec) .trainer( trainer_class=self.rl_trainer_class, eager_tracing=self.eager_tracing, diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_config.py b/rllib/core/rl_trainer/tests/test_trainer_runner_config.py index 5c8202a9890a..acdc67731337 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_config.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_config.py @@ -2,10 +2,13 @@ import unittest import ray + +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.core.rl_trainer.trainer_runner_config import TrainerRunnerConfig from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer -from ray.rllib.algorithms.algorithm_config 
import AlgorithmConfig +from ray.rllib.core.testing.utils import get_module_spec class TestAlgorithmConfig(unittest.TestCase): @@ -24,12 +27,7 @@ def test_trainer_runner_build(self): config = ( TrainerRunnerConfig() - .module( - module_class=DiscreteBCTFModule, - observation_space=env.observation_space, - action_space=env.action_space, - model_config={"hidden_dim": 32}, - ) + .module(get_module_spec("tf", env)) .trainer( trainer_class=BCTfRLTrainer, ) @@ -50,7 +48,9 @@ def test_trainer_runner_build_from_algorithm_config(self): ) config.freeze() runner_config = config.get_trainer_runner_config( - env.observation_space, env.action_space + SingleAgentRLModuleSpec( + observation_space=env.observation_space, action_space=env.action_space + ) ) runner_config.build() diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index 4b04bead3fce..79a821173d2d 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -1,4 +1,6 @@ from typing import Type, Optional, TYPE_CHECKING, Union, Dict +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.utils.from_config import NotProvided from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner @@ -8,6 +10,7 @@ from ray.rllib.core.rl_trainer import RLTrainer import gymnasium as gym +ModuleSpec = Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] # TODO (Kourosh): We should make all configs come from a standard base class that # defines the general interfaces for validation, from_dict, to_dict etc. @@ -20,11 +23,7 @@ def __init__(self, cls: Type[TrainerRunner] = None) -> None: self.trainer_runner_class = cls or TrainerRunner # `self.module()` - self.module_obj = None - self.module_class = None - self.observation_space = None - self.action_space = None - self.model_config = None + self.module_spec = None # `self.trainer()` self.trainer_class = None @@ -40,30 +39,12 @@ def __init__(self, cls: Type[TrainerRunner] = None) -> None: def validate(self) -> None: - if self.module_class is None and self.module_obj is None: + if self.module_spec is None: raise ValueError( - "Cannot initialize an RLTrainer without an RLModule. Please provide " - "the RLModule class with .module(module_class=MyModuleClass) or " - "an RLModule instance with .module(module=MyModuleInstance)." + "Cannot initialize an RLTrainer without the module specs. Please provide " + "the specs via .module(module_spec)." ) - if self.module_class is not None: - if self.observation_space is None: - raise ValueError( - "Must provide observation_space for RLModule when RLModule class " - "is provided. Use .module(observation_space=MySpace)." - ) - if self.action_space is None: - raise ValueError( - "Must provide action_space for RLModule when RLModule class " - "is provided. Use .module(action_space=MySpace)." - ) - if self.model_config is None: - raise ValueError( - "Must provide model_config for RLModule when RLModule class " - "is provided. Use .module(model_config=MyConfig)." - ) - if self.trainer_class is None: raise ValueError( "Cannot initialize an RLTrainer without an RLTrainer. 
Please provide " @@ -86,17 +67,15 @@ def validate(self) -> None: def build(self) -> TrainerRunner: self.validate() + + # if the module class is a multi agent class it will override the default MultiAgentRLModule class + # otherwise, it will be a single agent wrapped with mutliagent # TODO (Kourosh): What should be scaling_config? it's not clear what # should be passed in as trainer_config and what will be inferred return self.trainer_runner_class( trainer_class=self.trainer_class, trainer_config={ - "module_class": self.module_class, - "module_kwargs": { - "observation_space": self.observation_space, - "action_space": self.action_space, - "model_config": self.model_config, - }, + "module_spec": self.module_spec, # TODO (Kourosh): should this be inferred inside the constructor? "distributed": self.num_gpus > 1, # TODO (Avnish): add this @@ -119,32 +98,33 @@ def algorithm( return self def module( + self, + module_spec: Optional[ModuleSpec] = NotProvided, + ) -> "TrainerRunnerConfig": + + if module_spec is not NotProvided: + self.module_spec = module_spec + + return self + + def multi_agent( self, *, - module_class: Optional[Type["RLModule"]] = NotProvided, - observation_space: Optional["gym.Space"] = NotProvided, - action_space: Optional["gym.Space"] = NotProvided, - model_config: Optional[dict] = NotProvided, - module: Optional["RLModule"] = NotProvided, + modules: Optional[Dict[str, "SingleAgentRLModuleConfig"]] = NotProvided, + marl_module_class: Optional[Type["MultiAgentRLModule"]] = NotProvided, ) -> "TrainerRunnerConfig": + pass - if module is NotProvided and module_class is NotProvided: - raise ValueError( - "Must provide either module or module_class. Please provide " - "the RLModule class with .module(module=MyModule) or " - ".module(module_class=MyModuleClass)." 
- ) + def resources( + self, + num_gpus: Optional[Union[float, int]] = NotProvided, + fake_gpus: Optional[bool] = NotProvided, + ) -> "TrainerRunnerConfig": - if module_class is not NotProvided: - self.module_class = module_class - if observation_space is not NotProvided: - self.observation_space = observation_space - if action_space is not NotProvided: - self.action_space = action_space - if model_config is not NotProvided: - self.model_config = model_config - if module is not NotProvided: - self.module_obj = module + if num_gpus is not NotProvided: + self.num_gpus = num_gpus + if fake_gpus is not NotProvided: + self.fake_gpus = fake_gpus return self @@ -164,16 +144,3 @@ def trainer( self.optimizer_config = optimizer_config return self - - def resources( - self, - num_gpus: Optional[Union[float, int]] = NotProvided, - fake_gpus: Optional[bool] = NotProvided, - ) -> "TrainerRunnerConfig": - - if num_gpus is not NotProvided: - self.num_gpus = num_gpus - if fake_gpus is not NotProvided: - self.fake_gpus = fake_gpus - - return self From 7c831d39c5e582bac5ba1ad83f6b0ef3a8e8757a Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 23 Jan 2023 23:02:33 -0800 Subject: [PATCH 052/112] Algorithm can now construct the trainer_runner based on the policy_maps created on local_worker Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm.py | 33 ++++++++++++++++++------------- rllib/core/rl_module/rl_module.py | 17 ++++++++++++++++ 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 1c3802cbf116..b5672c32f1fb 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -35,6 +35,9 @@ import ray.cloudpickle as pickle from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec, MultiAgentRLModule + from ray.rllib.connectors.agent.obs_preproc import ObsPreprocessorConnector from ray.rllib.algorithms.registry import ALGORITHMS as ALL_ALGORITHMS from ray.rllib.env.env_context import EnvContext @@ -674,25 +677,27 @@ def setup(self, config: AlgorithmConfig) -> None: # Need to add back method_type in case Algorithm is restored from checkpoint method_config["type"] = method_type - if self.config._enable_rl_module_api: + self.trainer_runner = None + if self.config._enable_rl_trainer_api: + # TODO (Kourosh): This is an interim solution where policies and modules co-exist. + # In this world we have both policy_map and MARLModule that need to be + # consistent with one another. To make a consistent parity between the two we + # need to loop throught the policy modules and create a simple MARLModule + # from the RLModule within each policy. 
local_worker = self.workers.local_worker() - marl_config = {"modules": {}} + module_specs = {} for pid, policy in local_worker.policy_map.items(): - marl_config["modules"][pid] = { - "module_class": policy.config["rl_module_class"], - "observation_space": policy.observation_space, - "action_space": policy.action_space, - "model_config": policy.config["model"], - } - - from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule + module_specs[pid] = SingleAgentRLModuleSpec( + module_class=policy.config["rl_module_class"], + observation_space=policy.observation_space, + action_space=policy.action_space, + model_config=policy.config["model"], + ) - marl = MultiAgentRLModule.from_multi_agent_config(marl_config) + module_spec = MultiAgentRLModuleSpec(module_class=MultiAgentRLModule, module_specs=module_specs) - self.trainer_runner = None - if self.config._enable_rl_trainer_api: - trainer_runner_config = self.config.get_trainer_runner_config() + trainer_runner_config = self.config.get_trainer_runner_config(module_spec) self.trainer_runner = trainer_runner_config.build() # Run `on_algorithm_init` callback after initialization is done. diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index 3d60fb62ee58..430815cf2314 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -48,6 +48,23 @@ def build(self) -> "RLModule": model_config=self.model_config, ) +@ExperimentalAPI +@dataclass +class RLModuleConfig: + """Configuration for the PPO module. + # TODO (Kourosh): Whether we need this or not really depends on how the catalog + # design end up being. + Attributes: + observation_space: The observation space of the environment. + action_space: The action space of the environment. + max_seq_len: Max seq len for training an RNN model. + (TODO (Kourosh) having max_seq_len here seems a bit unnatural, can we rethink + this design?) 
+ """ + + observation_space: gym.Space = None + action_space: gym.Space = None + max_seq_len: int = None @ExperimentalAPI class RLModule(abc.ABC): From d3d610e6f717b3963db5771e1f979a2e40e8b755 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 23 Jan 2023 23:11:13 -0800 Subject: [PATCH 053/112] lint and clean up Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm.py | 19 +++++++++++------- rllib/core/rl_module/marl_module.py | 6 +++--- rllib/core/rl_module/rl_module.py | 2 ++ .../core/rl_trainer/tests/test_rl_trainer.py | 1 - .../tests/test_trainer_runner_local.py | 5 +---- .../core/rl_trainer/trainer_runner_config.py | 20 ++++++------------- rllib/core/testing/utils.py | 5 +++-- 7 files changed, 27 insertions(+), 31 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index b5672c32f1fb..5f68a2850589 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -36,7 +36,10 @@ from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec, MultiAgentRLModule +from ray.rllib.core.rl_module.marl_module import ( + MultiAgentRLModuleSpec, + MultiAgentRLModule, +) from ray.rllib.connectors.agent.obs_preproc import ObsPreprocessorConnector from ray.rllib.algorithms.registry import ALGORITHMS as ALL_ALGORITHMS @@ -679,11 +682,11 @@ def setup(self, config: AlgorithmConfig) -> None: self.trainer_runner = None if self.config._enable_rl_trainer_api: - # TODO (Kourosh): This is an interim solution where policies and modules co-exist. - # In this world we have both policy_map and MARLModule that need to be - # consistent with one another. To make a consistent parity between the two we - # need to loop throught the policy modules and create a simple MARLModule - # from the RLModule within each policy. + # TODO (Kourosh): This is an interim solution where policies and modules + # co-exist. In this world we have both policy_map and MARLModule that need + # to be consistent with one another. To make a consistent parity between + # the two we need to loop throught the policy modules and create a simple + # MARLModule from the RLModule within each policy. local_worker = self.workers.local_worker() module_specs = {} @@ -695,7 +698,9 @@ def setup(self, config: AlgorithmConfig) -> None: model_config=policy.config["model"], ) - module_spec = MultiAgentRLModuleSpec(module_class=MultiAgentRLModule, module_specs=module_specs) + module_spec = MultiAgentRLModuleSpec( + module_class=MultiAgentRLModule, module_specs=module_specs + ) trainer_runner_config = self.config.get_trainer_runner_config(module_spec) self.trainer_runner = trainer_runner_config.build() diff --git a/rllib/core/rl_module/marl_module.py b/rllib/core/rl_module/marl_module.py index a6b625a9b382..9319795184e1 100644 --- a/rllib/core/rl_module/marl_module.py +++ b/rllib/core/rl_module/marl_module.py @@ -155,7 +155,8 @@ def from_multi_agent_config(cls, config: Mapping[str, Any]) -> "MultiAgentRLModu def __check_module_configs(cls, module_configs: Dict[ModuleID, Any]): """Checks the module configs for validity. - The module_configs be a mapping from module_ids to SingleAgentRLModuleSpec objects. + The module_configs be a mapping from module_ids to SingleAgentRLModuleSpec + objects. Args: module_configs: The module configs to check. 
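For readers tracking the spec refactor across these commits, a minimal sketch of how the pieces are meant to fit together is shown below: one SingleAgentRLModuleSpec per module id, collected into a MultiAgentRLModuleSpec, which is exactly the shape that __check_module_configs above validates. This is an illustrative sketch only, not code from any commit in this series; the DiscreteBCTorchModule import path and the "default_policy" module id are assumptions based on the testing utilities and DEFAULT_POLICY_ID seen earlier.

import gymnasium as gym

from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.core.rl_module.marl_module import (
    MultiAgentRLModule,
    MultiAgentRLModuleSpec,
)

# Assumed import path for the toy BC module used throughout these tests.
from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule

env = gym.make("CartPole-v1")

# One spec per module id (policy id), mirroring how Algorithm.setup() builds
# module_specs from the local worker's policy_map in the hunk above.
module_specs = {
    "default_policy": SingleAgentRLModuleSpec(
        module_class=DiscreteBCTorchModule,
        observation_space=env.observation_space,
        action_space=env.action_space,
        model_config={"hidden_dim": 32},
    ),
}

# The multi-agent spec wraps the per-module specs; __check_module_configs
# rejects any value that is not a SingleAgentRLModuleSpec.
marl_spec = MultiAgentRLModuleSpec(
    module_class=MultiAgentRLModule, module_specs=module_specs
)

Under this layout, RLTrainer.add_module() can call build() on a single-agent spec (as in the rl_trainer.py hunk earlier) to construct the concrete RLModule.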
@@ -166,8 +167,7 @@ def __check_module_configs(cls, module_configs: Dict[ModuleID, Any]): for module_id, module_spec in module_configs.items(): if not isinstance(module_spec, SingleAgentRLModuleSpec): raise ValueError( - f"Module config for module_id {module_id} is missing " - f"required key {module_key}." + f"Module {module_id} is not a SingleAgentRLModuleSpec object." ) def keys(self) -> Iterator[ModuleID]: diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index 430815cf2314..26dc32ced2a1 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -48,6 +48,7 @@ def build(self) -> "RLModule": model_config=self.model_config, ) + @ExperimentalAPI @dataclass class RLModuleConfig: @@ -66,6 +67,7 @@ class RLModuleConfig: action_space: gym.Space = None max_seq_len: int = None + @ExperimentalAPI class RLModule(abc.ABC): """Base class for RLlib modules. diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index f6c6c954c554..8af9b0ae7245 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -11,7 +11,6 @@ from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader -from ray.rllib.core.testing.utils import add_module_to_runner_or_trainer def get_trainer(distributed=False) -> RLTrainer: diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py index 3058f3c86f35..9986cf98dd3d 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py @@ -1,14 +1,11 @@ import gymnasium as gym import unittest -from ray.rllib.utils.framework import try_import_tf import ray -from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner -from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule -from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader +from ray.rllib.utils.framework import try_import_tf from ray.rllib.core.testing.utils import ( add_module_to_runner_or_trainer, get_trainer_runner, diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index 79a821173d2d..d193e7cbb1f8 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -6,12 +6,11 @@ if TYPE_CHECKING: from ray.rllib.algorithms.algorithm_config import AlgorithmConfig - from ray.rllib.core.rl_module import RLModule from ray.rllib.core.rl_trainer import RLTrainer - import gymnasium as gym ModuleSpec = Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] + # TODO (Kourosh): We should make all configs come from a standard base class that # defines the general interfaces for validation, from_dict, to_dict etc. class TrainerRunnerConfig: @@ -41,8 +40,8 @@ def validate(self) -> None: if self.module_spec is None: raise ValueError( - "Cannot initialize an RLTrainer without the module specs. Please provide " - "the specs via .module(module_spec)." + "Cannot initialize an RLTrainer without the module specs. " + "Please provide the specs via .module(module_spec)." 
) if self.trainer_class is None: @@ -68,8 +67,9 @@ def validate(self) -> None: def build(self) -> TrainerRunner: self.validate() - # if the module class is a multi agent class it will override the default MultiAgentRLModule class - # otherwise, it will be a single agent wrapped with mutliagent + # If the module class is a multi agent class it will override the default + # MultiAgentRLModule class. otherwise, it will be a single agent wrapped with + # mutliagent # TODO (Kourosh): What should be scaling_config? it's not clear what # should be passed in as trainer_config and what will be inferred return self.trainer_runner_class( @@ -107,14 +107,6 @@ def module( return self - def multi_agent( - self, - *, - modules: Optional[Dict[str, "SingleAgentRLModuleConfig"]] = NotProvided, - marl_module_class: Optional[Type["MultiAgentRLModule"]] = NotProvided, - ) -> "TrainerRunnerConfig": - pass - def resources( self, num_gpus: Optional[Union[float, int]] = NotProvided, diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 6530f04680ef..d423621076da 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -5,7 +5,7 @@ from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner -from rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec, MultiAgentRLModule from rllib.core.rl_module.tests.test_marl_module import DEFAULT_POLICY_ID if TYPE_CHECKING: @@ -59,7 +59,8 @@ def get_module_spec(framework: str, env: "gym.Env", is_multi_agent: bool = False ) if is_multi_agent: - # TODO (Kourosh): Make this more multi-agent for example with policy ids "1", and "2". + # TODO (Kourosh): Make this more multi-agent for example with policy ids "1", + # and "2". 
return MultiAgentRLModuleSpec( module_class=MultiAgentRLModule, module_specs={DEFAULT_POLICY_ID: spec} ) From 97acdb7afef60f51fa0df9e84e669bef7e0f8e9c Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jan 2023 08:58:52 -0800 Subject: [PATCH 054/112] fixed imports Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/testing/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index d423621076da..bd96492bac50 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -5,8 +5,11 @@ from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner -from rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec, MultiAgentRLModule -from rllib.core.rl_module.tests.test_marl_module import DEFAULT_POLICY_ID +from ray.rllib.core.rl_module.marl_module import ( + MultiAgentRLModuleSpec, + MultiAgentRLModule, +) +from ray.rllib.core.rl_module.tests.test_marl_module import DEFAULT_POLICY_ID if TYPE_CHECKING: import gymnasium as gym From 93f3ce96ab3a861c876212eddae4d6b17a9d9aba Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jan 2023 10:21:51 -0800 Subject: [PATCH 055/112] fixed the unittest for ppo_rl_trainer Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py index 2f2b09520760..7ce198bb0c2b 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py @@ -2,6 +2,7 @@ import ray import unittest import numpy as np +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec import torch import tree # pip install dm-tree @@ -88,7 +89,7 @@ def test_loss(self): config.validate() config.freeze() trainer_runner_config = config.get_trainer_runner_config( - policy.observation_space, policy.action_space + SingleAgentRLModuleSpec(observation_space=policy.observation_space, action_space=policy.action_space) ) trainer_runner = trainer_runner_config.build() From c58293a0d54885767a753ba3bf4f5968b3ea56cc Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jan 2023 17:56:05 -0800 Subject: [PATCH 056/112] got the PPO POC running Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm.py | 4 ++ rllib/algorithms/ppo/ppo.py | 49 ++++++++++++++----- .../ppo/tests/test_ppo_rl_trainer.py | 20 ++++---- .../ppo/torch/ppo_torch_rl_trainer.py | 14 ++++-- rllib/core/rl_trainer/rl_trainer.py | 8 +++ rllib/core/rl_trainer/tf/tf_rl_trainer.py | 8 +++ .../core/rl_trainer/torch/torch_rl_trainer.py | 19 ++++--- rllib/core/rl_trainer/trainer_runner.py | 15 ++++++ 8 files changed, 103 insertions(+), 34 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 5f68a2850589..2264887e9cb4 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -705,6 +705,10 @@ def setup(self, config: AlgorithmConfig) -> None: trainer_runner_config = self.config.get_trainer_runner_config(module_spec) self.trainer_runner = trainer_runner_config.build() + weights = local_worker.get_weights() + # we need to create marl module weights + self.trainer_runner.set_weights(weights) + # Run `on_algorithm_init` callback after initialization is done. 
self.callbacks.on_algorithm_init(algorithm=self) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 462bae0d7e54..b63e74c6ffc9 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -42,6 +42,8 @@ if TYPE_CHECKING: from ray.rllib.core.rl_module import RLModule + from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer + logger = logging.getLogger(__name__) @@ -135,7 +137,7 @@ def get_default_rl_module_class(self) -> Union[Type["RLModule"], str]: def get_default_rl_trainer_class(self) -> Union[Type["RLTrainer"], str]: if self.framework_str == "torch": from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_trainer import ( - PPOTorchRLTrainer + PPOTorchRLTrainer, ) return PPOTorchRLTrainer @@ -377,14 +379,31 @@ def training_step(self) -> ResultDict: train_batch = standardize_fields(train_batch, ["advantages"]) # Train if self.config._enable_rl_trainer_api: - train_results = self.trainer_runner.update(train_batch) + # TODO (Kourosh) Clearly define what train_batch_size + # vs. sgd_minibatch_size and num_sgd_iter is. + # TODO (Kourosh) Do this inside the RL Trainer so + # that we don't have to this back and forth + # communication between driver and the remote + # trainer workers + for epoch in range(self.config.num_sgd_iter): + # bsize = self.config.sgd_minibatch_size + # for minibatch in SampleBatchLoader(train_batch, batch_size=bsize): + # train_results = self.trainer_runner.update(minibatch) + # TODO (Kourosh) The output of trainer_runner.update() should be + # one item not a list of items + train_results = self.trainer_runner.update(train_batch)[0] elif self.config.simple_optimizer: train_results = train_one_step(self, train_batch) else: train_results = multi_gpu_train_one_step(self, train_batch) - policies_to_update = list(train_results.keys()) + if self.config._enable_rl_trainer_api: + policies_to_update = set(train_results["loss"].keys()) - {"total_loss"} + else: + policies_to_update = list(train_results.keys()) + # TODO (Kourosh): num_grad_updates per each policy should be accessible via + # train_results global_vars = { "timestep": self._counters[NUM_AGENT_STEPS_SAMPLED], "num_grad_updates_per_policy": { @@ -395,24 +414,32 @@ def training_step(self) -> ResultDict: # Update weights - after learning on the local worker - on all remote # workers. - if self.workers.num_remote_workers() > 0: - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - from_worker = None - if self.config._enable_rl_trainer_api: - from_worker = self.trainer_runner + with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: + from_worker = None + if self.config._enable_rl_trainer_api: + # sync from trainer_runner to all rollout workers + from_worker = self.trainer_runner + + if self.workers.num_remote_workers() > 0: self.workers.sync_weights( from_worker=from_worker, policies=list(train_results.keys()), global_vars=global_vars, ) + elif self.config._enable_rl_trainer_api: + weights = self.trainer_runner.get_weights() + self.workers.local_worker().set_weights(weights) if self.config._enable_rl_trainer_api: kl_dict = { - pid: pinfo[LEARNER_STATS_KEY].get("kl") - for pid, pinfo in train_results.items() + pid: train_results["loss"][pid].get("mean_kl_loss") + for pid in policies_to_update } # triggers a special update method on RLOptimizer to update the KL values. 
- self.trainer_runner.additional_update(kl_values=kl_dict) + self.trainer_runner.additional_update( + sampled_kl_values=kl_dict, + timestep=self._counters[NUM_AGENT_STEPS_SAMPLED], + ) return train_results diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py index 7ce198bb0c2b..c1faec1f2f31 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py @@ -1,10 +1,9 @@ - import ray import unittest import numpy as np from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec import torch -import tree # pip install dm-tree +import tree # pip install dm-tree import ray.rllib.algorithms.ppo as ppo from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_trainer import PPOTorchRLTrainer @@ -29,9 +28,9 @@ SampleBatch.TERMINATEDS: np.array([False, False, True]), SampleBatch.TRUNCATEDS: np.array([False, False, False]), SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32), - SampleBatch.ACTION_DIST_INPUTS: {"logits": np.array( - [[-2.0, 0.5], [-3.0, -0.3], [-0.1, 2.5]], dtype=np.float32 - )}, + SampleBatch.ACTION_DIST_INPUTS: { + "logits": np.array([[-2.0, 0.5], [-3.0, -0.3], [-0.1, 2.5]], dtype=np.float32) + }, SampleBatch.ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32), SampleBatch.EPS_ID: np.array([0, 0, 0]), SampleBatch.AGENT_INDEX: np.array([0, 0, 0]), @@ -47,7 +46,6 @@ def setUpClass(cls): def tearDownClass(cls): ray.shutdown() - def test_loss(self): config = ( @@ -83,13 +81,14 @@ def test_loss(self): policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) - config.training( - _enable_rl_trainer_api=True - ) + config.training(_enable_rl_trainer_api=True) config.validate() config.freeze() trainer_runner_config = config.get_trainer_runner_config( - SingleAgentRLModuleSpec(observation_space=policy.observation_space, action_space=policy.action_space) + SingleAgentRLModuleSpec( + observation_space=policy.observation_space, + action_space=policy.action_space, + ) ) trainer_runner = trainer_runner_config.build() @@ -104,7 +103,6 @@ def test_loss(self): check(trainer_runner_loss, policy_loss) - if __name__ == "__main__": import pytest import sys diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index 6aab20373d89..adc5dbd9b932 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -1,9 +1,9 @@ import logging -from typing import Mapping, Any, Union +from typing import Mapping, Any from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_utils import ( explained_variance, @@ -14,11 +14,13 @@ logger = logging.getLogger(__name__) + class PPOTorchRLTrainer(TorchRLTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # TODO (Kourosh): Move these failures to config.validate() or support them. 
+ self.entropy_coeff_scheduler = None if self.config.entropy_coeff_schedule: raise ValueError("entropy_coeff_schedule is not supported in RLTrainer yet") @@ -46,7 +48,9 @@ def _compute_loss_per_module( curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] action_dist_class = type(fwd_out[SampleBatch.ACTION_DIST]) - prev_action_dist = action_dist_class(**batch[SampleBatch.ACTION_DIST_INPUTS].asdict()) + prev_action_dist = action_dist_class( + **batch[SampleBatch.ACTION_DIST_INPUTS].asdict() + ) logp_ratio = torch.exp( fwd_out[SampleBatch.ACTION_LOGP] - batch[SampleBatch.ACTION_LOGP] @@ -123,9 +127,9 @@ def _additional_update_per_module( # TODO (Kourosh): We may want to index into the schedulers to get the right one # for this module if self.entropy_coeff_scheduler is not None: - self.entropy_coeff_scheduleler.update(timestep) + self.entropy_coeff_scheduler.update(timestep) if self.lr_scheduler is not None: - self.lr_scheduleler.update(timestep) + self.lr_scheduler.update(timestep) return results diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 8969cab69a1e..e3e8bbc2e475 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -408,6 +408,14 @@ def apply_gradients(self, gradients: Dict[ParamRef, TensorType]) -> None: gradients: A dictionary of gradients. """ + @abc.abstractmethod + def get_weights(self) -> Mapping[str, Any]: + """Returns the state of the underlying MultiAgentRLModule""" + + @abc.abstractmethod + def set_weights(self, weights: Mapping[str, Any]) -> None: + """Sets the state of the underlying MultiAgentRLModule""" + def set_state(self, state: Mapping[str, Any]) -> None: """Set the state of the trainer. diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 44a8f6d21581..d17ac72a3faa 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -264,6 +264,14 @@ def convert_batch_to_tf_tensor(self, batch: MultiAgentBatch) -> NestedDict: batch[key] = tf.convert_to_tensor(value, dtype=tf.float32) return batch + def get_weights(self) -> Mapping[str, Any]: + # TODO (Kourosh) Implement this. + raise NotImplementedError + + def set_weights(self, weights: Mapping[str, Any]) -> None: + # TODO (Kourosh) Implement this. 
+ raise NotImplementedError + @override(RLTrainer) def get_parameters(self, module: RLModule) -> Sequence[ParamType]: return module.trainable_variables diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index b8313b47a584..ee0caa219c1c 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -30,6 +30,7 @@ from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override +from ray.rllib.utils.torch_utils import convert_to_torch_tensor from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.framework import try_import_torch @@ -157,13 +158,8 @@ def _make_distributed_module(self) -> MultiAgentRLModule: @override(RLTrainer) def _convert_batch_type(self, batch: MultiAgentBatch): - batch = NestedDict(batch.policy_batches) - batch = NestedDict( - { - k: torch.as_tensor(v, dtype=torch.float32, device=self._device) - for k, v in batch.items() - } - ) + batch = convert_to_torch_tensor(batch.policy_batches, device=self._device) + batch = NestedDict(batch) return batch @override(RLTrainer) @@ -171,6 +167,15 @@ def do_distributed_update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: # in torch the distributed update is no different than the normal update return self._update(batch) + def get_weights(self) -> Mapping[str, Any]: + """Returns the state of the underlying MultiAgentRLModule""" + return self._module.get_state() + + def set_weights(self, weights: Mapping[str, Any]) -> None: + """Sets the state of the underlying MultiAgentRLModule""" + weights = convert_to_torch_tensor(weights, device=self._device) + return self._module.set_state(weights) + @override(RLTrainer) def get_param_ref(self, param: ParamType) -> Hashable: return param diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index c6fcda3deacf..1faef6271aea 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -230,6 +230,21 @@ def get_weight(self) -> Dict: # TODO (Avnish): implement this. 
pass + def set_weights(self, weights) -> None: + # TODO (Kourosh) Set / get weight has to be thoroughly + # tested across actors and multi-gpus + if self._distributed: + ray.get([worker.set_weights(weights) for worker in self._workers]) + else: + self._trainer.set_weights(weights) + + def get_weights(self) -> Mapping[str, Any]: + if self._distributed: + worker = next(iter(self._workers)) + return ray.get(worker.get_weights()) + else: + return self._trainer.get_weights() + def get_state(self) -> List[Mapping[ModuleID, Mapping[str, Any]]]: """Get the states of the RLTrainers""" if self._distributed: From 77ff5850f307cc796ec82d2a5bb16ef3590e46db Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jan 2023 18:05:58 -0800 Subject: [PATCH 057/112] lint Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm.py | 2 +- rllib/algorithms/ppo/ppo.py | 9 ++++----- rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py | 1 - 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 2813e3c90af6..b42cb2b4bf04 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -714,8 +714,8 @@ def setup(self, config: AlgorithmConfig) -> None: trainer_runner_config = self.config.get_trainer_runner_config(module_spec) self.trainer_runner = trainer_runner_config.build() + # sync the weights from rollout workers to trainers weights = local_worker.get_weights() - # we need to create marl module weights self.trainer_runner.set_weights(weights) # Run `on_algorithm_init` callback after initialization is done. diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index b63e74c6ffc9..72f65a49a047 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -415,12 +415,11 @@ def training_step(self) -> ResultDict: # Update weights - after learning on the local worker - on all remote # workers. 
with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - from_worker = None - if self.config._enable_rl_trainer_api: - # sync from trainer_runner to all rollout workers - from_worker = self.trainer_runner - if self.workers.num_remote_workers() > 0: + from_worker = None + if self.config._enable_rl_trainer_api: + # sync weights from trainer_runner to all rollout workers + from_worker = self.trainer_runner self.workers.sync_weights( from_worker=from_worker, policies=list(train_results.keys()), diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py index c1faec1f2f31..ee5362ca06bf 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py @@ -6,7 +6,6 @@ import tree # pip install dm-tree import ray.rllib.algorithms.ppo as ppo -from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_trainer import PPOTorchRLTrainer from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.torch_utils import convert_to_torch_tensor from ray.rllib.utils.test_utils import check From 87bda0181a8a1625685d4dc8ec3b7c62700a4272 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jan 2023 21:55:44 -0800 Subject: [PATCH 058/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/trainer_runner.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 1faef6271aea..2d9951e248e6 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -98,8 +98,25 @@ def __init__( self._trainer = trainer_class(**trainer_config) self._trainer.build() + + def fit(self, batch: MultiAgentBatch, minibatch_size: int, num_iters: int) -> Mapping[str, Any]: + """Do `num_iters` minibatch updates given the original batch. + + Given a batch of episodes you can use this method to take more + than one backward pass on the batch. + + Args: + batch: + minibatch_size: + num_iters: + + Returns: + The training statistics of this fitting round. + """ + + def update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: - """Do a gradient based update to the RLTrainer(s) maintained by this TrainerRunner. + """Do one gradient based update to the RLTrainer(s) maintained by this TrainerRunner. Args: batch: The data to use for the update. From 6346a20f01e223885f69da9e89bb0498a009b1b7 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jan 2023 23:22:24 -0800 Subject: [PATCH 059/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/ppo/ppo.py | 23 ++++++---- rllib/core/rl_trainer/trainer_runner.py | 59 ++++++++++++++++++++----- 2 files changed, 62 insertions(+), 20 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 72f65a49a047..b63bed35fd0b 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -380,18 +380,25 @@ def training_step(self) -> ResultDict: # Train if self.config._enable_rl_trainer_api: # TODO (Kourosh) Clearly define what train_batch_size - # vs. sgd_minibatch_size and num_sgd_iter is. + # vs. sgd_minibatch_size and num_sgd_iter is in the config. 
# TODO (Kourosh) Do this inside the RL Trainer so # that we don't have to this back and forth # communication between driver and the remote # trainer workers - for epoch in range(self.config.num_sgd_iter): - # bsize = self.config.sgd_minibatch_size - # for minibatch in SampleBatchLoader(train_batch, batch_size=bsize): - # train_results = self.trainer_runner.update(minibatch) - # TODO (Kourosh) The output of trainer_runner.update() should be - # one item not a list of items - train_results = self.trainer_runner.update(train_batch)[0] + + train_results = self.trainer_runner.fit( + train_batch, + minibatch_size=self.config.sgd_minibatch_size, + num_iters=self.config.num_sgd_iter, + ) + # for epoch in range(self.config.num_sgd_iter): + # # bsize = self.config.sgd_minibatch_size + # # for minibatch in SampleBatchLoader(train_batch, batch_size=bsize): + # # train_results = self.trainer_runner.update(minibatch) + # # TODO (Kourosh) The output of trainer_runner.update() should be + # # one item not a list of items + + # train_results = self.trainer_runner.update(train_batch)[0] elif self.config.simple_optimizer: train_results = train_one_step(self, train_batch) else: diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 2d9951e248e6..37c009e7c461 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -1,8 +1,11 @@ import math from typing import Any, List, Mapping, Type, Optional, Callable, Dict +import tree # pip install dm-tree +import numpy as np import ray +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.core.rl_module.rl_module import ( RLModule, ModuleID, @@ -98,22 +101,54 @@ def __init__( self._trainer = trainer_class(**trainer_config) self._trainer.build() - - def fit(self, batch: MultiAgentBatch, minibatch_size: int, num_iters: int) -> Mapping[str, Any]: + def fit( + self, batch: MultiAgentBatch, minibatch_size: int, num_iters: int + ) -> Mapping[str, Any]: """Do `num_iters` minibatch updates given the original batch. - - Given a batch of episodes you can use this method to take more - than one backward pass on the batch. + + Given a batch of episodes you can use this method to take more + than one backward pass on the batch. The same minibatch_size and num_iters gets will be used for all module ids (previously known as policies) in the multiagent batch Args: - batch: - minibatch_size: - num_iters: - + batch: The data to use for the update. + minibatch_size: The size of the minibatch to use for each update. + num_iters: The number of minibatch updates to perform. + Returns: - The training statistics of this fitting round. + A dictionary of results summarizing the statistics of the updates. """ - + + start = {mid: 0 for mid in batch.policy_batches.keys()} + results = [] + for _ in range(num_iters): + minibatch = {} + for module_id, module_batch in batch.policy_batches.items(): + s = start[module_id] + e = s + minibatch_size + + samples_to_concat = [] + # cycle through the batch until we have enough samples + while e > len(module_batch): + samples_to_concat.append(module_batch[s:]) + e = minibatch_size - len(module_batch[s:]) + s = 0 + + samples_to_concat.append(module_batch[s:e]) + + # concatenate all the samples, we should have minibatch_size of sample + # after this step + minibatch[module_id] = SampleBatch.concat_samples(samples_to_concat) + # roll back to zero when we reach the end of the batch + start[module_id] = e + + # TODO (Kourosh): len(batch) is not correct here. 
However it's also not + # clear what the correct value should be. Since training does not depend on + # this it will be fine for now. + minibatch = MultiAgentBatch(minibatch, len(batch)) + results.append(self.update(minibatch)) + + # return the average of the results using tree map + return tree.map_structure(lambda *x: np.mean(x), *results) def update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: """Do one gradient based update to the RLTrainer(s) maintained by this TrainerRunner. @@ -145,7 +180,6 @@ def _distributed_update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]] """ refs = [] global_size = len(self._workers) - batch_size = math.ceil(len(batch) / global_size) for i, worker in enumerate(self._workers): batch_to_send = {} for pid, sub_batch in batch.policy_batches.items(): @@ -153,6 +187,7 @@ def _distributed_update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]] start = batch_size * i end = min(start + batch_size, len(sub_batch)) batch_to_send[pid] = sub_batch[int(start) : int(end)] + # TODO (Avnish): int(batch_size) ? How should we shard MA batches really? new_batch = MultiAgentBatch(batch_to_send, int(batch_size)) refs.append(worker.update.remote(new_batch)) From eb5106f77164ed3ff3d3fad943966022edb649cc Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 25 Jan 2023 00:13:55 -0800 Subject: [PATCH 060/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/trainer_runner.py | 27 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 37c009e7c461..5e1b2575f3e9 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -5,7 +5,7 @@ import ray -from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import concat_samples from ray.rllib.core.rl_module.rl_module import ( RLModule, ModuleID, @@ -112,35 +112,40 @@ def fit( Args: batch: The data to use for the update. minibatch_size: The size of the minibatch to use for each update. - num_iters: The number of minibatch updates to perform. + num_iters: The number of complete passes over all the sub-batches + in the input multi-agent batch. Returns: A dictionary of results summarizing the statistics of the updates. 
""" start = {mid: 0 for mid in batch.policy_batches.keys()} + num_covered_epochs = {mid: 0 for mid in batch.policy_batches.keys()} results = [] - for _ in range(num_iters): + # loop until the number of passes through all modules batches reaches the num_iters + while min(num_covered_epochs.values()) < num_iters: minibatch = {} for module_id, module_batch in batch.policy_batches.items(): - s = start[module_id] - e = s + minibatch_size + s = start[module_id] # start + e = s + minibatch_size # end samples_to_concat = [] # cycle through the batch until we have enough samples - while e > len(module_batch): + while e >= len(module_batch): samples_to_concat.append(module_batch[s:]) e = minibatch_size - len(module_batch[s:]) s = 0 + num_covered_epochs[module_id] += 1 samples_to_concat.append(module_batch[s:e]) # concatenate all the samples, we should have minibatch_size of sample # after this step - minibatch[module_id] = SampleBatch.concat_samples(samples_to_concat) - # roll back to zero when we reach the end of the batch + minibatch[module_id] = concat_samples(samples_to_concat) + # roll miniback to zero when we reach the end of the batch start[module_id] = e + # TODO (Kourosh): len(batch) is not correct here. However it's also not # clear what the correct value should be. Since training does not depend on # this it will be fine for now. @@ -162,7 +167,7 @@ def update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: if self._distributed: return self._distributed_update(batch) else: - return [self._trainer.update(batch)] + return self._trainer.update(batch) def _distributed_update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: """Do a gradient based update to the RLTrainers using DDP training. @@ -191,7 +196,9 @@ def _distributed_update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]] new_batch = MultiAgentBatch(batch_to_send, int(batch_size)) refs.append(worker.update.remote(new_batch)) - return ray.get(refs) + results = ray.get(refs) + # take an average across the result of all actors + return tree.map_structure(lambda *x: np.mean(x), *results) def additional_update(self, *args, **kwargs) -> List[Mapping[str, Any]]: """Apply additional non-gradient based updates to the RLTrainers. From b2c01ade1131485d01c3191f92fbfaa9d7fca6f5 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 25 Jan 2023 14:43:15 -0800 Subject: [PATCH 061/112] multi-gpu test works now Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/trainer_runner.py | 13 ++++++------- rllib/policy/policy.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 5e1b2575f3e9..ff68a9d07607 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -112,7 +112,7 @@ def fit( Args: batch: The data to use for the update. minibatch_size: The size of the minibatch to use for each update. - num_iters: The number of complete passes over all the sub-batches + num_iters: The number of complete passes over all the sub-batches in the input multi-agent batch. 
Returns: @@ -122,7 +122,7 @@ def fit( start = {mid: 0 for mid in batch.policy_batches.keys()} num_covered_epochs = {mid: 0 for mid in batch.policy_batches.keys()} results = [] - # loop until the number of passes through all modules batches reaches the num_iters + # loop until the number of passes through all modules batches reaches the num_iters while min(num_covered_epochs.values()) < num_iters: minibatch = {} for module_id, module_batch in batch.policy_batches.items(): @@ -145,7 +145,6 @@ def fit( # roll miniback to zero when we reach the end of the batch start[module_id] = e - # TODO (Kourosh): len(batch) is not correct here. However it's also not # clear what the correct value should be. Since training does not depend on # this it will be fine for now. @@ -293,14 +292,14 @@ def set_weights(self, weights) -> None: # TODO (Kourosh) Set / get weight has to be thoroughly # tested across actors and multi-gpus if self._distributed: - ray.get([worker.set_weights(weights) for worker in self._workers]) + ray.get([worker.set_weights.remote(weights) for worker in self._workers]) else: self._trainer.set_weights(weights) def get_weights(self) -> Mapping[str, Any]: if self._distributed: worker = next(iter(self._workers)) - return ray.get(worker.get_weights()) + return ray.get(worker.get_weights.remote()) else: return self._trainer.get_weights() @@ -310,9 +309,9 @@ def get_state(self) -> List[Mapping[ModuleID, Mapping[str, Any]]]: refs = [] for worker in self._workers: refs.append(worker.get_state.remote()) - return ray.get(refs) + return ray.get(refs)[0] else: - return [self._trainer.get_state()] + return self._trainer.get_state() def set_state(self, state: List[Mapping[ModuleID, Mapping[str, Any]]]) -> None: """Sets the states of the RLTrainers. diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index f53f0561c131..667d116698f1 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1193,6 +1193,7 @@ def _get_num_gpus_for_policy(self) -> int: """ worker_idx = self.config.get("worker_index", 0) fake_gpus = self.config.get("_fake_gpus", False) + if ( ray._private.worker._mode() == ray._private.worker.LOCAL_MODE and not fake_gpus @@ -1200,8 +1201,13 @@ def _get_num_gpus_for_policy(self) -> int: # If in local debugging mode, and _fake_gpus is not on. num_gpus = 0 elif worker_idx == 0: - # If head node, take num_gpus. - num_gpus = self.config["num_gpus"] + # if we are in the new trainer wold num_gpus is only for trainer + # so use num_gpus_per_worker for policy sampling + if self.config["_enable_rl_trainer_api"]: + num_gpus = self.config["num_gpus_per_worker"] + else: + # If head node, take num_gpus. + num_gpus = self.config["num_gpus"] else: # If worker node, take num_gpus_per_worker num_gpus = self.config["num_gpus_per_worker"] From ea3d9c60a3c9d00a483577ddf53caf9a495b996b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 25 Jan 2023 15:06:42 -0800 Subject: [PATCH 062/112] removed left out api get_weight() Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/trainer_runner.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index ff68a9d07607..03472ce41eaf 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -279,15 +279,6 @@ def remove_module(self, module_id: ModuleID) -> None: else: self._trainer.remove_module(module_id) - def get_weight(self) -> Dict: - """Get the weights of the MARLModule. 
- - Returns: - The weights of the neural networks that can be exchanged with the policy. - """ - # TODO (Avnish): implement this. - pass - def set_weights(self, weights) -> None: # TODO (Kourosh) Set / get weight has to be thoroughly # tested across actors and multi-gpus From 45830c6a46e9c96710cb6a63f9617f5d45a21df5 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 25 Jan 2023 16:48:54 -0800 Subject: [PATCH 063/112] get_weights() updated Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/ppo/ppo.py | 11 ++--------- rllib/core/rl_trainer/rl_trainer.py | 3 ++- rllib/core/rl_trainer/torch/torch_rl_trainer.py | 10 ++++++++-- rllib/core/rl_trainer/trainer_runner.py | 8 ++++---- rllib/evaluation/worker_set.py | 5 +++-- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index b63bed35fd0b..2f481e2c2e44 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -391,14 +391,7 @@ def training_step(self) -> ResultDict: minibatch_size=self.config.sgd_minibatch_size, num_iters=self.config.num_sgd_iter, ) - # for epoch in range(self.config.num_sgd_iter): - # # bsize = self.config.sgd_minibatch_size - # # for minibatch in SampleBatchLoader(train_batch, batch_size=bsize): - # # train_results = self.trainer_runner.update(minibatch) - # # TODO (Kourosh) The output of trainer_runner.update() should be - # # one item not a list of items - - # train_results = self.trainer_runner.update(train_batch)[0] + elif self.config.simple_optimizer: train_results = train_one_step(self, train_batch) else: @@ -429,7 +422,7 @@ def training_step(self) -> ResultDict: from_worker = self.trainer_runner self.workers.sync_weights( from_worker=from_worker, - policies=list(train_results.keys()), + policies=policies_to_update, global_vars=global_vars, ) elif self.config._enable_rl_trainer_api: diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index e3e8bbc2e475..40667678c0c9 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -11,6 +11,7 @@ Mapping, Optional, Sequence, + Set, Tuple, Type, Union, @@ -409,7 +410,7 @@ def apply_gradients(self, gradients: Dict[ParamRef, TensorType]) -> None: """ @abc.abstractmethod - def get_weights(self) -> Mapping[str, Any]: + def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any]: """Returns the state of the underlying MultiAgentRLModule""" @abc.abstractmethod diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index ee0caa219c1c..8d67bca34a4b 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -9,6 +9,7 @@ Optional, Callable, TYPE_CHECKING, + Set ) from ray.rllib.core.rl_module.rl_module import ( @@ -167,9 +168,14 @@ def do_distributed_update(self, batch: MultiAgentBatch) -> Mapping[str, Any]: # in torch the distributed update is no different than the normal update return self._update(batch) - def get_weights(self) -> Mapping[str, Any]: + def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any]: """Returns the state of the underlying MultiAgentRLModule""" - return self._module.get_state() + + module_weights = self._module.get_state() + if module_ids is None: + return module_weights + + return {k: v for k, v in module_weights.items() if k in module_ids} def set_weights(self, weights: Mapping[str, Any]) -> None: """Sets the state of the 
underlying MultiAgentRLModule""" diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 03472ce41eaf..744c2acaec64 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -1,5 +1,5 @@ import math -from typing import Any, List, Mapping, Type, Optional, Callable, Dict +from typing import Any, List, Mapping, Type, Optional, Callable, Set import tree # pip install dm-tree import numpy as np @@ -287,12 +287,12 @@ def set_weights(self, weights) -> None: else: self._trainer.set_weights(weights) - def get_weights(self) -> Mapping[str, Any]: + def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any]: if self._distributed: worker = next(iter(self._workers)) - return ray.get(worker.get_weights.remote()) + return ray.get(worker.get_weights.remote(module_ids)) else: - return self._trainer.get_weights() + return self._trainer.get_weights(module_ids) def get_state(self) -> List[Mapping[ModuleID, Mapping[str, Any]]]: """Get the states of the RLTrainers""" diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 6dfd566bd0a1..fc6c85830dbe 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -413,7 +413,8 @@ def sync_weights( # Only sync if we have remote workers or `from_worker` is provided. weights = None if self.num_remote_workers() or from_worker is not None: - weights = (from_worker or self.local_worker()).get_weights(policies) + worker_or_trainer = from_worker or self.local_worker() + weights = worker_or_trainer.get_weights(policies) def set_weight(w): w.set_weights(weights, global_vars) @@ -433,7 +434,7 @@ def set_weight(w): # If `from_worker` is provided, also sync to this WorkerSet's # local worker. if self.local_worker() is not None: - if from_worker is not None: + if worker_or_trainer is not None: self.local_worker().set_weights(weights, global_vars=global_vars) # If `global_vars` is provided and local worker exists -> Update its # global_vars. 
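The `module_ids` filter that patch 063 adds to `get_weights()` above boils down to a plain dictionary filter over the multi-agent module state. A minimal, self-contained sketch of that behavior (the dict contents are placeholders, real module states hold framework tensors, and the function name here is illustrative):

from typing import Any, Mapping, Optional, Set


def filter_module_weights(
    module_weights: Mapping[str, Any], module_ids: Optional[Set[str]] = None
) -> Mapping[str, Any]:
    # No filter given: return the full multi-agent state unchanged.
    if module_ids is None:
        return module_weights
    # Otherwise keep only the requested module ids.
    return {k: v for k, v in module_weights.items() if k in module_ids}


state = {"default_policy": {"w": 1}, "opponent": {"w": 2}}
# Mirrors syncing only `policies_to_update` in the PPO training_step() change above.
assert filter_module_weights(state, {"default_policy"}) == {"default_policy": {"w": 1}}

This is what lets `WorkerSet.sync_weights(policies=...)` skip shipping weights for modules that were not updated in the current training step.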
From 94ca772f18bfe74ff4a246fcb2784c148f2aec06 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 26 Jan 2023 12:13:17 -0800 Subject: [PATCH 064/112] trying out a new configuration pattern for trainer runner and rl trainers Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 59 +++++++++++++++++-- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 19 +++--- .../core/rl_trainer/torch/torch_rl_trainer.py | 23 +++----- rllib/core/rl_trainer/trainer_runner.py | 45 +++++++------- .../core/rl_trainer/trainer_runner_config.py | 39 ++++++++++++ rllib/utils/params.py | 10 ++++ 6 files changed, 143 insertions(+), 52 deletions(-) create mode 100644 rllib/utils/params.py diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 8969cab69a1e..ebdd4889a967 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -31,10 +31,12 @@ from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import TensorType +from ray.rllib.core.rl_trainer.trainer_runner_config import RLTrainerScalingConfig if TYPE_CHECKING: from ray.air.config import ScalingConfig from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + from ray.rllib.core.rl_trainer.trainer_runner_config import Hyperparams torch, _ = try_import_torch() tf1, tf, tfv = try_import_tf() @@ -46,7 +48,47 @@ ParamOptimizerPairs = List[Tuple[Sequence[ParamType], Optimizer]] ParamRef = Hashable ParamDictType = Dict[ParamRef, ParamType] +HyperparamType = Union[AlgorithmConfig, Hyperparams] + +class RLTrainerSpec: + # The RLTrainer class to use. + rl_trainer_class: Type["RLTrainer"] = None + # The underlying (MA)RLModule spec to completely define the module + module_spec: Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] = None + # Alternatively the RLModule instance can be passed in directly (won't work if + # RLTrainer is an actor) + module: Optional[RLModule] = None, + # The scaling config for properly distributing the RLModule + scaling_config: RLTrainerScalingConfig = None + # The optimizer setting to apply during training + optimizer_config: Dict[str, Any] = {} + # The extra config for the loss/additional update specific hyper-parameters + # for now we assume we can get both algorithm config or a dict that contains the + # hyper-parameters + trainer_hyperparameters: HyperparamType= {} + + def __post_init__(self): + if not isinstance(self.trainer_hyperparameters, AlgorithmConfig): + self.trainer_hyperparameters = Hyperparams( + self.trainer_hyperparameters + ) + + def get_params_dict(self) -> Dict[str, Any]: + return { + "module": self.module, + "module_spec": self.module_spec, + "scaling_config": self.scaling_config, + "optimizer_config": self.optimizer_config, + "trainer_hyperparameters": self.trainer_hyperparameters, + } + def build(self): + return self.rl_trainer_class( + module_spec=self.module_spec, + optimizer_config=self.optimizer_config, + scaling_config=self.scaling_config, + algorithm_config=self.trainer_hyperparameters, + ) class RLTrainer: """Base class for RLlib algorithm trainers. 
@@ -118,9 +160,8 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - distributed: bool = False, - scaling_config: Optional["ScalingConfig"] = None, - algorithm_config: Optional["AlgorithmConfig"] = None, + scaling_config: Optional[RLTrainerScalingConfig] = None, + trainer_hyperparameters: Optional[HyperparamType] = None, ): # TODO (Kourosh): Having the entire algorithm_config inside trainer may not be # the best idea in the world, but it's easy to implement and user will @@ -140,9 +181,11 @@ def __init__( self.module_spec = module_spec self.module_obj = module self.optimizer_config = optimizer_config - self.distributed = distributed - self.scaling_config = scaling_config - self.config = algorithm_config + self.config = trainer_hyperparameters + + # pick the configs that we need for the trainer from scaling config + scaling_config = scaling_config or RLTrainerScalingConfig() + self._distributed = scaling_config.distributed # These are the attributes that are set during build self._module: MultiAgentRLModule = None @@ -151,6 +194,10 @@ def __init__( self._param_to_optim: Dict[ParamRef, Optimizer] = {} self._params: ParamDictType = {} + @property + def distributed(self) -> bool: + return self._distributed + @property def module(self) -> MultiAgentRLModule: return self._module diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 44a8f6d21581..f27c8d046be3 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -19,6 +19,7 @@ Optimizer, ParamType, ParamDictType, + HyperparamType ) from ray.rllib.core.rl_module.rl_module import ( RLModule, @@ -34,11 +35,13 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict +from ray.rllib.core.rl_trainer.trainer_runner_config import TFRLTrainerScalingConfig if TYPE_CHECKING: from ray.air.config import ScalingConfig from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + tf1, tf, tfv = try_import_tf() logger = logging.getLogger(__name__) @@ -98,19 +101,16 @@ def __init__( Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] ] = None, module: Optional[RLModule] = None, - optimizer_config: Mapping[str, Any], - distributed: bool = False, - enable_tf_function: bool = True, - scaling_config: Optional["ScalingConfig"] = None, - algorithm_config: Optional["AlgorithmConfig"] = None, + optimizer_config: Mapping[str, Any] = None, + scaling_config: Optional[TFRLTrainerScalingConfig] = None, + trainer_hyperparameters: Optional[HyperparamType] = None, ): super().__init__( module_spec=module_spec, module=module, optimizer_config=optimizer_config, - distributed=distributed, - scaling_config=scaling_config, - algorithm_config=algorithm_config, + scaling_config=scaling_config, + trainer_hyperparameters=trainer_hyperparameters, ) # TODO (Kourosh): This is required to make sure tf computes the values in the @@ -121,7 +121,8 @@ def __init__( # does not mention this as a requirement? 
tf1.enable_eager_execution() - self._enable_tf_function = enable_tf_function + scaling_config = scaling_config or TFRLTrainerScalingConfig() + self._enable_tf_function = scaling_config.enable_tf_function if self._enable_tf_function: self._update_fn = tf.function(self._do_update_fn) else: diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index b8313b47a584..66f776297249 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -26,6 +26,7 @@ Optimizer, ParamType, ParamDictType, + HyperparamType ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule from ray.rllib.policy.sample_batch import MultiAgentBatch @@ -39,13 +40,11 @@ if torch: from ray.air.config import ScalingConfig from ray.train.torch.train_loop_utils import _TorchAccelerator + from ray.rllib.core.rl_trainer.trainer_runner_config import TorchRLTrainerScalingConfig -if TYPE_CHECKING: - from ray.rllib.algorithms.algorithm_config import AlgorithmConfig logger = logging.getLogger(__name__) - class TorchRLTrainer(RLTrainer): framework: str = "torch" @@ -57,24 +56,20 @@ def __init__( Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] ] = None, module: Optional[RLModule] = None, - optimizer_config: Mapping[str, Any], - distributed: bool = False, - scaling_config: Optional["ScalingConfig"] = None, - algorithm_config: Optional["AlgorithmConfig"] = None, + optimizer_config: Mapping[str, Any] = None, + scaling_config: Optional[TorchRLTrainerScalingConfig] = None, + trainer_hyperparameters: Optional[HyperparamType] = None, ): super().__init__( module_spec=module_spec, module=module, optimizer_config=optimizer_config, - distributed=distributed, - scaling_config=scaling_config, - algorithm_config=algorithm_config, + scaling_config=scaling_config, + trainer_hyperparameters=trainer_hyperparameters, ) - # TODO (Kourosh): Scaling config is required for torch trainer to do proper DDP - # wraping setup but not so much required for tf. we need to - scaling_config = scaling_config or ScalingConfig() - self._world_size = scaling_config.num_workers or 1 + # pick the stuff that we need from the scaling config + scaling_config = scaling_config or TorchRLTrainerScalingConfig() self._use_gpu = scaling_config.use_gpu # These attributes are set in the `build` method. diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index c6fcda3deacf..5bd995c85acc 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -1,5 +1,5 @@ import math -from typing import Any, List, Mapping, Type, Optional, Callable, Dict +from typing import Any, List, Mapping, Type, Optional, Callable, Dict, TYPE_CHECKING import ray @@ -14,12 +14,15 @@ Optimizer, ) + from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.air.config import ScalingConfig from ray.train._internal.backend_executor import BackendExecutor +if TYPE_CHECKING: + from ray.rllib.core.rl_trainer.trainer_runner_config import TrainerRunnerScalingConfig class TrainerRunner: """Coordinator of RLTrainers. 
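An aside on the `enable_tf_function` flag that `TfRLTrainer` now reads from its scaling config (see the tf_rl_trainer.py hunk above): wrapping the update function in `tf.function` trades eager-mode debuggability for graph execution, and both paths should produce identical results. A toy, self-contained sketch of that switch, with names local to the sketch rather than RLlib APIs:

import tensorflow as tf


def make_update_fn(enable_tf_function: bool):
    def _do_update(x):
        # Stand-in for the real forward/backward pass.
        return tf.reduce_sum(x) * 2.0

    # Compile into a traced graph only when requested; otherwise stay eager.
    return tf.function(_do_update) if enable_tf_function else _do_update


eager_fn = make_update_fn(enable_tf_function=False)  # step-through debuggable
graph_fn = make_update_fn(enable_tf_function=True)   # traced / compiled
x = tf.constant([1.0, 2.0, 3.0])
assert float(eager_fn(x)) == float(graph_fn(x)) == 12.0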
@@ -49,18 +52,16 @@ def __init__( self, trainer_class: Type[RLTrainer], trainer_config: Mapping[str, Any], - compute_config: Mapping[str, Any], + scaling_config: Optional[TrainerRunnerScalingConfig] = None ): - num_gpus = compute_config.get("num_gpus", 0) - use_fake_gpus = compute_config.get("_use_fake_gpus", False) + scaling_config = scaling_config or TrainerRunnerScalingConfig() self._trainer_config = trainer_config - if num_gpus > 0: - scaling_config = ScalingConfig( - num_workers=num_gpus, - use_gpu=(not use_fake_gpus), - ) - + self._is_local = scaling_config.local + if self._is_local: + self._trainer = trainer_class(**trainer_config, distributed=False) + self._trainer.build() + else: if trainer_class.framework == "torch": from ray.train.torch import TorchConfig @@ -82,10 +83,9 @@ def __init__( # TODO(avnishn, kourosh): Should we pass in scaling config into the # trainer? - trainer_config["distributed"] = self._distributed = bool(num_gpus > 1) - trainer_config["scaling_config"] = scaling_config + is_module_distributed = scaling_config.num_workers > 1 self.backend_executor.start( - train_cls=trainer_class, train_cls_kwargs=trainer_config + train_cls=trainer_class, train_cls_kwargs=dict(**trainer_config, distributed=is_module_distributed, scaling_config=scaling_config) ) self._workers = [ w.actor for w in self.backend_executor.worker_group.workers @@ -93,10 +93,9 @@ def __init__( ray.get([w.build.remote() for w in self._workers]) - else: - trainer_config["distributed"] = self._distributed = False - self._trainer = trainer_class(**trainer_config) - self._trainer.build() + @property + def is_local(self) -> bool: + return not self._is_local def update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: """Do a gradient based update to the RLTrainer(s) maintained by this TrainerRunner. @@ -107,7 +106,7 @@ def update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: Returns: A list of dictionaries of results from the updates from the RLTrainer(s) """ - if self._distributed: + if self.is_local: return self._distributed_update(batch) else: return [self._trainer.update(batch)] @@ -157,7 +156,7 @@ def additional_update(self, *args, **kwargs) -> List[Mapping[str, Any]]: A list of dictionaries of results from the updates from each worker. """ - if self._distributed: + if self.is_local: refs = [] for worker in self._workers: refs.append(worker.additional_update.remote(*args, **kwargs)) @@ -186,7 +185,7 @@ def add_module( optimizer_cls: The optimizer class to use. If None, the set_optimizer_fn should be provided. """ - if self._distributed: + if self.is_local: refs = [] for worker in self._workers: ref = worker.add_module.remote( @@ -212,7 +211,7 @@ def remove_module(self, module_id: ModuleID) -> None: module_id: The id of the module to remove. 
""" - if self._distributed: + if self.is_local: refs = [] for worker in self._workers: ref = worker.remove_module.remote(module_id) @@ -232,7 +231,7 @@ def get_weight(self) -> Dict: def get_state(self) -> List[Mapping[ModuleID, Mapping[str, Any]]]: """Get the states of the RLTrainers""" - if self._distributed: + if self.is_local: refs = [] for worker in self._workers: refs.append(worker.get_state.remote()) @@ -247,7 +246,7 @@ def set_state(self, state: List[Mapping[ModuleID, Mapping[str, Any]]]) -> None: state: The state of the RLTrainers """ - if self._distributed: + if self.is_local: refs = [] for worker in self._workers: refs.append(worker.set_state.remote(state)) diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index d193e7cbb1f8..c969ef75441d 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -1,4 +1,6 @@ from typing import Type, Optional, TYPE_CHECKING, Union, Dict +from dataclasses import dataclass + from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.utils.from_config import NotProvided @@ -11,6 +13,43 @@ ModuleSpec = Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] +class RLTrainerScalingConfig: + """Base class for scaling config relevant to RLTrainer.""" + distributed: bool = False + +class TorchRLTrainerScalingConfig(RLTrainerScalingConfig): + """Torch-specific scaling config relevant to TorchRLTrainer.""" + use_gpu: bool = False + +class TFRLTrainerScalingConfig(RLTrainerScalingConfig): + """Place holder for TF-specific scaling config relevant to TFRLTrainer.""" + enable_tf_function: bool = True + +@dataclass +class TrainerRunnerScalingConfig: + """Configuratiom for scaling training actors. + + Attributes: + local: If True, create a trainer in the current process. This is useful for + debugging to be able to use breakpoints. If False, the trainers are created + as Ray actors. + num_workers: The number of workers to use for training. + num_cpus_per_worker: The number of CPUs to allocate per worker. + num_gpus_per_worker: The number of GPUs to allocate per worker. + use_gpu: If True, use GPUs for training. It will be automatically set to True + if num_gpus_per_worker > 0. + """ + local: bool = True + num_workers: int = 1 + num_cpus_per_worker: int = 1 + num_gpus_per_worker: int = 0 + use_gpu: bool = False + + def __post_init__(self): + if self.num_gpus_per_worker > 0: + self.use_gpu = True + + # TODO (Kourosh): We should make all configs come from a standard base class that # defines the general interfaces for validation, from_dict, to_dict etc. 
class TrainerRunnerConfig: diff --git a/rllib/utils/params.py b/rllib/utils/params.py new file mode 100644 index 000000000000..01873e68d45c --- /dev/null +++ b/rllib/utils/params.py @@ -0,0 +1,10 @@ +from ray.rllib.utils.annotations import ExperimentalAPI + +@ExperimentalAPI +class Hyperparams(dict): + """This is an extention of the dict class that allows access via `.` notation.""" + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + self[key] = value From 29ac2fb5cbea7b519f55d9b3f30edb4f15035baf Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 26 Jan 2023 14:32:46 -0800 Subject: [PATCH 065/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 49 +------- rllib/core/rl_trainer/rl_trainer_config.py | 107 ++++++++++++++++++ .../core/rl_trainer/tests/test_rl_trainer.py | 6 +- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 4 +- .../core/rl_trainer/torch/torch_rl_trainer.py | 9 +- rllib/core/rl_trainer/trainer_runner.py | 66 ++++++++--- .../core/rl_trainer/trainer_runner_config.py | 28 +---- rllib/utils/params.py | 2 + 8 files changed, 177 insertions(+), 94 deletions(-) create mode 100644 rllib/core/rl_trainer/rl_trainer_config.py diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index ebdd4889a967..e8cb88dffec7 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -31,12 +31,10 @@ from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import TensorType -from ray.rllib.core.rl_trainer.trainer_runner_config import RLTrainerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import ( + RLTrainerScalingConfig, HyperparamType +) -if TYPE_CHECKING: - from ray.air.config import ScalingConfig - from ray.rllib.algorithms.algorithm_config import AlgorithmConfig - from ray.rllib.core.rl_trainer.trainer_runner_config import Hyperparams torch, _ = try_import_torch() tf1, tf, tfv = try_import_tf() @@ -48,47 +46,6 @@ ParamOptimizerPairs = List[Tuple[Sequence[ParamType], Optimizer]] ParamRef = Hashable ParamDictType = Dict[ParamRef, ParamType] -HyperparamType = Union[AlgorithmConfig, Hyperparams] - -class RLTrainerSpec: - # The RLTrainer class to use. 
- rl_trainer_class: Type["RLTrainer"] = None - # The underlying (MA)RLModule spec to completely define the module - module_spec: Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] = None - # Alternatively the RLModule instance can be passed in directly (won't work if - # RLTrainer is an actor) - module: Optional[RLModule] = None, - # The scaling config for properly distributing the RLModule - scaling_config: RLTrainerScalingConfig = None - # The optimizer setting to apply during training - optimizer_config: Dict[str, Any] = {} - # The extra config for the loss/additional update specific hyper-parameters - # for now we assume we can get both algorithm config or a dict that contains the - # hyper-parameters - trainer_hyperparameters: HyperparamType= {} - - def __post_init__(self): - if not isinstance(self.trainer_hyperparameters, AlgorithmConfig): - self.trainer_hyperparameters = Hyperparams( - self.trainer_hyperparameters - ) - - def get_params_dict(self) -> Dict[str, Any]: - return { - "module": self.module, - "module_spec": self.module_spec, - "scaling_config": self.scaling_config, - "optimizer_config": self.optimizer_config, - "trainer_hyperparameters": self.trainer_hyperparameters, - } - - def build(self): - return self.rl_trainer_class( - module_spec=self.module_spec, - optimizer_config=self.optimizer_config, - scaling_config=self.scaling_config, - algorithm_config=self.trainer_hyperparameters, - ) class RLTrainer: """Base class for RLlib algorithm trainers. diff --git a/rllib/core/rl_trainer/rl_trainer_config.py b/rllib/core/rl_trainer/rl_trainer_config.py new file mode 100644 index 000000000000..af8846e50771 --- /dev/null +++ b/rllib/core/rl_trainer/rl_trainer_config.py @@ -0,0 +1,107 @@ +import abc +from dataclasses import dataclass +from typing import Any, Dict, Optional, Type, Union, TYPE_CHECKING + +from ray.rllib.utils.params import Hyperparams + +if TYPE_CHECKING: + from ray.rllib.core.rl_module.rl_module import RLModule, SingleAgentRLModuleSpec + from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec + from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer + from ray.rllib.algorithms.algorithm import AlgorithmConfig + + +HyperparamType = Union["AlgorithmConfig", Hyperparams] + +@dataclass +class RLTrainerScalingConfig: + """Base class for scaling config relevant to RLTrainer.""" + + def __post_init__(self): + self._distributed: bool = False + + @property + def distributed(self) -> bool: + return self._distributed + + def set_distributed(self, distributed: bool) -> "RLTrainerScalingConfig": + """Set the distributed flag. + + _distibuted attribute should not be set directly at the time of constuction, + the caller should explicitly decide whether the rl_trainer should be + instiantiated in distributed mode or not. + + Args: + distributed: If True, the rl trainer will be instantiated in distributed + mode. + """ + self._distributed = distributed + return self + + +@dataclass +class TorchRLTrainerScalingConfig(RLTrainerScalingConfig): + """Torch-specific scaling config relevant to TorchRLTrainer.""" + + def __post_init__(self): + super().__post_init__() + self._use_gpu: bool = False + + @property + def use_gpu(self) -> bool: + return self._use_gpu + + def set_use_gpu(self, use_gpu: bool) -> "TorchRLTrainerScalingConfig": + """Set the use_gpu flag. 
+ + _use_gpu attribute should not be set directly at the time of constuction, + the caller should explicitly decide whether the torch rl_trainer should be using gpu or not + + Args: + use_gpu: If True, the rl trainer will be setup to use the gpu. + """ + self._use_gpu = use_gpu + return self + + +@dataclass +class TFRLTrainerScalingConfig(RLTrainerScalingConfig): + """Place holder for TF-specific scaling config relevant to TFRLTrainer.""" + + enable_tf_function: bool = True + + + +class RLTrainerSpec: + # The RLTrainer class to use. + rl_trainer_class: Type["RLTrainer"] = None + # The underlying (MA)RLModule spec to completely define the module + module_spec: Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] = None + # Alternatively the RLModule instance can be passed in directly (won't work if + # RLTrainer is an actor) + module: Optional[RLModule] = (None,) + # The scaling config for properly distributing the RLModule + scaling_config: RLTrainerScalingConfig = None + # The optimizer setting to apply during training + optimizer_config: Dict[str, Any] = {} + # The extra config for the loss/additional update specific hyper-parameters + # for now we assume we can get both algorithm config or a dict that contains the + # hyper-parameters + trainer_hyperparameters: HyperparamType = {} + + def __post_init__(self): + if isinstance(self.trainer_hyperparameters, abc.Mapping): + self.trainer_hyperparameters = Hyperparams(self.trainer_hyperparameters) + + def get_params_dict(self) -> Dict[str, Any]: + return { + "module": self.module, + "module_spec": self.module_spec, + "scaling_config": self.scaling_config, + "optimizer_config": self.optimizer_config, + "trainer_hyperparameters": self.trainer_hyperparameters, + } + + def build(self): + return self.rl_trainer_class(**self.get_params_dict()) + diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index 8af9b0ae7245..3077628de412 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -11,7 +11,7 @@ from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader - +from ray.rllib.core.rl_trainer.trainer_runner_config import TFRLTrainerScalingConfig def get_trainer(distributed=False) -> RLTrainer: env = gym.make("CartPole-v1") @@ -29,7 +29,9 @@ def get_trainer(distributed=False) -> RLTrainer: model_config={"hidden_dim": 32}, ), optimizer_config={"lr": 1e-3}, - distributed=distributed, + scaling_config=TFRLTrainerScalingConfig( + enable_tf_function=False, + ).set_distributed(distributed), ) trainer.build() diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index f27c8d046be3..2641061fc04b 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -19,7 +19,7 @@ Optimizer, ParamType, ParamDictType, - HyperparamType + HyperparamType, ) from ray.rllib.core.rl_module.rl_module import ( RLModule, @@ -109,7 +109,7 @@ def __init__( module_spec=module_spec, module=module, optimizer_config=optimizer_config, - scaling_config=scaling_config, + scaling_config=scaling_config, trainer_hyperparameters=trainer_hyperparameters, ) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 66f776297249..c955279698f5 100644 --- 
a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -26,7 +26,7 @@ Optimizer, ParamType, ParamDictType, - HyperparamType + HyperparamType, ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule from ray.rllib.policy.sample_batch import MultiAgentBatch @@ -40,11 +40,14 @@ if torch: from ray.air.config import ScalingConfig from ray.train.torch.train_loop_utils import _TorchAccelerator - from ray.rllib.core.rl_trainer.trainer_runner_config import TorchRLTrainerScalingConfig + from ray.rllib.core.rl_trainer.trainer_runner_config import ( + TorchRLTrainerScalingConfig, + ) logger = logging.getLogger(__name__) + class TorchRLTrainer(RLTrainer): framework: str = "torch" @@ -64,7 +67,7 @@ def __init__( module_spec=module_spec, module=module, optimizer_config=optimizer_config, - scaling_config=scaling_config, + scaling_config=scaling_config, trainer_hyperparameters=trainer_hyperparameters, ) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 5bd995c85acc..831ec4535d10 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -1,5 +1,15 @@ import math -from typing import Any, List, Mapping, Type, Optional, Callable, Dict, TYPE_CHECKING +from typing import ( + Any, + List, + Mapping, + Type, + Optional, + Callable, + Dict, + TYPE_CHECKING, + cast, +) import ray @@ -12,6 +22,7 @@ RLTrainer, ParamOptimizerPairs, Optimizer, + RLTrainerSpec, ) @@ -22,7 +33,12 @@ from ray.train._internal.backend_executor import BackendExecutor if TYPE_CHECKING: - from ray.rllib.core.rl_trainer.trainer_runner_config import TrainerRunnerScalingConfig + from ray.rllib.core.rl_trainer.trainer_runner_config import ( + TrainerRunnerScalingConfig, + TorchRLTrainerScalingConfig, + TFRLTrainerScalingConfig, + ) + class TrainerRunner: """Coordinator of RLTrainers. 
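The trainer_runner.py hunks below have the runner, not the user, decide the `distributed` and `use_gpu` flags by calling `set_distributed()` / `set_use_gpu()` on the spec's scaling config. The chained-setter shape of those methods reduces to this toy sketch (a stand-in class, not the RLlib config):

from dataclasses import dataclass


@dataclass
class ToyScalingConfig:
    def __post_init__(self):
        # Flags the runner is supposed to decide after construction.
        self._distributed = False
        self._use_gpu = False

    def set_distributed(self, distributed: bool) -> "ToyScalingConfig":
        self._distributed = distributed
        return self  # returning self allows chaining

    def set_use_gpu(self, use_gpu: bool) -> "ToyScalingConfig":
        self._use_gpu = use_gpu
        return self


cfg = ToyScalingConfig().set_distributed(True).set_use_gpu(False)
assert cfg._distributed and not cfg._use_gpu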
@@ -50,30 +66,46 @@ class TrainerRunner: def __init__( self, - trainer_class: Type[RLTrainer], - trainer_config: Mapping[str, Any], - scaling_config: Optional[TrainerRunnerScalingConfig] = None + rl_trainer_spec: RLTrainerSpec, + scaling_config: Optional[TrainerRunnerScalingConfig] = None, ): scaling_config = scaling_config or TrainerRunnerScalingConfig() - self._trainer_config = trainer_config + rl_trainer_class = rl_trainer_spec.trainer_class self._is_local = scaling_config.local if self._is_local: - self._trainer = trainer_class(**trainer_config, distributed=False) + # in local mode the trainer is always not distributed + rl_trainer_spec.scaling_config.set_distributed(False) + self._trainer = rl_trainer_class(**rl_trainer_spec.get_params_dict()) self._trainer.build() else: - if trainer_class.framework == "torch": + # in remote mode the trainer is distributed only if there are more than 1 + # workers + is_trainer_distributed = scaling_config.num_workers > 1 + rl_trainer_spec.scaling_config.set_distributed(is_trainer_distributed) + + if rl_trainer_class.framework == "torch": from ray.train.torch import TorchConfig backend_config = TorchConfig() - elif trainer_class.framework == "tf": + trainer_scaling_config = cast( + TorchRLTrainerScalingConfig, rl_trainer_spec.scaling_config + ) + + trainer_should_use_gpu = scaling_config.num_gpus_per_worker > 0 + trainer_scaling_config.set_use_gpu(trainer_should_use_gpu) + + elif rl_trainer_class.framework == "tf": from ray.train.tensorflow import TensorflowConfig backend_config = TensorflowConfig() + trainer_scaling_config = cast( + TFRLTrainerScalingConfig, rl_trainer_spec.scaling_config + ) else: raise ValueError("framework must be either torch or tf") - self.backend_executor = BackendExecutor( + backend_executor = BackendExecutor( backend_config=backend_config, num_workers=scaling_config.num_workers, num_cpus_per_worker=scaling_config.num_cpus_per_worker, @@ -81,16 +113,14 @@ def __init__( max_retries=0, ) - # TODO(avnishn, kourosh): Should we pass in scaling config into the - # trainer? 
- is_module_distributed = scaling_config.num_workers > 1 - self.backend_executor.start( - train_cls=trainer_class, train_cls_kwargs=dict(**trainer_config, distributed=is_module_distributed, scaling_config=scaling_config) + backend_executor.start( + train_cls=rl_trainer_class, + train_cls_kwargs=rl_trainer_spec.get_params_dict(), ) - self._workers = [ - w.actor for w in self.backend_executor.worker_group.workers - ] + self._workers = [w.actor for w in backend_executor.worker_group.workers] + + # run the neural network building code on remote workers ray.get([w.build.remote() for w in self._workers]) @property diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index c969ef75441d..0af4eb53a6f8 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -13,41 +13,23 @@ ModuleSpec = Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] -class RLTrainerScalingConfig: - """Base class for scaling config relevant to RLTrainer.""" - distributed: bool = False - -class TorchRLTrainerScalingConfig(RLTrainerScalingConfig): - """Torch-specific scaling config relevant to TorchRLTrainer.""" - use_gpu: bool = False - -class TFRLTrainerScalingConfig(RLTrainerScalingConfig): - """Place holder for TF-specific scaling config relevant to TFRLTrainer.""" - enable_tf_function: bool = True - @dataclass class TrainerRunnerScalingConfig: """Configuratiom for scaling training actors. Attributes: - local: If True, create a trainer in the current process. This is useful for - debugging to be able to use breakpoints. If False, the trainers are created - as Ray actors. - num_workers: The number of workers to use for training. + local: If True, create a trainer in the current process. This is useful for + debugging to be able to use breakpoints. If False, the trainers are created + as Ray actors. + num_workers: The number of workers to use for training. num_cpus_per_worker: The number of CPUs to allocate per worker. num_gpus_per_worker: The number of GPUs to allocate per worker. - use_gpu: If True, use GPUs for training. It will be automatically set to True - if num_gpus_per_worker > 0. 
""" + local: bool = True num_workers: int = 1 num_cpus_per_worker: int = 1 num_gpus_per_worker: int = 0 - use_gpu: bool = False - - def __post_init__(self): - if self.num_gpus_per_worker > 0: - self.use_gpu = True # TODO (Kourosh): We should make all configs come from a standard base class that diff --git a/rllib/utils/params.py b/rllib/utils/params.py index 01873e68d45c..0b6e97205557 100644 --- a/rllib/utils/params.py +++ b/rllib/utils/params.py @@ -1,8 +1,10 @@ from ray.rllib.utils.annotations import ExperimentalAPI + @ExperimentalAPI class Hyperparams(dict): """This is an extention of the dict class that allows access via `.` notation.""" + def __getattr__(self, key): return self[key] From 4714e207351da4cf1084aef7defc2685b6cd270a Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 26 Jan 2023 14:33:19 -0800 Subject: [PATCH 066/112] lint Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 4 +++- rllib/core/rl_trainer/rl_trainer_config.py | 3 +-- rllib/core/rl_trainer/tests/test_rl_trainer.py | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index e8cb88dffec7..1a841fb4ca17 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -32,7 +32,8 @@ from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import TensorType from ray.rllib.core.rl_trainer.rl_trainer_config import ( - RLTrainerScalingConfig, HyperparamType + RLTrainerScalingConfig, + HyperparamType, ) @@ -47,6 +48,7 @@ ParamRef = Hashable ParamDictType = Dict[ParamRef, ParamType] + class RLTrainer: """Base class for RLlib algorithm trainers. diff --git a/rllib/core/rl_trainer/rl_trainer_config.py b/rllib/core/rl_trainer/rl_trainer_config.py index af8846e50771..4bc3c3dffeec 100644 --- a/rllib/core/rl_trainer/rl_trainer_config.py +++ b/rllib/core/rl_trainer/rl_trainer_config.py @@ -13,6 +13,7 @@ HyperparamType = Union["AlgorithmConfig", Hyperparams] + @dataclass class RLTrainerScalingConfig: """Base class for scaling config relevant to RLTrainer.""" @@ -71,7 +72,6 @@ class TFRLTrainerScalingConfig(RLTrainerScalingConfig): enable_tf_function: bool = True - class RLTrainerSpec: # The RLTrainer class to use. 
rl_trainer_class: Type["RLTrainer"] = None @@ -104,4 +104,3 @@ def get_params_dict(self) -> Dict[str, Any]: def build(self): return self.rl_trainer_class(**self.get_params_dict()) - diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index 3077628de412..c9269af2acc0 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -13,6 +13,7 @@ from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.core.rl_trainer.trainer_runner_config import TFRLTrainerScalingConfig + def get_trainer(distributed=False) -> RLTrainer: env = gym.make("CartPole-v1") From abd5e5ee7bd99f173ba581fe7d9829a7a01e3371 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 26 Jan 2023 18:55:34 -0800 Subject: [PATCH 067/112] rl_trainer tf test passes again Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer_config.py | 32 +++++++++++++++---- .../core/rl_trainer/tests/test_rl_trainer.py | 2 +- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 2 +- .../core/rl_trainer/torch/torch_rl_trainer.py | 7 ++-- rllib/core/rl_trainer/trainer_runner.py | 11 ++----- 5 files changed, 35 insertions(+), 19 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer_config.py b/rllib/core/rl_trainer/rl_trainer_config.py index 4bc3c3dffeec..b39ae37aecf4 100644 --- a/rllib/core/rl_trainer/rl_trainer_config.py +++ b/rllib/core/rl_trainer/rl_trainer_config.py @@ -1,5 +1,5 @@ import abc -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, Dict, Optional, Type, Union, TYPE_CHECKING from ray.rllib.utils.params import Hyperparams @@ -72,22 +72,42 @@ class TFRLTrainerScalingConfig(RLTrainerScalingConfig): enable_tf_function: bool = True + +@dataclass +class TrainerRunnerScalingConfig: + """Configuratiom for scaling training actors. + + Attributes: + local: If True, create a trainer in the current process. This is useful for + debugging to be able to use breakpoints. If False, the trainers are created + as Ray actors. + num_workers: The number of workers to use for training. + num_cpus_per_worker: The number of CPUs to allocate per worker. + num_gpus_per_worker: The number of GPUs to allocate per worker. + """ + + local: bool = True + num_workers: int = 1 + num_cpus_per_worker: int = 1 + num_gpus_per_worker: int = 0 + +@dataclass class RLTrainerSpec: # The RLTrainer class to use. 
rl_trainer_class: Type["RLTrainer"] = None # The underlying (MA)RLModule spec to completely define the module - module_spec: Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] = None + module_spec: Union["SingleAgentRLModuleSpec", "MultiAgentRLModuleSpec"] = None # Alternatively the RLModule instance can be passed in directly (won't work if # RLTrainer is an actor) - module: Optional[RLModule] = (None,) + module: Optional["RLModule"] = (None,) # The scaling config for properly distributing the RLModule - scaling_config: RLTrainerScalingConfig = None + scaling_config: "RLTrainerScalingConfig" = None # The optimizer setting to apply during training - optimizer_config: Dict[str, Any] = {} + optimizer_config: Dict[str, Any] = field(default_factory=dict) # The extra config for the loss/additional update specific hyper-parameters # for now we assume we can get both algorithm config or a dict that contains the # hyper-parameters - trainer_hyperparameters: HyperparamType = {} + trainer_hyperparameters: HyperparamType = field(default_factory=dict) def __post_init__(self): if isinstance(self.trainer_hyperparameters, abc.Mapping): diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index c9269af2acc0..5e8b17413165 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -11,7 +11,7 @@ from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader -from ray.rllib.core.rl_trainer.trainer_runner_config import TFRLTrainerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import TFRLTrainerScalingConfig def get_trainer(distributed=False) -> RLTrainer: diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 2641061fc04b..986c313f8478 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -35,7 +35,7 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict -from ray.rllib.core.rl_trainer.trainer_runner_config import TFRLTrainerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import TFRLTrainerScalingConfig if TYPE_CHECKING: from ray.air.config import ScalingConfig diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index c955279698f5..be380b0773f2 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -29,6 +29,9 @@ HyperparamType, ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule +from ray.rllib.core.rl_trainer.rl_trainer_config import ( + TorchRLTrainerScalingConfig, +) from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType @@ -40,9 +43,7 @@ if torch: from ray.air.config import ScalingConfig from ray.train.torch.train_loop_utils import _TorchAccelerator - from ray.rllib.core.rl_trainer.trainer_runner_config import ( - TorchRLTrainerScalingConfig, - ) + logger = logging.getLogger(__name__) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 831ec4535d10..756421f60da4 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ 
b/rllib/core/rl_trainer/trainer_runner.py @@ -19,11 +19,10 @@ SingleAgentRLModuleSpec, ) from ray.rllib.core.rl_trainer.rl_trainer import ( - RLTrainer, ParamOptimizerPairs, Optimizer, - RLTrainerSpec, ) +from ray.rllib.core.rl_trainer.rl_trainer_config import RLTrainerSpec from ray.rllib.policy.sample_batch import MultiAgentBatch @@ -32,12 +31,8 @@ from ray.air.config import ScalingConfig from ray.train._internal.backend_executor import BackendExecutor -if TYPE_CHECKING: - from ray.rllib.core.rl_trainer.trainer_runner_config import ( - TrainerRunnerScalingConfig, - TorchRLTrainerScalingConfig, - TFRLTrainerScalingConfig, - ) +from ray.rllib.core.rl_trainer.rl_trainer_config import TorchRLTrainerScalingConfig, TFRLTrainerScalingConfig, TrainerRunnerScalingConfig + class TrainerRunner: From a44c370bb3bebeae9cc46561229103aed8c9d003 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 26 Jan 2023 23:37:15 -0800 Subject: [PATCH 068/112] torch rl trainer test passed Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer_config.py | 2 +- .../torch/tests/test_torch_rl_trainer.py | 17 +++++++++-------- rllib/core/rl_trainer/torch/torch_rl_trainer.py | 3 +-- rllib/core/rl_trainer/trainer_runner.py | 7 +++++-- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer_config.py b/rllib/core/rl_trainer/rl_trainer_config.py index b39ae37aecf4..9c5632dbb98f 100644 --- a/rllib/core/rl_trainer/rl_trainer_config.py +++ b/rllib/core/rl_trainer/rl_trainer_config.py @@ -72,7 +72,6 @@ class TFRLTrainerScalingConfig(RLTrainerScalingConfig): enable_tf_function: bool = True - @dataclass class TrainerRunnerScalingConfig: """Configuratiom for scaling training actors. @@ -91,6 +90,7 @@ class TrainerRunnerScalingConfig: num_cpus_per_worker: int = 1 num_gpus_per_worker: int = 0 + @dataclass class RLTrainerSpec: # The RLTrainer class to use. 
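One detail worth noting before the diffs of this patch: once `RLTrainerSpec` carries the `@dataclass` decorator, a bare mutable default such as `optimizer_config: Dict[str, Any] = {}` is rejected at class-definition time, which is why patch 067 above moved to `field(default_factory=dict)`. A minimal illustration with a toy class, not the RLlib one:

from dataclasses import dataclass, field


@dataclass
class SpecSketch:
    # default_factory builds a fresh dict per instance; a bare `= {}` here would
    # raise "mutable default ... is not allowed" when the class is defined.
    optimizer_config: dict = field(default_factory=dict)


a, b = SpecSketch(), SpecSketch()
a.optimizer_config["lr"] = 1e-3
assert b.optimizer_config == {}  # no state shared across spec instances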
diff --git a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py index 40806048b6dc..f6d5fb0099a5 100644 --- a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py @@ -12,13 +12,13 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.core.rl_trainer.rl_trainer_config import TorchRLTrainerScalingConfig from ray.air.config import ScalingConfig -def _get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: +def _get_trainer(distributed: bool = False) -> RLTrainer: env = gym.make("CartPole-v1") - scaling_config = scaling_config or ScalingConfig() distributed = False # TODO: Another way to make RLTrainer would be to construct the module first @@ -33,9 +33,10 @@ def _get_trainer(scaling_config=None, distributed: bool = False) -> RLTrainer: action_space=env.action_space, model_config={"hidden_dim": 32}, ), - scaling_config=scaling_config, + scaling_config=TorchRLTrainerScalingConfig() + .set_distributed(distributed) + .set_use_gpu(False), optimizer_config={"lr": 1e-3}, - distributed=distributed, ) trainer.build() @@ -54,7 +55,7 @@ def tearDown(cls) -> None: def test_end_to_end_update(self): - trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) + trainer = _get_trainer() reader = get_cartpole_dataset_reader(batch_size=512) min_loss = float("inf") @@ -77,7 +78,7 @@ def test_compute_gradients(self): Tests that if we sum all the trainable variables the gradient of output w.r.t. the weights is all ones. """ - trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) + trainer = _get_trainer() params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) loss = {"total_loss": sum([param.sum() for param in params])} @@ -96,7 +97,7 @@ def test_apply_gradients(self): standard SGD/Adam update rule. """ - trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) + trainer = _get_trainer() # calculated the expected new params based on gradients of all ones. params = trainer.get_parameters(trainer.module[DEFAULT_POLICY_ID]) @@ -120,7 +121,7 @@ def test_add_remove_module(self): all variables the updated parameters follow the SGD update rule. 
""" env = gym.make("CartPole-v1") - trainer = _get_trainer(scaling_config=ScalingConfig(num_workers=2)) + trainer = _get_trainer() # add a test module with SGD optimizer with a known lr lr = 1e-4 diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index be380b0773f2..06e59a155609 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -30,7 +30,7 @@ ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule from ray.rllib.core.rl_trainer.rl_trainer_config import ( - TorchRLTrainerScalingConfig, + TorchRLTrainerScalingConfig, ) from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override @@ -45,7 +45,6 @@ from ray.train.torch.train_loop_utils import _TorchAccelerator - logger = logging.getLogger(__name__) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 756421f60da4..2242f1ddd954 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -31,8 +31,11 @@ from ray.air.config import ScalingConfig from ray.train._internal.backend_executor import BackendExecutor -from ray.rllib.core.rl_trainer.rl_trainer_config import TorchRLTrainerScalingConfig, TFRLTrainerScalingConfig, TrainerRunnerScalingConfig - +from ray.rllib.core.rl_trainer.rl_trainer_config import ( + TorchRLTrainerScalingConfig, + TFRLTrainerScalingConfig, + TrainerRunnerScalingConfig, +) class TrainerRunner: From e496fccba4d348d7135d7c8436defb0c3925551a Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 00:16:51 -0800 Subject: [PATCH 069/112] trainer_runner_config test works too Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm_config.py | 19 ++++- rllib/core/rl_trainer/rl_trainer.py | 1 - rllib/core/rl_trainer/rl_trainer_config.py | 39 ++++++--- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 5 -- .../torch/tests/test_torch_rl_trainer.py | 2 - .../core/rl_trainer/torch/torch_rl_trainer.py | 2 - rllib/core/rl_trainer/trainer_runner.py | 27 +++---- .../core/rl_trainer/trainer_runner_config.py | 79 +++++++------------ 8 files changed, 82 insertions(+), 92 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index f2bb985b72ef..c9cb04158171 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -242,6 +242,9 @@ def __init__(self, algo_class=None): self.num_gpus_per_worker = 0 self._fake_gpus = False self.num_cpus_for_local_worker = 1 + self.num_trainer_workers = 0 + self.num_gpus_per_trainer_worker = 0 + self.num_cpus_per_trainer_worker = 1 self.custom_resources_per_worker = {} self.placement_strategy = "PACK" @@ -946,6 +949,9 @@ def resources( num_cpus_per_worker: Optional[Union[float, int]] = NotProvided, num_gpus_per_worker: Optional[Union[float, int]] = NotProvided, num_cpus_for_local_worker: Optional[int] = NotProvided, + num_trainer_workers: Optional[int] = NotProvided, + num_cpus_per_trainer_worker: Optional[Union[float, int]] = NotProvided, + num_gpus_per_trainer_worker: Optional[Union[float, int]] = NotProvided, custom_resources_per_worker: Optional[dict] = NotProvided, placement_strategy: Optional[str] = NotProvided, ) -> "AlgorithmConfig": @@ -1005,6 +1011,13 @@ def resources( if placement_strategy is not NotProvided: self.placement_strategy = placement_strategy + if num_trainer_workers is not NotProvided: + 
self.num_trainer_workers = num_trainer_workers + if num_cpus_per_trainer_worker is not NotProvided: + self.num_cpus_per_trainer_worker = num_cpus_per_trainer_worker + if num_gpus_per_trainer_worker is not NotProvided: + self.num_gpus_per_trainer_worker = num_gpus_per_trainer_worker + return self def framework( @@ -2637,7 +2650,11 @@ def get_trainer_runner_config( # TODO (Kourosh): optimizer config can now be more complicated. optimizer_config={"lr": self.lr}, ) - .resources(num_gpus=self.num_gpus, fake_gpus=self._fake_gpus) + .resources( + num_trainer_workers=self.num_trainer_workers, + num_cpus_per_trainer_worker=self.num_cpus_per_trainer_worker, + num_gpus_per_trainer_worker=self.num_gpus_per_trainer_worker, + ) .algorithm(algorithm_config=self) ) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 1a841fb4ca17..69ff7684aaf8 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -14,7 +14,6 @@ Tuple, Type, Union, - TYPE_CHECKING, ) from ray.rllib.utils.framework import try_import_tf, try_import_torch diff --git a/rllib/core/rl_trainer/rl_trainer_config.py b/rllib/core/rl_trainer/rl_trainer_config.py index 9c5632dbb98f..ae5ee6b2dd1e 100644 --- a/rllib/core/rl_trainer/rl_trainer_config.py +++ b/rllib/core/rl_trainer/rl_trainer_config.py @@ -1,8 +1,8 @@ -import abc from dataclasses import dataclass, field from typing import Any, Dict, Optional, Type, Union, TYPE_CHECKING from ray.rllib.utils.params import Hyperparams +from ray.rllib.core.rl_module.torch import TorchRLModule if TYPE_CHECKING: from ray.rllib.core.rl_module.rl_module import RLModule, SingleAgentRLModuleSpec @@ -56,7 +56,8 @@ def set_use_gpu(self, use_gpu: bool) -> "TorchRLTrainerScalingConfig": """Set the use_gpu flag. _use_gpu attribute should not be set directly at the time of constuction, - the caller should explicitly decide whether the torch rl_trainer should be using gpu or not + the caller should explicitly decide whether the torch rl_trainer should be + using gpu or not Args: use_gpu: If True, the rl trainer will be setup to use the gpu. @@ -77,16 +78,17 @@ class TrainerRunnerScalingConfig: """Configuratiom for scaling training actors. Attributes: - local: If True, create a trainer in the current process. This is useful for - debugging to be able to use breakpoints. If False, the trainers are created - as Ray actors. - num_workers: The number of workers to use for training. - num_cpus_per_worker: The number of CPUs to allocate per worker. - num_gpus_per_worker: The number of GPUs to allocate per worker. + num_workers: The number of workers to use for training. num_workers=0 means you + have only one local worker (either on 1 CPU or 1 GPU) + num_cpus_per_worker: The number of CPUs to allocate per worker. If + num_workers=0 and num_gpus_per_worker=0, regardless of this value, the + training will run on a single CPU. + num_gpus_per_worker: The number of GPUs to allocate per worker. If + num_workers=0, any number greater than 0 will run the training on a single + GPU. A value of zero will run the training on a single CPU. 
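At the user-facing level these knobs surface through the new AlgorithmConfig.resources() arguments added above; a minimal sketch of how they might be set (PPOConfig is only an illustrative AlgorithmConfig subclass, not something this patch touches):

from ray.rllib.algorithms.ppo import PPOConfig

config = PPOConfig().resources(
    num_trainer_workers=2,           # two remote RLTrainer workers
    num_gpus_per_trainer_worker=1,   # one GPU each -> 2-GPU data-parallel training
    num_cpus_per_trainer_worker=1,
)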
""" - local: bool = True - num_workers: int = 1 + num_workers: int = 0 num_cpus_per_worker: int = 1 num_gpus_per_worker: int = 0 @@ -99,7 +101,7 @@ class RLTrainerSpec: module_spec: Union["SingleAgentRLModuleSpec", "MultiAgentRLModuleSpec"] = None # Alternatively the RLModule instance can be passed in directly (won't work if # RLTrainer is an actor) - module: Optional["RLModule"] = (None,) + module: Optional["RLModule"] = None # The scaling config for properly distributing the RLModule scaling_config: "RLTrainerScalingConfig" = None # The optimizer setting to apply during training @@ -110,9 +112,22 @@ class RLTrainerSpec: trainer_hyperparameters: HyperparamType = field(default_factory=dict) def __post_init__(self): - if isinstance(self.trainer_hyperparameters, abc.Mapping): + if isinstance(self.trainer_hyperparameters, dict): self.trainer_hyperparameters = Hyperparams(self.trainer_hyperparameters) + if self.scaling_config is None: + if self.module is not None: + if isinstance(self.module, TorchRLModule): + self.scaling_config = TorchRLTrainerScalingConfig() + else: + self.scaling_config = TFRLTrainerScalingConfig() + + if self.module_spec is not None: + if issubclass(self.module_spec.module_class, TorchRLModule): + self.scaling_config = TorchRLTrainerScalingConfig() + else: + self.scaling_config = TFRLTrainerScalingConfig() + def get_params_dict(self) -> Dict[str, Any]: return { "module": self.module, diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 986c313f8478..e9c98149e4ba 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -9,7 +9,6 @@ Dict, Sequence, Hashable, - TYPE_CHECKING, ) from ray.rllib.core.rl_trainer.rl_trainer import ( @@ -37,10 +36,6 @@ from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.core.rl_trainer.rl_trainer_config import TFRLTrainerScalingConfig -if TYPE_CHECKING: - from ray.air.config import ScalingConfig - from ray.rllib.algorithms.algorithm_config import AlgorithmConfig - tf1, tf, tfv = try_import_tf() diff --git a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py index f6d5fb0099a5..9405526bf56a 100644 --- a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py @@ -14,8 +14,6 @@ from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.core.rl_trainer.rl_trainer_config import TorchRLTrainerScalingConfig -from ray.air.config import ScalingConfig - def _get_trainer(distributed: bool = False) -> RLTrainer: env = gym.make("CartPole-v1") diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 06e59a155609..cb3688ca5708 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -8,7 +8,6 @@ Hashable, Optional, Callable, - TYPE_CHECKING, ) from ray.rllib.core.rl_module.rl_module import ( @@ -41,7 +40,6 @@ torch, nn = try_import_torch() if torch: - from ray.air.config import ScalingConfig from ray.train.torch.train_loop_utils import _TorchAccelerator diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 2242f1ddd954..7dccc76531d2 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -7,8 +7,6 @@ Optional, Callable, Dict, - TYPE_CHECKING, - cast, ) import ray @@ -28,12 +26,9 @@ from 
ray.rllib.policy.sample_batch import MultiAgentBatch -from ray.air.config import ScalingConfig from ray.train._internal.backend_executor import BackendExecutor from ray.rllib.core.rl_trainer.rl_trainer_config import ( - TorchRLTrainerScalingConfig, - TFRLTrainerScalingConfig, TrainerRunnerScalingConfig, ) @@ -68,9 +63,17 @@ def __init__( scaling_config: Optional[TrainerRunnerScalingConfig] = None, ): scaling_config = scaling_config or TrainerRunnerScalingConfig() - rl_trainer_class = rl_trainer_spec.trainer_class + rl_trainer_class = rl_trainer_spec.rl_trainer_class - self._is_local = scaling_config.local + # setup wether the worker should use gpu or not + if rl_trainer_class.framework == "torch": + trainer_should_use_gpu = scaling_config.num_gpus_per_worker > 0 + rl_trainer_spec.scaling_config.set_use_gpu(trainer_should_use_gpu) + else: + # TODO (Avnish) How do I run TF on one GPU? + pass + + self._is_local = scaling_config.num_workers == 0 if self._is_local: # in local mode the trainer is always not distributed rl_trainer_spec.scaling_config.set_distributed(False) @@ -86,20 +89,10 @@ def __init__( from ray.train.torch import TorchConfig backend_config = TorchConfig() - trainer_scaling_config = cast( - TorchRLTrainerScalingConfig, rl_trainer_spec.scaling_config - ) - - trainer_should_use_gpu = scaling_config.num_gpus_per_worker > 0 - trainer_scaling_config.set_use_gpu(trainer_should_use_gpu) - elif rl_trainer_class.framework == "tf": from ray.train.tensorflow import TensorflowConfig backend_config = TensorflowConfig() - trainer_scaling_config = cast( - TFRLTrainerScalingConfig, rl_trainer_spec.scaling_config - ) else: raise ValueError("framework must be either torch or tf") diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index 0af4eb53a6f8..e0de9f99c03d 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -1,10 +1,13 @@ from typing import Type, Optional, TYPE_CHECKING, Union, Dict -from dataclasses import dataclass from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.utils.from_config import NotProvided from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner +from ray.rllib.core.rl_trainer.rl_trainer_config import ( + RLTrainerSpec, + TrainerRunnerScalingConfig, +) if TYPE_CHECKING: from ray.rllib.algorithms.algorithm_config import AlgorithmConfig @@ -13,25 +16,6 @@ ModuleSpec = Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] -@dataclass -class TrainerRunnerScalingConfig: - """Configuratiom for scaling training actors. - - Attributes: - local: If True, create a trainer in the current process. This is useful for - debugging to be able to use breakpoints. If False, the trainers are created - as Ray actors. - num_workers: The number of workers to use for training. - num_cpus_per_worker: The number of CPUs to allocate per worker. - num_gpus_per_worker: The number of GPUs to allocate per worker. - """ - - local: bool = True - num_workers: int = 1 - num_cpus_per_worker: int = 1 - num_gpus_per_worker: int = 0 - - # TODO (Kourosh): We should make all configs come from a standard base class that # defines the general interfaces for validation, from_dict, to_dict etc. 
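For orientation, the consolidated scaling config imported above (as of this commit it lives in rl_trainer_config.py) covers three deployment modes; a sketch assuming the dataclass defaults shown earlier:

from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerRunnerScalingConfig

local_cpu = TrainerRunnerScalingConfig()                                      # num_workers=0 -> one local worker on CPU
local_gpu = TrainerRunnerScalingConfig(num_gpus_per_worker=1)                 # still local, but on a single GPU
multi_gpu = TrainerRunnerScalingConfig(num_workers=2, num_gpus_per_worker=1)  # two remote workers, one GPU each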
class TrainerRunnerConfig: @@ -51,8 +35,9 @@ def __init__(self, cls: Type[TrainerRunner] = None) -> None: self.optimizer_config = None # `self.resources()` - self.num_gpus = 0 - self.fake_gpus = False + self.num_gpus_per_trainer_worker = 0 + self.num_cpus_per_trainer_worker = 1 + self.num_trainer_workers = 1 # `self.algorithm()` self.algorithm_config = None @@ -82,34 +67,21 @@ def validate(self) -> None: # TODO (Kourosh): Change the optimizer config to a dataclass object. self.optimizer_config = {"lr": 1e-3} - if self.fake_gpus and self.num_gpus <= 0: - raise ValueError("If fake_gpus is True, num_gpus must be greater than 0.") - def build(self) -> TrainerRunner: self.validate() - # If the module class is a multi agent class it will override the default - # MultiAgentRLModule class. otherwise, it will be a single agent wrapped with - # mutliagent - # TODO (Kourosh): What should be scaling_config? it's not clear what - # should be passed in as trainer_config and what will be inferred - return self.trainer_runner_class( - trainer_class=self.trainer_class, - trainer_config={ - "module_spec": self.module_spec, - # TODO (Kourosh): should this be inferred inside the constructor? - "distributed": self.num_gpus > 1, - # TODO (Avnish): add this - # "enable_tf_function": self.eager_tracing, - "optimizer_config": self.optimizer_config, - "algorithm_config": self.algorithm_config, - }, - compute_config={ - "num_gpus": self.num_gpus, - # TODO (Avnish): add this - # "fake_gpus": self.fake_gpus, - }, + rl_trainer_spec = RLTrainerSpec( + rl_trainer_class=self.trainer_class, + module_spec=self.module_spec, + optimizer_config=self.optimizer_config, + trainer_hyperparameters=self.algorithm_config, + ) + scaling_config = TrainerRunnerScalingConfig( + num_workers=self.num_trainer_workers, + num_gpus_per_worker=self.num_gpus_per_trainer_worker, + num_cpus_per_worker=self.num_cpus_per_trainer_worker, ) + return self.trainer_runner_class(rl_trainer_spec, scaling_config) def algorithm( self, algorithm_config: Optional["AlgorithmConfig"] = NotProvided @@ -130,14 +102,17 @@ def module( def resources( self, - num_gpus: Optional[Union[float, int]] = NotProvided, - fake_gpus: Optional[bool] = NotProvided, + num_trainer_workers: Optional[int] = NotProvided, + num_gpus_per_trainer_worker: Optional[Union[float, int]] = NotProvided, + num_cpus_per_trainer_worker: Optional[Union[float, int]] = NotProvided, ) -> "TrainerRunnerConfig": - if num_gpus is not NotProvided: - self.num_gpus = num_gpus - if fake_gpus is not NotProvided: - self.fake_gpus = fake_gpus + if num_trainer_workers is not NotProvided: + self.num_trainer_workers = num_trainer_workers + if num_gpus_per_trainer_worker is not NotProvided: + self.num_gpus_per_trainer_worker = num_gpus_per_trainer_worker + if num_cpus_per_trainer_worker is not NotProvided: + self.num_cpus_per_trainer_worker = num_cpus_per_trainer_worker return self From 477795d54f31a78b30e9fe05d1fba7e48c970c85 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 09:37:03 -0800 Subject: [PATCH 070/112] tested the multigpu Signed-off-by: Kourosh Hakhamaneshi --- .../core/rl_trainer/tests/test_trainer_runner.py | 15 +++++++++++---- rllib/core/testing/utils.py | 13 +++++++++---- rllib/utils/params.py | 8 ++++---- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 5743c891b4c4..3b1690697584 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ 
b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -5,6 +5,7 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import get_cartpole_dataset_reader +from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerRunnerScalingConfig from ray.rllib.core.testing.utils import ( get_trainer_runner, add_module_to_runner_or_trainer, @@ -18,11 +19,11 @@ class TestTrainerRunner(unittest.TestCase): # multi-node multi-gpu, etc. @classmethod - def setUp(cls) -> None: + def setUpClass(cls) -> None: ray.init() @classmethod - def tearDown(cls) -> None: + def tearDownClass(cls) -> None: ray.shutdown() def test_update_multigpu(self): @@ -32,7 +33,10 @@ def test_update_multigpu(self): ray.init(ignore_reinit_error=True) print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") - runner = get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) + scaling_config = TrainerRunnerScalingConfig( + num_workers=2, num_gpus_per_worker=1 + ) + runner = get_trainer_runner(fw, env, scaling_config) reader = get_cartpole_dataset_reader(batch_size=500) min_loss = float("inf") @@ -64,7 +68,10 @@ def test_add_remove_module(self): ray.init(ignore_reinit_error=True) print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") - runner = get_trainer_runner(fw, env, compute_config=dict(num_gpus=2)) + scaling_config = TrainerRunnerScalingConfig( + num_workers=2, num_gpus_per_worker=1 + ) + runner = get_trainer_runner(fw, env, scaling_config) reader = get_cartpole_dataset_reader(batch_size=500) batch = reader.next() diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index bd96492bac50..83fe5292677f 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -4,6 +4,10 @@ from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner +from ray.rllib.core.rl_trainer.rl_trainer_config import ( + RLTrainerSpec, + TrainerRunnerScalingConfig, +) from ray.rllib.core.rl_module.marl_module import ( MultiAgentRLModuleSpec, @@ -101,17 +105,18 @@ def get_rl_trainer( def get_trainer_runner( framework: str, env: "gym.Env", - compute_config: dict, + trainer_runner_scaling_config: TrainerRunnerScalingConfig, is_multi_agent: bool = False, ) -> TrainerRunner: - trainer_class = get_trainer_class(framework) - trainer_cfg = dict( + + rl_trainer_spec = RLTrainerSpec( + rl_trainer_class=get_trainer_class(framework), module_spec=get_module_spec( framework=framework, env=env, is_multi_agent=is_multi_agent ), optimizer_config={"lr": 0.1}, ) - runner = TrainerRunner(trainer_class, trainer_cfg, compute_config=compute_config) + runner = TrainerRunner(rl_trainer_spec, trainer_runner_scaling_config) return runner diff --git a/rllib/utils/params.py b/rllib/utils/params.py index 0b6e97205557..4574d4bdd61f 100644 --- a/rllib/utils/params.py +++ b/rllib/utils/params.py @@ -6,7 +6,7 @@ class Hyperparams(dict): """This is an extention of the dict class that allows access via `.` notation.""" def __getattr__(self, key): - return self[key] - - def __setattr__(self, key, value): - self[key] = value + if key in self: + return self[key] + else: + return super().__getattr__(key) From 58fe5dfca392fa30ea1636addce314cb8af4ddcb Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 09:45:27 -0800 Subject: [PATCH 071/112] docstring updated Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm_config.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git 
a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index c9cb04158171..99976f9206d7 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -971,6 +971,22 @@ def resources( fractional. This is usually needed only if your env itself requires a GPU (i.e., it is a GPU-intensive video game), or model inference is unusually expensive. + num_trainer_workers: The number of workers to use for training. + num_workers=0 means you have only one local worker (either on 1 CPU or + 1 GPU which depends on the value and `num_gpus_per_trainer_worker`). + For multi-gpu training you have to set the number of workers to + something greater than 0, and set the number of gpus per worker + accordingly. For example, if a cluster has 4 GPUs total, and my model + needs 2 GPUs to fit, I can set num_train_workers=2 and + num_gpus_per_trainer_worker=2 to train on 2 GPUs per worker. + num_cpus_per_trainer_worker: The number of CPUs to allocate per worker. If + num_trainer_workers=0, num_gpus_per_trainer_worker = 0 , the training + will run on a single CPU and if num_gpus_per_trainer_worker > 0 it will + run on a single GPU (i.e. this value will be ignored) + num_gpus_per_trainer_worker: The number of GPUs to allocate per worker. If + num_workers=0, any number greater than 0 will run the training on a + single GPU. A value of zero will run the training on a single + CPU. custom_resources_per_worker: Any custom Ray resources to allocate per worker. num_cpus_for_local_worker: Number of CPUs to allocate for the algorithm. From d5bcd3b3b5980612a8d58e2a1a5607d597231fbb Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 10:07:13 -0800 Subject: [PATCH 072/112] updated the docstring Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm_config.py | 30 +++++++++++++--------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 99976f9206d7..8552f9a39bbd 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -971,22 +971,20 @@ def resources( fractional. This is usually needed only if your env itself requires a GPU (i.e., it is a GPU-intensive video game), or model inference is unusually expensive. - num_trainer_workers: The number of workers to use for training. - num_workers=0 means you have only one local worker (either on 1 CPU or - 1 GPU which depends on the value and `num_gpus_per_trainer_worker`). - For multi-gpu training you have to set the number of workers to - something greater than 0, and set the number of gpus per worker - accordingly. For example, if a cluster has 4 GPUs total, and my model - needs 2 GPUs to fit, I can set num_train_workers=2 and - num_gpus_per_trainer_worker=2 to train on 2 GPUs per worker. - num_cpus_per_trainer_worker: The number of CPUs to allocate per worker. If - num_trainer_workers=0, num_gpus_per_trainer_worker = 0 , the training - will run on a single CPU and if num_gpus_per_trainer_worker > 0 it will - run on a single GPU (i.e. this value will be ignored) - num_gpus_per_trainer_worker: The number of GPUs to allocate per worker. If - num_workers=0, any number greater than 0 will run the training on a - single GPU. A value of zero will run the training on a single - CPU. + num_trainer_workers: Number of workers used for training. 
A value of 0 + means training will take place on a local worker on head node CPUs or 1 + GPU (determined by `num_gpus_per_trainer_worker`). For multi-gpu + training, set number of workers greater than 1 and set + `num_gpus_per_trainer_worker` accordingly (e.g. 4 GPUs total, and model + needs 2 GPUs: `num_trainer_workers = 2` and + `num_gpus_per_trainer_worker = 2`) + num_cpus_per_trainer_worker: Number of CPUs allocated per trainer worker. + Only necessary for custom processing pipeline inside each RLTrainer + requiring multiple CPU cores. Ignored if `num_trainer_workers = 0`. + num_gpus_per_trainer_worker: Number of GPUs allocated per worker. If + `num_trainer_workers = 0`, any value greater than 0 will run the + training on a single GPU on the head node, while a value of 0 will run + the training on head node CPU cores. custom_resources_per_worker: Any custom Ray resources to allocate per worker. num_cpus_for_local_worker: Number of CPUs to allocate for the algorithm. From d8841d12eaf1186e7e200662ddf88c5e78f9fa26 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 16:33:20 -0800 Subject: [PATCH 073/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer_config.py | 70 ++++++++++++++----- .../core/rl_trainer/tests/test_rl_trainer.py | 4 +- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 6 +- rllib/core/rl_trainer/trainer_runner.py | 4 +- 4 files changed, 59 insertions(+), 25 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer_config.py b/rllib/core/rl_trainer/rl_trainer_config.py index ae5ee6b2dd1e..0bdd8b235e75 100644 --- a/rllib/core/rl_trainer/rl_trainer_config.py +++ b/rllib/core/rl_trainer/rl_trainer_config.py @@ -16,9 +16,20 @@ @dataclass class RLTrainerScalingConfig: - """Base class for scaling config relevant to RLTrainer.""" + """Base class for scaling config relevant to RLTrainer. + + Attributes: + distributed: If True, the rl_trainer will be instantiated in distributed mode. + + Methods: + set_distributed: Set the distributed flag. _distibuted attribute should not be + set to True at the time of constructing the config. The caller should + explicitly decide whether the rl_trainer should be instiantiated in + distributed mode or not. + """ def __post_init__(self): + super().__post_init__() self._distributed: bool = False @property @@ -42,7 +53,16 @@ def set_distributed(self, distributed: bool) -> "RLTrainerScalingConfig": @dataclass class TorchRLTrainerScalingConfig(RLTrainerScalingConfig): - """Torch-specific scaling config relevant to TorchRLTrainer.""" + """Torch-specific scaling config relevant to TorchRLTrainer. + + Attributes: + use_gpu: If True, the torch rl_trainer will be setup to use the gpu. + + Methods: + set_use_gpu: Set the use_gpu flag. _use_gpu attribute should not be set to True + at the time of constructing the config. The caller should explicitly decide + whether the torch rl_trainer should be using gpu or not. + """ def __post_init__(self): super().__post_init__() @@ -67,8 +87,14 @@ def set_use_gpu(self, use_gpu: bool) -> "TorchRLTrainerScalingConfig": @dataclass -class TFRLTrainerScalingConfig(RLTrainerScalingConfig): - """Place holder for TF-specific scaling config relevant to TFRLTrainer.""" +class TfRLTrainerScalingConfig(RLTrainerScalingConfig): + """Tf-specific scaling config relevant to TFRLTrainer. + + Args: + enable_tf_function: If True, the tf.function decorator will be used to + decorate the train_step function. This is recommended to boost performance + via tracing the graph. 
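The enable_tf_function flag described above amounts to wrapping the trainer's update step in tf.function (the TfRLTrainer does exactly that further below); a generic TensorFlow illustration of the tracing it enables, unrelated to any RLlib API:

import tensorflow as tf

def update_step(x):
    return tf.reduce_sum(x * x)

traced = tf.function(update_step)      # traced into a graph on first call
print(traced(tf.ones((4,))).numpy())   # usually faster than eager execution in tight update loops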
+ """ enable_tf_function: bool = True @@ -95,20 +121,28 @@ class TrainerRunnerScalingConfig: @dataclass class RLTrainerSpec: - # The RLTrainer class to use. - rl_trainer_class: Type["RLTrainer"] = None - # The underlying (MA)RLModule spec to completely define the module + """The spec for construcitng RLTrainer actors. + + Args: + rl_trainer_class: The RLTrainer class to use. + module_spec: The underlying (MA)RLModule spec to completely define the module. + module: Alternatively the RLModule instance can be passed in directly. This + only works if the RLTrainer is not an actor. + scaling_config: The scaling config for properly distributing the RLModule. + optimizer_config: The optimizer setting to apply during training. + trainer_hyperparameters: The extra config for the loss/additional update. The + items within this object should be accessible via a dot notation. For + example, if the trainer_hyperparameters contains {"coeff": 0.001}, then the + learning rate can be accessed via trainer_hyperparameters.coeff. This is + useful for passing in algorithm config or a HyperParams that contains the + hyper-parameters. + """ + + rl_trainer_class: Type["RLTrainer"] module_spec: Union["SingleAgentRLModuleSpec", "MultiAgentRLModuleSpec"] = None - # Alternatively the RLModule instance can be passed in directly (won't work if - # RLTrainer is an actor) module: Optional["RLModule"] = None - # The scaling config for properly distributing the RLModule scaling_config: "RLTrainerScalingConfig" = None - # The optimizer setting to apply during training optimizer_config: Dict[str, Any] = field(default_factory=dict) - # The extra config for the loss/additional update specific hyper-parameters - # for now we assume we can get both algorithm config or a dict that contains the - # hyper-parameters trainer_hyperparameters: HyperparamType = field(default_factory=dict) def __post_init__(self): @@ -120,15 +154,16 @@ def __post_init__(self): if isinstance(self.module, TorchRLModule): self.scaling_config = TorchRLTrainerScalingConfig() else: - self.scaling_config = TFRLTrainerScalingConfig() + self.scaling_config = TfRLTrainerScalingConfig() if self.module_spec is not None: if issubclass(self.module_spec.module_class, TorchRLModule): self.scaling_config = TorchRLTrainerScalingConfig() else: - self.scaling_config = TFRLTrainerScalingConfig() + self.scaling_config = TfRLTrainerScalingConfig() def get_params_dict(self) -> Dict[str, Any]: + """Returns the parameters than be passed to the RLTrainer constructor.""" return { "module": self.module, "module_spec": self.module_spec, @@ -137,5 +172,6 @@ def get_params_dict(self) -> Dict[str, Any]: "trainer_hyperparameters": self.trainer_hyperparameters, } - def build(self): + def build(self) -> "RLTrainer": + """Builds the RLTrainer instance.""" return self.rl_trainer_class(**self.get_params_dict()) diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index 5e8b17413165..c61b794edccf 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -11,7 +11,7 @@ from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader -from ray.rllib.core.rl_trainer.rl_trainer_config import TFRLTrainerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import TfRLTrainerScalingConfig def get_trainer(distributed=False) -> RLTrainer: @@ 
-30,7 +30,7 @@ def get_trainer(distributed=False) -> RLTrainer: model_config={"hidden_dim": 32}, ), optimizer_config={"lr": 1e-3}, - scaling_config=TFRLTrainerScalingConfig( + scaling_config=TfRLTrainerScalingConfig( enable_tf_function=False, ).set_distributed(distributed), ) diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index e9c98149e4ba..68288c3937ed 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -34,7 +34,7 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict -from ray.rllib.core.rl_trainer.rl_trainer_config import TFRLTrainerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import TfRLTrainerScalingConfig tf1, tf, tfv = try_import_tf() @@ -97,7 +97,7 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - scaling_config: Optional[TFRLTrainerScalingConfig] = None, + scaling_config: Optional[TfRLTrainerScalingConfig] = None, trainer_hyperparameters: Optional[HyperparamType] = None, ): super().__init__( @@ -116,7 +116,7 @@ def __init__( # does not mention this as a requirement? tf1.enable_eager_execution() - scaling_config = scaling_config or TFRLTrainerScalingConfig() + scaling_config = scaling_config or TfRLTrainerScalingConfig() self._enable_tf_function = scaling_config.enable_tf_function if self._enable_tf_function: self._update_fn = tf.function(self._do_update_fn) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 7dccc76531d2..e4d3e00a28a6 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -28,9 +28,7 @@ from ray.train._internal.backend_executor import BackendExecutor -from ray.rllib.core.rl_trainer.rl_trainer_config import ( - TrainerRunnerScalingConfig, -) +from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerRunnerScalingConfig class TrainerRunner: From 026899ea348e7c7b5c0f378d2e831d441f5c6046 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 16:51:03 -0800 Subject: [PATCH 074/112] renamed the classes and variables to backend Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 8 ++--- rllib/core/rl_trainer/rl_trainer_config.py | 30 ++++++++++--------- .../core/rl_trainer/tests/test_rl_trainer.py | 9 +++--- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 10 +++---- .../torch/tests/test_torch_rl_trainer.py | 9 +++--- .../core/rl_trainer/torch/torch_rl_trainer.py | 14 ++++----- 6 files changed, 41 insertions(+), 39 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 69ff7684aaf8..40881f0a4071 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -31,7 +31,7 @@ from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import TensorType from ray.rllib.core.rl_trainer.rl_trainer_config import ( - RLTrainerScalingConfig, + RLModuleBackendConfig, HyperparamType, ) @@ -118,7 +118,7 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - scaling_config: Optional[RLTrainerScalingConfig] = None, + module_backend_config: Optional[RLModuleBackendConfig] = None, trainer_hyperparameters: Optional[HyperparamType] = None, ): # TODO (Kourosh): Having the entire algorithm_config inside trainer may not be @@ 
-142,8 +142,8 @@ def __init__( self.config = trainer_hyperparameters # pick the configs that we need for the trainer from scaling config - scaling_config = scaling_config or RLTrainerScalingConfig() - self._distributed = scaling_config.distributed + module_backend_config = module_backend_config or RLModuleBackendConfig() + self._distributed = module_backend_config.distributed # These are the attributes that are set during build self._module: MultiAgentRLModule = None diff --git a/rllib/core/rl_trainer/rl_trainer_config.py b/rllib/core/rl_trainer/rl_trainer_config.py index 0bdd8b235e75..da5ef37654d4 100644 --- a/rllib/core/rl_trainer/rl_trainer_config.py +++ b/rllib/core/rl_trainer/rl_trainer_config.py @@ -15,7 +15,7 @@ @dataclass -class RLTrainerScalingConfig: +class RLModuleBackendConfig: """Base class for scaling config relevant to RLTrainer. Attributes: @@ -36,7 +36,7 @@ def __post_init__(self): def distributed(self) -> bool: return self._distributed - def set_distributed(self, distributed: bool) -> "RLTrainerScalingConfig": + def set_distributed(self, distributed: bool) -> "RLModuleBackendConfig": """Set the distributed flag. _distibuted attribute should not be set directly at the time of constuction, @@ -52,7 +52,7 @@ def set_distributed(self, distributed: bool) -> "RLTrainerScalingConfig": @dataclass -class TorchRLTrainerScalingConfig(RLTrainerScalingConfig): +class TorchRLModuleBackendConfig(RLModuleBackendConfig): """Torch-specific scaling config relevant to TorchRLTrainer. Attributes: @@ -72,7 +72,7 @@ def __post_init__(self): def use_gpu(self) -> bool: return self._use_gpu - def set_use_gpu(self, use_gpu: bool) -> "TorchRLTrainerScalingConfig": + def set_use_gpu(self, use_gpu: bool) -> "TorchRLModuleBackendConfig": """Set the use_gpu flag. _use_gpu attribute should not be set directly at the time of constuction, @@ -87,7 +87,7 @@ def set_use_gpu(self, use_gpu: bool) -> "TorchRLTrainerScalingConfig": @dataclass -class TfRLTrainerScalingConfig(RLTrainerScalingConfig): +class TfRLModuleBackendConfig(RLModuleBackendConfig): """Tf-specific scaling config relevant to TFRLTrainer. Args: @@ -100,7 +100,7 @@ class TfRLTrainerScalingConfig(RLTrainerScalingConfig): @dataclass -class TrainerRunnerScalingConfig: +class TrainerScalingConfig: """Configuratiom for scaling training actors. Attributes: @@ -128,7 +128,7 @@ class RLTrainerSpec: module_spec: The underlying (MA)RLModule spec to completely define the module. module: Alternatively the RLModule instance can be passed in directly. This only works if the RLTrainer is not an actor. - scaling_config: The scaling config for properly distributing the RLModule. + backend_config: The backend config for properly distributing the RLModule. optimizer_config: The optimizer setting to apply during training. trainer_hyperparameters: The extra config for the loss/additional update. The items within this object should be accessible via a dot notation. 
For @@ -141,33 +141,35 @@ class RLTrainerSpec: rl_trainer_class: Type["RLTrainer"] module_spec: Union["SingleAgentRLModuleSpec", "MultiAgentRLModuleSpec"] = None module: Optional["RLModule"] = None - scaling_config: "RLTrainerScalingConfig" = None + module_backend_config: "RLModuleBackendConfig" = None optimizer_config: Dict[str, Any] = field(default_factory=dict) trainer_hyperparameters: HyperparamType = field(default_factory=dict) def __post_init__(self): + # convert to hyper params object if needed if isinstance(self.trainer_hyperparameters, dict): self.trainer_hyperparameters = Hyperparams(self.trainer_hyperparameters) - if self.scaling_config is None: + # if module_backend_config is not set, we will create a dafault. + if self.module_backend_config is None: if self.module is not None: if isinstance(self.module, TorchRLModule): - self.scaling_config = TorchRLTrainerScalingConfig() + self.module_backend_config = TorchRLModuleBackendConfig() else: - self.scaling_config = TfRLTrainerScalingConfig() + self.module_backend_config = TfRLModuleBackendConfig() if self.module_spec is not None: if issubclass(self.module_spec.module_class, TorchRLModule): - self.scaling_config = TorchRLTrainerScalingConfig() + self.module_backend_config = TorchRLModuleBackendConfig() else: - self.scaling_config = TfRLTrainerScalingConfig() + self.module_backend_config = TfRLModuleBackendConfig() def get_params_dict(self) -> Dict[str, Any]: """Returns the parameters than be passed to the RLTrainer constructor.""" return { "module": self.module, "module_spec": self.module_spec, - "scaling_config": self.scaling_config, + "scaling_config": self.module_backend_config, "optimizer_config": self.optimizer_config, "trainer_hyperparameters": self.trainer_hyperparameters, } diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index c61b794edccf..f5f5e5d23f28 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -11,7 +11,7 @@ from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader -from ray.rllib.core.rl_trainer.rl_trainer_config import TfRLTrainerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import TfRLModuleBackendConfig def get_trainer(distributed=False) -> RLTrainer: @@ -22,6 +22,9 @@ def get_trainer(distributed=False) -> RLTrainer: # out the serialization of RLModules we can simply pass the module the trainer # and internally it will serialize and deserialize the module for distributed # construction. 
+ backend = TfRLModuleBackendConfig(enable_tf_function=False).set_distributed( + distributed + ) trainer = BCTfRLTrainer( module_spec=SingleAgentRLModuleSpec( module_class=DiscreteBCTFModule, @@ -30,9 +33,7 @@ def get_trainer(distributed=False) -> RLTrainer: model_config={"hidden_dim": 32}, ), optimizer_config={"lr": 1e-3}, - scaling_config=TfRLTrainerScalingConfig( - enable_tf_function=False, - ).set_distributed(distributed), + module_backend_config=backend, ) trainer.build() diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 68288c3937ed..25fae930b6ce 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -34,7 +34,7 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict -from ray.rllib.core.rl_trainer.rl_trainer_config import TfRLTrainerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import TfRLModuleBackendConfig tf1, tf, tfv = try_import_tf() @@ -97,14 +97,14 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - scaling_config: Optional[TfRLTrainerScalingConfig] = None, + module_backend_config: Optional[TfRLModuleBackendConfig] = None, trainer_hyperparameters: Optional[HyperparamType] = None, ): super().__init__( module_spec=module_spec, module=module, optimizer_config=optimizer_config, - scaling_config=scaling_config, + module_backend_config=module_backend_config, trainer_hyperparameters=trainer_hyperparameters, ) @@ -116,8 +116,8 @@ def __init__( # does not mention this as a requirement? tf1.enable_eager_execution() - scaling_config = scaling_config or TfRLTrainerScalingConfig() - self._enable_tf_function = scaling_config.enable_tf_function + module_backend_config = module_backend_config or TfRLModuleBackendConfig() + self._enable_tf_function = module_backend_config.enable_tf_function if self._enable_tf_function: self._update_fn = tf.function(self._do_update_fn) else: diff --git a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py index 9405526bf56a..b3b0f0023b53 100644 --- a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py @@ -12,7 +12,7 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.core.rl_trainer.rl_trainer_config import TorchRLTrainerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import TorchRLModuleBackendConfig def _get_trainer(distributed: bool = False) -> RLTrainer: @@ -24,6 +24,9 @@ def _get_trainer(distributed: bool = False) -> RLTrainer: # out the serialization of RLModules we can simply pass the module the trainer # and internally it will serialize and deserialize the module for distributed # construction. 
+ backend = ( + TorchRLModuleBackendConfig().set_distributed(distributed).set_use_gpu(False) + ) trainer = BCTorchRLTrainer( module_spec=SingleAgentRLModuleSpec( module_class=DiscreteBCTorchModule, @@ -31,9 +34,7 @@ def _get_trainer(distributed: bool = False) -> RLTrainer: action_space=env.action_space, model_config={"hidden_dim": 32}, ), - scaling_config=TorchRLTrainerScalingConfig() - .set_distributed(distributed) - .set_use_gpu(False), + module_backend_config=backend, optimizer_config={"lr": 1e-3}, ) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index cb3688ca5708..4ee53a691165 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -28,9 +28,7 @@ HyperparamType, ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule -from ray.rllib.core.rl_trainer.rl_trainer_config import ( - TorchRLTrainerScalingConfig, -) +from ray.rllib.core.rl_trainer.rl_trainer_config import TorchRLModuleBackendConfig from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType @@ -58,22 +56,22 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - scaling_config: Optional[TorchRLTrainerScalingConfig] = None, + module_backend_config: Optional[TorchRLModuleBackendConfig] = None, trainer_hyperparameters: Optional[HyperparamType] = None, ): super().__init__( module_spec=module_spec, module=module, optimizer_config=optimizer_config, - scaling_config=scaling_config, + module_backend_config=module_backend_config, trainer_hyperparameters=trainer_hyperparameters, ) # pick the stuff that we need from the scaling config - scaling_config = scaling_config or TorchRLTrainerScalingConfig() - self._use_gpu = scaling_config.use_gpu + module_backend_config = module_backend_config or TorchRLModuleBackendConfig() + self._use_gpu = module_backend_config.use_gpu - # These attributes are set in the `build` method. 
+ # These attributes are set in the `TorchRLModuleBackendConfig self._device = None @property From f04e99d7441058acc5c7eca462feb9af12f30e63 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 16:53:46 -0800 Subject: [PATCH 075/112] renamed Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/tests/test_trainer_runner.py | 10 +++------- rllib/core/rl_trainer/trainer_runner.py | 6 +++--- rllib/core/rl_trainer/trainer_runner_config.py | 4 ++-- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 3b1690697584..c5b356618f5c 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -5,7 +5,7 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import get_cartpole_dataset_reader -from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerRunnerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerScalingConfig from ray.rllib.core.testing.utils import ( get_trainer_runner, add_module_to_runner_or_trainer, @@ -33,9 +33,7 @@ def test_update_multigpu(self): ray.init(ignore_reinit_error=True) print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") - scaling_config = TrainerRunnerScalingConfig( - num_workers=2, num_gpus_per_worker=1 - ) + scaling_config = TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1) runner = get_trainer_runner(fw, env, scaling_config) reader = get_cartpole_dataset_reader(batch_size=500) @@ -68,9 +66,7 @@ def test_add_remove_module(self): ray.init(ignore_reinit_error=True) print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") - scaling_config = TrainerRunnerScalingConfig( - num_workers=2, num_gpus_per_worker=1 - ) + scaling_config = TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1) runner = get_trainer_runner(fw, env, scaling_config) reader = get_cartpole_dataset_reader(batch_size=500) batch = reader.next() diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index e4d3e00a28a6..673ed52b622f 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -28,7 +28,7 @@ from ray.train._internal.backend_executor import BackendExecutor -from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerRunnerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerScalingConfig class TrainerRunner: @@ -58,9 +58,9 @@ class TrainerRunner: def __init__( self, rl_trainer_spec: RLTrainerSpec, - scaling_config: Optional[TrainerRunnerScalingConfig] = None, + scaling_config: Optional[TrainerScalingConfig] = None, ): - scaling_config = scaling_config or TrainerRunnerScalingConfig() + scaling_config = scaling_config or TrainerScalingConfig() rl_trainer_class = rl_trainer_spec.rl_trainer_class # setup wether the worker should use gpu or not diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index e0de9f99c03d..5ee6d5c6fc03 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -6,7 +6,7 @@ from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner from ray.rllib.core.rl_trainer.rl_trainer_config import ( RLTrainerSpec, - TrainerRunnerScalingConfig, + TrainerScalingConfig, ) if TYPE_CHECKING: @@ -76,7 +76,7 @@ def build(self) -> 
TrainerRunner: optimizer_config=self.optimizer_config, trainer_hyperparameters=self.algorithm_config, ) - scaling_config = TrainerRunnerScalingConfig( + scaling_config = TrainerScalingConfig( num_workers=self.num_trainer_workers, num_gpus_per_worker=self.num_gpus_per_trainer_worker, num_cpus_per_worker=self.num_cpus_per_trainer_worker, From d280887d5fde3f5c7067cc4458f020637e504bb4 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 17:04:02 -0800 Subject: [PATCH 076/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer_config.py | 3 +-- .../core/rl_trainer/tests/test_trainer_runner_local.py | 4 +++- rllib/core/rl_trainer/trainer_runner.py | 10 +++++++--- rllib/core/testing/utils.py | 4 ++-- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer_config.py b/rllib/core/rl_trainer/rl_trainer_config.py index da5ef37654d4..f16b2f797dfe 100644 --- a/rllib/core/rl_trainer/rl_trainer_config.py +++ b/rllib/core/rl_trainer/rl_trainer_config.py @@ -29,7 +29,6 @@ class RLModuleBackendConfig: """ def __post_init__(self): - super().__post_init__() self._distributed: bool = False @property @@ -169,7 +168,7 @@ def get_params_dict(self) -> Dict[str, Any]: return { "module": self.module, "module_spec": self.module_spec, - "scaling_config": self.module_backend_config, + "module_backend_config": self.module_backend_config, "optimizer_config": self.optimizer_config, "trainer_hyperparameters": self.trainer_hyperparameters, } diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py index 9986cf98dd3d..a8b5ca3fa4f6 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py @@ -11,6 +11,7 @@ get_trainer_runner, get_rl_trainer, ) +from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerScalingConfig tf1, tf, tfv = try_import_tf() @@ -33,7 +34,8 @@ def tearDown(cls) -> None: def test_trainer_runner_no_gpus(self): env = gym.make("CartPole-v1") for fw in ["tf", "torch"]: - runner = get_trainer_runner(fw, env, compute_config=dict(num_gpus=0)) + scaling_config = TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0) + runner = get_trainer_runner(fw, env, scaling_config) local_trainer = get_rl_trainer(fw, env) local_trainer.build() diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 673ed52b622f..63609b1c5ff6 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -66,7 +66,7 @@ def __init__( # setup wether the worker should use gpu or not if rl_trainer_class.framework == "torch": trainer_should_use_gpu = scaling_config.num_gpus_per_worker > 0 - rl_trainer_spec.scaling_config.set_use_gpu(trainer_should_use_gpu) + rl_trainer_spec.module_backend_config.set_use_gpu(trainer_should_use_gpu) else: # TODO (Avnish) How do I run TF on one GPU? 
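Putting the refactor together, the intended construction path is roughly the sketch below. This is a best-effort illustration rather than code taken from the patch: import paths reflect this point in the series (RLTrainerSpec and TrainerScalingConfig move again in later commits), the helpers are the ones exercised in the unit tests, and the 2-worker/1-GPU scaling assumes a machine with two GPUs (use TrainerScalingConfig() for a purely local CPU run):

import gymnasium as gym
import ray

from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner
from ray.rllib.core.rl_trainer.rl_trainer_config import RLTrainerSpec, TrainerScalingConfig
from ray.rllib.core.testing.utils import get_module_spec, get_trainer_class
from ray.rllib.utils.test_utils import get_cartpole_dataset_reader

ray.init()
env = gym.make("CartPole-v1")

spec = RLTrainerSpec(
    rl_trainer_class=get_trainer_class("tf"),
    module_spec=get_module_spec(framework="tf", env=env),
    optimizer_config={"lr": 1e-3},
)
runner = TrainerRunner(spec, TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1))

batch = get_cartpole_dataset_reader(batch_size=500).next()
results = runner.update(batch.as_multi_agent())  # one gradient step per worker on its shard of the batch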
pass @@ -74,14 +74,18 @@ def __init__( self._is_local = scaling_config.num_workers == 0 if self._is_local: # in local mode the trainer is always not distributed - rl_trainer_spec.scaling_config.set_distributed(False) + rl_trainer_spec.module_backend_config.set_distributed(False) self._trainer = rl_trainer_class(**rl_trainer_spec.get_params_dict()) self._trainer.build() else: # in remote mode the trainer is distributed only if there are more than 1 # workers is_trainer_distributed = scaling_config.num_workers > 1 - rl_trainer_spec.scaling_config.set_distributed(is_trainer_distributed) + ( + rl_trainer_spec.module_backend_config.set_distributed( + is_trainer_distributed + ) + ) if rl_trainer_class.framework == "torch": from ray.train.torch import TorchConfig diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 83fe5292677f..4fea2e2f14b6 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -6,7 +6,7 @@ from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner from ray.rllib.core.rl_trainer.rl_trainer_config import ( RLTrainerSpec, - TrainerRunnerScalingConfig, + TrainerScalingConfig, ) from ray.rllib.core.rl_module.marl_module import ( @@ -105,7 +105,7 @@ def get_rl_trainer( def get_trainer_runner( framework: str, env: "gym.Env", - trainer_runner_scaling_config: TrainerRunnerScalingConfig, + trainer_runner_scaling_config: TrainerScalingConfig, is_multi_agent: bool = False, ) -> TrainerRunner: From 97d80b1b4002169447b7bff5bf09e6c6b508e887 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 17:30:31 -0800 Subject: [PATCH 077/112] refactor Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 77 ++++++++++++++++++- ...rl_trainer_config.py => scaling_config.py} | 63 --------------- .../core/rl_trainer/tests/test_rl_trainer.py | 2 +- .../rl_trainer/tests/test_trainer_runner.py | 2 +- .../tests/test_trainer_runner_local.py | 2 +- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 2 +- .../torch/tests/test_torch_rl_trainer.py | 2 +- .../core/rl_trainer/torch/torch_rl_trainer.py | 2 +- rllib/core/rl_trainer/trainer_runner.py | 9 +-- .../core/rl_trainer/trainer_runner_config.py | 9 +-- rllib/core/testing/utils.py | 6 +- 11 files changed, 89 insertions(+), 87 deletions(-) rename rllib/core/rl_trainer/{rl_trainer_config.py => scaling_config.py} (57%) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 40881f0a4071..619f7c3ab1a5 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -1,5 +1,6 @@ import abc +from dataclasses import dataclass, field import logging import numpy as np from typing import ( @@ -14,6 +15,7 @@ Tuple, Type, Union, + TYPE_CHECKING, ) from ray.rllib.utils.framework import try_import_tf, try_import_torch @@ -22,6 +24,9 @@ ModuleID, SingleAgentRLModuleSpec, ) +from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule +from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule + from ray.rllib.core.rl_module.marl_module import ( MultiAgentRLModule, MultiAgentRLModuleSpec, @@ -30,11 +35,18 @@ from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import TensorType -from ray.rllib.core.rl_trainer.rl_trainer_config import ( +from ray.rllib.core.rl_trainer.scaling_config import RLModuleBackendConfig + +from ray.rllib.core.rl_trainer.scaling_config import ( RLModuleBackendConfig, - HyperparamType, + 
TorchRLModuleBackendConfig, + TfRLModuleBackendConfig, ) +if TYPE_CHECKING: + from ray.rllib.utils.params import Hyperparams + from ray.rllib.algorithms import AlgorithmConfig + torch, _ = try_import_torch() tf1, tf, tfv = try_import_tf() @@ -46,6 +58,7 @@ ParamOptimizerPairs = List[Tuple[Sequence[ParamType], Optimizer]] ParamRef = Hashable ParamDictType = Dict[ParamRef, ParamType] +HyperparamType = Union["AlgorithmConfig", Hyperparams] class RLTrainer: @@ -616,3 +629,63 @@ def __check_if_build_called(self): "RLTrainer.build() must be called after constructing a " "RLTrainer and before calling any methods on it." ) + + +@dataclass +class RLTrainerSpec: + """The spec for construcitng RLTrainer actors. + + Args: + rl_trainer_class: The RLTrainer class to use. + module_spec: The underlying (MA)RLModule spec to completely define the module. + module: Alternatively the RLModule instance can be passed in directly. This + only works if the RLTrainer is not an actor. + backend_config: The backend config for properly distributing the RLModule. + optimizer_config: The optimizer setting to apply during training. + trainer_hyperparameters: The extra config for the loss/additional update. The + items within this object should be accessible via a dot notation. For + example, if the trainer_hyperparameters contains {"coeff": 0.001}, then the + learning rate can be accessed via trainer_hyperparameters.coeff. This is + useful for passing in algorithm config or a HyperParams that contains the + hyper-parameters. + """ + + rl_trainer_class: Type["RLTrainer"] + module_spec: Union["SingleAgentRLModuleSpec", "MultiAgentRLModuleSpec"] = None + module: Optional["RLModule"] = None + module_backend_config: "RLModuleBackendConfig" = None + optimizer_config: Dict[str, Any] = field(default_factory=dict) + trainer_hyperparameters: HyperparamType = field(default_factory=dict) + + def __post_init__(self): + # convert to hyper params object if needed + if isinstance(self.trainer_hyperparameters, dict): + self.trainer_hyperparameters = Hyperparams(self.trainer_hyperparameters) + + # if module_backend_config is not set, we will create a dafault. 
+ if self.module_backend_config is None: + if self.module is not None: + if isinstance(self.module, TorchRLModule): + self.module_backend_config = TorchRLModuleBackendConfig() + else: + self.module_backend_config = TfRLModuleBackendConfig() + + if self.module_spec is not None: + if issubclass(self.module_spec.module_class, TorchRLModule): + self.module_backend_config = TorchRLModuleBackendConfig() + else: + self.module_backend_config = TfRLModuleBackendConfig() + + def get_params_dict(self) -> Dict[str, Any]: + """Returns the parameters than be passed to the RLTrainer constructor.""" + return { + "module": self.module, + "module_spec": self.module_spec, + "module_backend_config": self.module_backend_config, + "optimizer_config": self.optimizer_config, + "trainer_hyperparameters": self.trainer_hyperparameters, + } + + def build(self) -> "RLTrainer": + """Builds the RLTrainer instance.""" + return self.rl_trainer_class(**self.get_params_dict()) diff --git a/rllib/core/rl_trainer/rl_trainer_config.py b/rllib/core/rl_trainer/scaling_config.py similarity index 57% rename from rllib/core/rl_trainer/rl_trainer_config.py rename to rllib/core/rl_trainer/scaling_config.py index f16b2f797dfe..c879c2aa26ff 100644 --- a/rllib/core/rl_trainer/rl_trainer_config.py +++ b/rllib/core/rl_trainer/scaling_config.py @@ -11,9 +11,6 @@ from ray.rllib.algorithms.algorithm import AlgorithmConfig -HyperparamType = Union["AlgorithmConfig", Hyperparams] - - @dataclass class RLModuleBackendConfig: """Base class for scaling config relevant to RLTrainer. @@ -116,63 +113,3 @@ class TrainerScalingConfig: num_workers: int = 0 num_cpus_per_worker: int = 1 num_gpus_per_worker: int = 0 - - -@dataclass -class RLTrainerSpec: - """The spec for construcitng RLTrainer actors. - - Args: - rl_trainer_class: The RLTrainer class to use. - module_spec: The underlying (MA)RLModule spec to completely define the module. - module: Alternatively the RLModule instance can be passed in directly. This - only works if the RLTrainer is not an actor. - backend_config: The backend config for properly distributing the RLModule. - optimizer_config: The optimizer setting to apply during training. - trainer_hyperparameters: The extra config for the loss/additional update. The - items within this object should be accessible via a dot notation. For - example, if the trainer_hyperparameters contains {"coeff": 0.001}, then the - learning rate can be accessed via trainer_hyperparameters.coeff. This is - useful for passing in algorithm config or a HyperParams that contains the - hyper-parameters. - """ - - rl_trainer_class: Type["RLTrainer"] - module_spec: Union["SingleAgentRLModuleSpec", "MultiAgentRLModuleSpec"] = None - module: Optional["RLModule"] = None - module_backend_config: "RLModuleBackendConfig" = None - optimizer_config: Dict[str, Any] = field(default_factory=dict) - trainer_hyperparameters: HyperparamType = field(default_factory=dict) - - def __post_init__(self): - # convert to hyper params object if needed - if isinstance(self.trainer_hyperparameters, dict): - self.trainer_hyperparameters = Hyperparams(self.trainer_hyperparameters) - - # if module_backend_config is not set, we will create a dafault. 
- if self.module_backend_config is None: - if self.module is not None: - if isinstance(self.module, TorchRLModule): - self.module_backend_config = TorchRLModuleBackendConfig() - else: - self.module_backend_config = TfRLModuleBackendConfig() - - if self.module_spec is not None: - if issubclass(self.module_spec.module_class, TorchRLModule): - self.module_backend_config = TorchRLModuleBackendConfig() - else: - self.module_backend_config = TfRLModuleBackendConfig() - - def get_params_dict(self) -> Dict[str, Any]: - """Returns the parameters than be passed to the RLTrainer constructor.""" - return { - "module": self.module, - "module_spec": self.module_spec, - "module_backend_config": self.module_backend_config, - "optimizer_config": self.optimizer_config, - "trainer_hyperparameters": self.trainer_hyperparameters, - } - - def build(self) -> "RLTrainer": - """Builds the RLTrainer instance.""" - return self.rl_trainer_class(**self.get_params_dict()) diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index f5f5e5d23f28..472c1848b011 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -11,7 +11,7 @@ from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader -from ray.rllib.core.rl_trainer.rl_trainer_config import TfRLModuleBackendConfig +from ray.rllib.core.rl_trainer.scaling_config import TfRLModuleBackendConfig def get_trainer(distributed=False) -> RLTrainer: diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index c5b356618f5c..51ae555a77b6 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -5,7 +5,7 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import get_cartpole_dataset_reader -from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerScalingConfig +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig from ray.rllib.core.testing.utils import ( get_trainer_runner, add_module_to_runner_or_trainer, diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py index a8b5ca3fa4f6..d1a22010288b 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py @@ -11,7 +11,7 @@ get_trainer_runner, get_rl_trainer, ) -from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerScalingConfig +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig tf1, tf, tfv = try_import_tf() diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 25fae930b6ce..49a2c47f480d 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -34,7 +34,7 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict -from ray.rllib.core.rl_trainer.rl_trainer_config import TfRLModuleBackendConfig +from ray.rllib.core.rl_trainer.scaling_config import TfRLModuleBackendConfig tf1, tf, tfv = try_import_tf() diff --git a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py 
b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py index b3b0f0023b53..a9145fa744fd 100644 --- a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py @@ -12,7 +12,7 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.core.rl_trainer.rl_trainer_config import TorchRLModuleBackendConfig +from ray.rllib.core.rl_trainer.scaling_config import TorchRLModuleBackendConfig def _get_trainer(distributed: bool = False) -> RLTrainer: diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 4ee53a691165..cb234e89c688 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -28,7 +28,7 @@ HyperparamType, ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule -from ray.rllib.core.rl_trainer.rl_trainer_config import TorchRLModuleBackendConfig +from ray.rllib.core.rl_trainer.scaling_config import TorchRLModuleBackendConfig from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 63609b1c5ff6..c9d66a97561d 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -17,19 +17,14 @@ SingleAgentRLModuleSpec, ) from ray.rllib.core.rl_trainer.rl_trainer import ( + RLTrainerSpec, ParamOptimizerPairs, Optimizer, ) -from ray.rllib.core.rl_trainer.rl_trainer_config import RLTrainerSpec - - +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig from ray.rllib.policy.sample_batch import MultiAgentBatch - - from ray.train._internal.backend_executor import BackendExecutor -from ray.rllib.core.rl_trainer.rl_trainer_config import TrainerScalingConfig - class TrainerRunner: """Coordinator of RLTrainers. 
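For readers tracking the relocation of RLTrainerSpec into rl_trainer.py, a minimal usage sketch follows; the concrete trainer and module classes ("MyRLTrainer", "MyRLModule") are placeholders and not part of this patch:

    from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
    from ray.rllib.core.rl_trainer.rl_trainer import RLTrainerSpec

    # "MyRLTrainer" / "MyRLModule" stand in for user-defined RLTrainer / RLModule
    # subclasses; only RLTrainerSpec itself is introduced by this patch.
    spec = RLTrainerSpec(
        rl_trainer_class=MyRLTrainer,
        module_spec=SingleAgentRLModuleSpec(
            module_class=MyRLModule,  # plus observation/action spaces, model config
        ),
        optimizer_config={"lr": 1e-3},
    )
    trainer = spec.build()  # equivalent to MyRLTrainer(**spec.get_params_dict())
    trainer.build()         # RLTrainer.build() must be called before any updates
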
diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index 5ee6d5c6fc03..4a89496ac4e2 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -2,12 +2,11 @@ from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.utils.from_config import NotProvided from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner -from ray.rllib.core.rl_trainer.rl_trainer_config import ( - RLTrainerSpec, - TrainerScalingConfig, -) +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig +from ray.rllib.core.rl_trainer.rl_trainer import RLTrainerSpec +from ray.rllib.utils.from_config import NotProvided + if TYPE_CHECKING: from ray.rllib.algorithms.algorithm_config import AlgorithmConfig diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 4fea2e2f14b6..7333c60e8077 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -4,10 +4,8 @@ from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner -from ray.rllib.core.rl_trainer.rl_trainer_config import ( - RLTrainerSpec, - TrainerScalingConfig, -) +from ray.rllib.core.rl_trainer.rl_trainer import RLTrainerSpec +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig from ray.rllib.core.rl_module.marl_module import ( MultiAgentRLModuleSpec, From 85387e507b3962e9e56bdf950a62de7036e3089e Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jan 2023 17:37:04 -0800 Subject: [PATCH 078/112] lin Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 4 +--- rllib/core/rl_trainer/scaling_config.py | 12 +----------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 619f7c3ab1a5..fac12501c867 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -34,9 +34,8 @@ from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.params import Hyperparams from ray.rllib.utils.typing import TensorType -from ray.rllib.core.rl_trainer.scaling_config import RLModuleBackendConfig - from ray.rllib.core.rl_trainer.scaling_config import ( RLModuleBackendConfig, TorchRLModuleBackendConfig, @@ -44,7 +43,6 @@ ) if TYPE_CHECKING: - from ray.rllib.utils.params import Hyperparams from ray.rllib.algorithms import AlgorithmConfig diff --git a/rllib/core/rl_trainer/scaling_config.py b/rllib/core/rl_trainer/scaling_config.py index c879c2aa26ff..cc892222b316 100644 --- a/rllib/core/rl_trainer/scaling_config.py +++ b/rllib/core/rl_trainer/scaling_config.py @@ -1,14 +1,4 @@ -from dataclasses import dataclass, field -from typing import Any, Dict, Optional, Type, Union, TYPE_CHECKING - -from ray.rllib.utils.params import Hyperparams -from ray.rllib.core.rl_module.torch import TorchRLModule - -if TYPE_CHECKING: - from ray.rllib.core.rl_module.rl_module import RLModule, SingleAgentRLModuleSpec - from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec - from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer - from ray.rllib.algorithms.algorithm import AlgorithmConfig +from dataclasses import dataclass @dataclass From 
1a70b6e452f569ac1f2072547c9b72aad00d2f1a Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sat, 28 Jan 2023 17:58:39 -0800 Subject: [PATCH 079/112] fix the lint and tf_dependency test issue via adding tf stubs Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 4 +--- rllib/utils/framework.py | 31 +++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index fac12501c867..b8f1a762a336 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -25,7 +25,6 @@ SingleAgentRLModuleSpec, ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule -from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule from ray.rllib.core.rl_module.marl_module import ( MultiAgentRLModule, @@ -667,8 +666,7 @@ def __post_init__(self): self.module_backend_config = TorchRLModuleBackendConfig() else: self.module_backend_config = TfRLModuleBackendConfig() - - if self.module_spec is not None: + elif self.module_spec is not None: if issubclass(self.module_spec.module_class, TorchRLModule): self.module_backend_config = TorchRLModuleBackendConfig() else: diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index 7ae4a4c5ddfc..aab7bba084bb 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -59,11 +59,12 @@ def try_import_tf(error: bool = False): Raises: ImportError: If error=True and tf is not installed. """ + tf_stub = _TFStub() # Make sure, these are reset after each test case # that uses them: del os.environ["RLLIB_TEST_NO_TF_IMPORT"] if "RLLIB_TEST_NO_TF_IMPORT" in os.environ: logger.warning("Not importing TensorFlow for test purposes") - return None, None, None + return None, tf_stub, None if "TF_CPP_MIN_LOG_LEVEL" not in os.environ: os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" @@ -86,7 +87,7 @@ def try_import_tf(error: bool = False): "install at least one deep-learning framework: " "`pip install [torch|tensorflow|jax]`." ) - return None, None, None + return None, tf_stub, None # Try "reducing" tf to tf.compat.v1. try: @@ -108,6 +109,24 @@ def try_import_tf(error: bool = False): return tf1_module, tf_module, version +# Fake module for tf. +class _TFStub: + def __init__(self) -> None: + self.keras = _KerasStub() + + +# Fake module for tf.keras. +class _KerasStub: + def __init__(self) -> None: + self.Model = _FakeTfClassStub + + +# Fake classes under keras (e.g for tf.keras.Model) +class _FakeTfClassStub: + def __init__(self, *a, **kw): + raise ImportError("Could not import `tensorflow`. Try pip install tensorflow.") + + @DeveloperAPI def tf_function(tf_module): """Conditional decorator for @tf.function. @@ -157,20 +176,20 @@ class _NNStub: def __init__(self, *a, **kw): # Fake nn.functional module within torch.nn. self.functional = None - self.Module = _FakeClassStub + self.Module = _FakeTorchClassStub self.parallel = _ParallelStub() # Fake class for e.g. torch.nn.Module to allow it to be inherited from. -class _FakeClassStub: +class _FakeTorchClassStub: def __init__(self, *a, **kw): raise ImportError("Could not import `torch`. 
Try pip install torch.") class _ParallelStub: def __init__(self, *a, **kw): - self.DataParallel = _FakeClassStub - self.DistributedDataParallel = _FakeClassStub + self.DataParallel = _FakeTorchClassStub + self.DistributedDataParallel = _FakeTorchClassStub @PublicAPI From 317a9fdee598870f46d1a3dcceb4f119ee69b5e1 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sat, 28 Jan 2023 18:25:39 -0800 Subject: [PATCH 080/112] wip on unittest trianer_runner Signed-off-by: Kourosh Hakhamaneshi --- .../rl_trainer/tests/test_trainer_runner.py | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 51ae555a77b6..17a38fb422cd 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -22,6 +22,41 @@ class TestTrainerRunner(unittest.TestCase): def setUpClass(cls) -> None: ray.init() + # Settings to test + # 1. base: local_mode on cpu + # scaling_config = TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0) + # 2. base-gpu: local_mode on gpu, e.g. only 1 gpu should be used despite + # having 2 gpus on the machine and defining fractional gpus. + # scaling_config = TrainerScalingConfig( + # num_workers=0, num_gpus_per_worker=0.5 # this would be get ignored + # ) + # 3. async-cpu: e.g. 1 remote trainer on cpu + # scaling_config = TrainerScalingConfig(num_workers=1) + # 4. async-gpu: e.g. 1 remote trainer on 0.5 gpu + # scaling_config = TrainerScalingConfig( + # num_workers=1, num_gpus_per_worker=0.5 + # ) + # 5. multi-gpu-ddp: e.g. 2 remote trainers on 1 gpu each with ddp + # scaling_config = TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1) + # 6. multi-cpu-ddp: e.g. 2 remote trainers on 2 cpu each with ddp, This + # imitates multi-gpu-ddp for debugging purposes when GPU is not available + # in dev cycle + # scaling_config = TrainerScalingConfig(num_workers=2, num_cpus_per_worker=1) + # 7. multi-gpu-ddp-pipeline (skip for now): e.g. 
2 remote trainers on 2 gpu + # each with pipeline parallelism + # scaling_config = TrainerScalingConfig(num_workers=2, num_gpus_per_worker=2) + cls.scaling_configs = { + "base": TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0), + "base-gpu": TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0.5), + "async-cpu": TrainerScalingConfig(num_workers=1), + "async-gpu": TrainerScalingConfig(num_workers=1, num_gpus_per_worker=0.5), + "multi-gpu-ddp": TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1), + "multi-cpu-ddp": TrainerScalingConfig(num_workers=2, num_cpus_per_worker=2), + # "multi-gpu-ddp-pipeline": TrainerScalingConfig( + # num_workers=2, num_gpus_per_worker=2 + # ), + } + @classmethod def tearDownClass(cls) -> None: ray.shutdown() @@ -33,7 +68,8 @@ def test_update_multigpu(self): ray.init(ignore_reinit_error=True) print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") - scaling_config = TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1) + + scaling_config = self.scaling_configs["multi-gpu-ddp"] runner = get_trainer_runner(fw, env, scaling_config) reader = get_cartpole_dataset_reader(batch_size=500) From cbc9b022a4ca21cbf8a71624212f4e0932aa008e Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sat, 28 Jan 2023 18:30:28 -0800 Subject: [PATCH 081/112] wip Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/tests/test_trainer_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 17a38fb422cd..724c76a3e959 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -71,7 +71,7 @@ def test_update_multigpu(self): scaling_config = self.scaling_configs["multi-gpu-ddp"] runner = get_trainer_runner(fw, env, scaling_config) - reader = get_cartpole_dataset_reader(batch_size=500) + reader = get_cartpole_dataset_reader(batch_size=512) min_loss = float("inf") for iter_i in range(1000): From defa5f1d306a31cd671611f28558214bbf75e408 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sat, 28 Jan 2023 21:13:47 -0800 Subject: [PATCH 082/112] test_trainer_runner updated to support all variations of scaling config Signed-off-by: Kourosh Hakhamaneshi --- .../rl_trainer/tests/test_trainer_runner.py | 79 ++++++++----------- .../core/rl_trainer/torch/torch_rl_trainer.py | 16 +++- rllib/core/rl_trainer/trainer_runner.py | 41 +++++----- 3 files changed, 71 insertions(+), 65 deletions(-) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 724c76a3e959..62ac8a119899 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -2,6 +2,8 @@ import unittest import ray import time +import numpy as np +import itertools from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch from ray.rllib.utils.test_utils import get_cartpole_dataset_reader @@ -13,43 +15,16 @@ class TestTrainerRunner(unittest.TestCase): - """This test is setup for 2 gpus.""" - - # TODO: This unittest should also test other resource allocations like multi-cpu, - # multi-node multi-gpu, etc. - @classmethod def setUpClass(cls) -> None: ray.init() # Settings to test - # 1. base: local_mode on cpu - # scaling_config = TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0) - # 2. base-gpu: local_mode on gpu, e.g. 
only 1 gpu should be used despite - # having 2 gpus on the machine and defining fractional gpus. - # scaling_config = TrainerScalingConfig( - # num_workers=0, num_gpus_per_worker=0.5 # this would be get ignored - # ) - # 3. async-cpu: e.g. 1 remote trainer on cpu - # scaling_config = TrainerScalingConfig(num_workers=1) - # 4. async-gpu: e.g. 1 remote trainer on 0.5 gpu - # scaling_config = TrainerScalingConfig( - # num_workers=1, num_gpus_per_worker=0.5 - # ) - # 5. multi-gpu-ddp: e.g. 2 remote trainers on 1 gpu each with ddp - # scaling_config = TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1) - # 6. multi-cpu-ddp: e.g. 2 remote trainers on 2 cpu each with ddp, This - # imitates multi-gpu-ddp for debugging purposes when GPU is not available - # in dev cycle - # scaling_config = TrainerScalingConfig(num_workers=2, num_cpus_per_worker=1) - # 7. multi-gpu-ddp-pipeline (skip for now): e.g. 2 remote trainers on 2 gpu - # each with pipeline parallelism - # scaling_config = TrainerScalingConfig(num_workers=2, num_gpus_per_worker=2) cls.scaling_configs = { - "base": TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0), - "base-gpu": TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0.5), - "async-cpu": TrainerScalingConfig(num_workers=1), - "async-gpu": TrainerScalingConfig(num_workers=1, num_gpus_per_worker=0.5), + "local-cpu": TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0), + "local-gpu": TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0.5), + "remote-cpu": TrainerScalingConfig(num_workers=1), + "remote-gpu": TrainerScalingConfig(num_workers=1, num_gpus_per_worker=0.5), "multi-gpu-ddp": TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1), "multi-cpu-ddp": TrainerScalingConfig(num_workers=2, num_cpus_per_worker=2), # "multi-gpu-ddp-pipeline": TrainerScalingConfig( @@ -62,33 +37,41 @@ def tearDownClass(cls) -> None: ray.shutdown() def test_update_multigpu(self): - """Test training in a 2 gpu setup and that weights are synchronized.""" - for fw in ["tf", "torch"]: + # TODO (Avnish): The tf + remote-gpu test is flakey. Removing for now until + # investigated. + fws = ["torch"] + scaling_modes = self.scaling_configs.keys() + test_iterator = itertools.product(fws, scaling_modes) + + for fw, scaling_mode in test_iterator: + print(f"Testing framework: {fw}, scaling mode: {scaling_mode}.") ray.init(ignore_reinit_error=True) - print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") - scaling_config = self.scaling_configs["multi-gpu-ddp"] + scaling_config = self.scaling_configs[scaling_mode] runner = get_trainer_runner(fw, env, scaling_config) - reader = get_cartpole_dataset_reader(batch_size=512) + reader = get_cartpole_dataset_reader(batch_size=1024) min_loss = float("inf") for iter_i in range(1000): batch = reader.next() - res_0, res_1 = runner.update(batch.as_multi_agent()) + results = runner.update(batch.as_multi_agent()) - loss = (res_0["loss"]["total_loss"] + res_1["loss"]["total_loss"]) / 2 + loss = np.mean([res["loss"]["total_loss"] for res in results]) min_loss = min(loss, min_loss) print(f"[iter = {iter_i}] Loss: {loss:.3f}, Min Loss: {min_loss:.3f}") # The loss is initially around 0.69 (ln2). When it gets to around # 0.57 the return of the policy gets to around 100. 
if min_loss < 0.57: break - self.assertEqual( - res_0["mean_weight"]["default_policy"], - res_1["mean_weight"]["default_policy"], - ) + + for res1, res2 in zip(results, results[1:]): + self.assertEqual( + res1["mean_weight"]["default_policy"], + res2["mean_weight"]["default_policy"], + ) + self.assertLess(min_loss, 0.57) # make sure the runner resources are freed up so that we don't autoscale @@ -98,11 +81,17 @@ def test_update_multigpu(self): def test_add_remove_module(self): - for fw in ["tf", "torch"]: + # TODO (Avnish): The tf + remote-gpu test is flakey. Removing for now until + # investigated. + fws = ["torch"] + scaling_modes = self.scaling_configs.keys() + test_iterator = itertools.product(fws, scaling_modes) + + for fw, scaling_mode in test_iterator: + print(f"Testing framework: {fw}, scaling mode: {scaling_mode}.") ray.init(ignore_reinit_error=True) - print(f"Testing framework: {fw}.") env = gym.make("CartPole-v1") - scaling_config = TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1) + scaling_config = self.scaling_configs[scaling_mode] runner = get_trainer_runner(fw, env, scaling_config) reader = get_cartpole_dataset_reader(batch_size=500) batch = reader.next() diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index cb234e89c688..2054c22348f2 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -129,6 +129,12 @@ def build(self) -> None: self._device = torch.device("cpu") super().build() + @override(RLTrainer) + def _make_module(self) -> MultiAgentRLModule: + module = super()._make_module() + self._map_module_to_device(module) + return module + @override(RLTrainer) def _make_distributed_module(self) -> MultiAgentRLModule: module = self._make_module() @@ -140,11 +146,9 @@ def _make_distributed_module(self) -> MultiAgentRLModule: # register them in the MultiAgentRLModule. We should find a better way to # handle this. if isinstance(module, torch.nn.Module): - module.to(self._device) module = TorchDDPRLModule(module) else: for key in module.keys(): - module[key].to(self._device) module.add_module(key, TorchDDPRLModule(module[key]), override=True) return module @@ -204,3 +208,11 @@ def add_module( self._module.add_module( module_id, TorchDDPRLModule(self._module[module_id]), override=True ) + + def _map_module_to_device(self, module: MultiAgentRLModule) -> None: + """Moves the module to the correct device.""" + if isinstance(module, torch.nn.Module): + module.to(self._device) + else: + for key in module.keys(): + module[key].to(self._device) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index c9d66a97561d..1a4c5a990f9b 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -66,7 +66,12 @@ def __init__( # TODO (Avnish) How do I run TF on one GPU? pass + # TODO (Kourosh): Go with a _remote flag instead of _is_local to be more + # explicit self._is_local = scaling_config.num_workers == 0 + self._trainer = None + self._workers = None + if self._is_local: # in local mode the trainer is always not distributed rl_trainer_spec.module_backend_config.set_distributed(False) @@ -113,7 +118,7 @@ def __init__( @property def is_local(self) -> bool: - return not self._is_local + return self._is_local def update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: """Do a gradient based update to the RLTrainer(s) maintained by this TrainerRunner. 
@@ -125,9 +130,9 @@ def update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: A list of dictionaries of results from the updates from the RLTrainer(s) """ if self.is_local: - return self._distributed_update(batch) - else: return [self._trainer.update(batch)] + else: + return self._distributed_update(batch) def _distributed_update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: """Do a gradient based update to the RLTrainers using DDP training. @@ -175,12 +180,12 @@ def additional_update(self, *args, **kwargs) -> List[Mapping[str, Any]]: """ if self.is_local: + return [self._trainer.additional_update(*args, **kwargs)] + else: refs = [] for worker in self._workers: refs.append(worker.additional_update.remote(*args, **kwargs)) return ray.get(refs) - else: - return [self._trainer.additional_update(*args, **kwargs)] def add_module( self, @@ -204,6 +209,13 @@ def add_module( should be provided. """ if self.is_local: + self._trainer.add_module( + module_id=module_id, + module_spec=module_spec, + set_optimizer_fn=set_optimizer_fn, + optimizer_cls=optimizer_cls, + ) + else: refs = [] for worker in self._workers: ref = worker.add_module.remote( @@ -214,13 +226,6 @@ def add_module( ) refs.append(ref) ray.get(refs) - else: - self._trainer.add_module( - module_id=module_id, - module_spec=module_spec, - set_optimizer_fn=set_optimizer_fn, - optimizer_cls=optimizer_cls, - ) def remove_module(self, module_id: ModuleID) -> None: """Remove a module from the RLTrainers maintained by this TrainerRunner. @@ -230,13 +235,13 @@ def remove_module(self, module_id: ModuleID) -> None: """ if self.is_local: + self._trainer.remove_module(module_id) + else: refs = [] for worker in self._workers: ref = worker.remove_module.remote(module_id) refs.append(ref) ray.get(refs) - else: - self._trainer.remove_module(module_id) def get_weight(self) -> Dict: """Get the weights of the MARLModule. @@ -250,12 +255,12 @@ def get_weight(self) -> Dict: def get_state(self) -> List[Mapping[ModuleID, Mapping[str, Any]]]: """Get the states of the RLTrainers""" if self.is_local: + return [self._trainer.get_state()] + else: refs = [] for worker in self._workers: refs.append(worker.get_state.remote()) return ray.get(refs) - else: - return [self._trainer.get_state()] def set_state(self, state: List[Mapping[ModuleID, Mapping[str, Any]]]) -> None: """Sets the states of the RLTrainers. 
@@ -265,9 +270,9 @@ def set_state(self, state: List[Mapping[ModuleID, Mapping[str, Any]]]) -> None: """ if self.is_local: + self._trainer.set_state(state) + else: refs = [] for worker in self._workers: refs.append(worker.set_state.remote(state)) ray.get(refs) - else: - self._trainer.set_state(state) From e0a0bcfe19acd0315a839736761843ecfb889988 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sat, 28 Jan 2023 21:21:23 -0800 Subject: [PATCH 083/112] removed test trainer runner local and moved it to test_trainer_runner.py Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 7 -- .../rl_trainer/tests/test_trainer_runner.py | 46 +++++++++++- .../tests/test_trainer_runner_local.py | 72 ------------------- 3 files changed, 45 insertions(+), 80 deletions(-) delete mode 100644 rllib/core/rl_trainer/tests/test_trainer_runner_local.py diff --git a/rllib/BUILD b/rllib/BUILD index 46642d737365..0a740c14f7aa 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1842,13 +1842,6 @@ py_test( srcs = ["core/rl_trainer/tests/test_trainer_runner.py"] ) -py_test( - name = "test_trainer_runner_local", - tags = ["team:rllib", "core", "exclusive"], - size = "medium", - srcs = ["core/rl_trainer/tests/test_trainer_runner_local.py"] -) - py_test( name = "test_trainer_runner_config", tags = ["team:rllib", "core"], diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 62ac8a119899..104ba0827d05 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -6,10 +6,12 @@ import itertools from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch -from ray.rllib.utils.test_utils import get_cartpole_dataset_reader +from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader +from ray.rllib.utils.framework import try_import_tf from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig from ray.rllib.core.testing.utils import ( get_trainer_runner, + get_rl_trainer, add_module_to_runner_or_trainer, ) @@ -36,6 +38,48 @@ def setUpClass(cls) -> None: def tearDownClass(cls) -> None: ray.shutdown() + def test_trainer_runner_local(self): + + tf1, tf, tfv = try_import_tf() + tf1.executing_eagerly() + + fws = ["tf", "torch"] + scaling_modes = ["local-cpu", "local-gpu"] + test_iterator = itertools.product(fws, scaling_modes) + + env = gym.make("CartPole-v1") + for fw, scaling_mode in test_iterator: + print(f"Testing framework: {fw}, scaling mode: {scaling_mode}") + scaling_config = self.scaling_configs[scaling_mode] + runner = get_trainer_runner(fw, env, scaling_config) + local_trainer = get_rl_trainer(fw, env) + local_trainer.build() + + # make the state of the trainer and the local runner identical + local_trainer.set_state(runner.get_state()[0]) + + reader = get_cartpole_dataset_reader(batch_size=500) + batch = reader.next() + batch = batch.as_multi_agent() + check(local_trainer.update(batch), runner.update(batch)[0]) + + new_module_id = "test_module" + + add_module_to_runner_or_trainer(fw, env, new_module_id, runner) + add_module_to_runner_or_trainer(fw, env, new_module_id, local_trainer) + + # make the state of the trainer and the local runner identical + local_trainer.set_state(runner.get_state()[0]) + + # do another update + batch = reader.next() + ma_batch = MultiAgentBatch( + {new_module_id: batch, DEFAULT_POLICY_ID: batch}, env_steps=batch.count + ) + check(local_trainer.update(ma_batch), runner.update(ma_batch)[0]) + + 
check(local_trainer.get_state(), runner.get_state()[0]) + def test_update_multigpu(self): # TODO (Avnish): The tf + remote-gpu test is flakey. Removing for now until diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py b/rllib/core/rl_trainer/tests/test_trainer_runner_local.py deleted file mode 100644 index d1a22010288b..000000000000 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_local.py +++ /dev/null @@ -1,72 +0,0 @@ -import gymnasium as gym -import unittest - -import ray - -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, MultiAgentBatch -from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.core.testing.utils import ( - add_module_to_runner_or_trainer, - get_trainer_runner, - get_rl_trainer, -) -from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig - - -tf1, tf, tfv = try_import_tf() -tf1.executing_eagerly() - - -class TestTrainerRunnerLocal(unittest.TestCase): - """This test is a trainer test setup for no gpus.""" - - # TODO: Make a unittest that does not need 2 gpus to run. - # So that the user can run it locally as well. - @classmethod - def setUp(cls) -> None: - ray.init() - - @classmethod - def tearDown(cls) -> None: - ray.shutdown() - - def test_trainer_runner_no_gpus(self): - env = gym.make("CartPole-v1") - for fw in ["tf", "torch"]: - scaling_config = TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0) - runner = get_trainer_runner(fw, env, scaling_config) - local_trainer = get_rl_trainer(fw, env) - local_trainer.build() - - # make the state of the trainer and the local runner identical - local_trainer.set_state(runner.get_state()[0]) - - reader = get_cartpole_dataset_reader(batch_size=500) - batch = reader.next() - batch = batch.as_multi_agent() - check(local_trainer.update(batch), runner.update(batch)[0]) - - new_module_id = "test_module" - - add_module_to_runner_or_trainer(fw, env, new_module_id, runner) - add_module_to_runner_or_trainer(fw, env, new_module_id, local_trainer) - - # make the state of the trainer and the local runner identical - local_trainer.set_state(runner.get_state()[0]) - - # do another update - batch = reader.next() - ma_batch = MultiAgentBatch( - {new_module_id: batch, DEFAULT_POLICY_ID: batch}, env_steps=batch.count - ) - check(local_trainer.update(ma_batch), runner.update(ma_batch)[0]) - - check(local_trainer.get_state(), runner.get_state()[0]) - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) From cf4041e52a39b8d196a559ca2cc5d6e9fad24d22 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sun, 29 Jan 2023 00:50:34 -0800 Subject: [PATCH 084/112] fixed the test failures Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 2 +- .../core/rl_trainer/tests/test_trainer_runner.py | 16 +++++++++++++--- rllib/utils/framework.py | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 0a740c14f7aa..432014f1dd60 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1838,7 +1838,7 @@ py_test( py_test( name = "test_trainer_runner", tags = ["team:rllib", "multi_gpu", "exclusive"], - size = "medium", + size = "large", srcs = ["core/rl_trainer/tests/test_trainer_runner.py"] ) diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 104ba0827d05..af7d81304ce1 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ 
b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -43,13 +43,17 @@ def test_trainer_runner_local(self): tf1, tf, tfv = try_import_tf() tf1.executing_eagerly() - fws = ["tf", "torch"] + # TODO (Avnish): tf does not clear out the GPU memory footprint, therefore + # doing it first before torch will result in OOM. Find a way to clear out the + # GPU memory footprint of tf. + fws = ["torch", "tf"] scaling_modes = ["local-cpu", "local-gpu"] test_iterator = itertools.product(fws, scaling_modes) env = gym.make("CartPole-v1") for fw, scaling_mode in test_iterator: print(f"Testing framework: {fw}, scaling mode: {scaling_mode}") + ray.init(ignore_reinit_error=True) scaling_config = self.scaling_configs[scaling_mode] runner = get_trainer_runner(fw, env, scaling_config) local_trainer = get_rl_trainer(fw, env) @@ -80,11 +84,17 @@ def test_trainer_runner_local(self): check(local_trainer.get_state(), runner.get_state()[0]) + # make sure the runner resources are freed up so that we don't autoscale + del runner + del local_trainer + ray.shutdown() + time.sleep(10) + def test_update_multigpu(self): # TODO (Avnish): The tf + remote-gpu test is flakey. Removing for now until # investigated. - fws = ["torch"] + fws = ["torch", "tf"] scaling_modes = self.scaling_configs.keys() test_iterator = itertools.product(fws, scaling_modes) @@ -127,7 +137,7 @@ def test_add_remove_module(self): # TODO (Avnish): The tf + remote-gpu test is flakey. Removing for now until # investigated. - fws = ["torch"] + fws = ["torch", "tf"] scaling_modes = self.scaling_configs.keys() test_iterator = itertools.product(fws, scaling_modes) diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index aab7bba084bb..98a4d4dc3ee1 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -64,7 +64,7 @@ def try_import_tf(error: bool = False): # that uses them: del os.environ["RLLIB_TEST_NO_TF_IMPORT"] if "RLLIB_TEST_NO_TF_IMPORT" in os.environ: logger.warning("Not importing TensorFlow for test purposes") - return None, tf_stub, None + return None, None, None if "TF_CPP_MIN_LOG_LEVEL" not in os.environ: os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" From d4cd6540069662cb099697ba6b1a8de6c585d466 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sun, 29 Jan 2023 08:41:36 -0800 Subject: [PATCH 085/112] 1. Removed tf due to flakiness from test_trainer_runner 2. Moved the logic of trainerScalingConfig into the object instead of using a separate torch/tf specific dataclass. 
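
For orientation, a rough sketch of how the single TrainerScalingConfig is now consumed (mirroring the diffs in this commit, not an exhaustive list):

    from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig

    scaling_config = TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1)

    # TrainerRunner: stay in-process only when there are no remote workers.
    is_local = scaling_config.num_workers == 0
    # RLTrainer: set up distributed (DDP) training only with more than one worker.
    distributed = scaling_config.num_workers > 1
    # TorchRLTrainer: move the module to GPU when GPUs are assigned per worker.
    use_gpu = scaling_config.num_gpus_per_worker > 0
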
Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 30 +++------ .../core/rl_trainer/tests/test_rl_trainer.py | 14 +---- .../rl_trainer/tests/test_trainer_runner.py | 6 +- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 12 ++-- .../torch/tests/test_torch_rl_trainer.py | 17 ++--- .../core/rl_trainer/torch/torch_rl_trainer.py | 9 ++- rllib/core/rl_trainer/trainer_runner.py | 63 +++++++------------ .../core/rl_trainer/trainer_runner_config.py | 14 +++-- rllib/core/testing/utils.py | 5 +- 9 files changed, 60 insertions(+), 110 deletions(-) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index b8f1a762a336..be11dc2f51ed 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -24,7 +24,6 @@ ModuleID, SingleAgentRLModuleSpec, ) -from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule from ray.rllib.core.rl_module.marl_module import ( MultiAgentRLModule, @@ -35,11 +34,7 @@ from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.params import Hyperparams from ray.rllib.utils.typing import TensorType -from ray.rllib.core.rl_trainer.scaling_config import ( - RLModuleBackendConfig, - TorchRLModuleBackendConfig, - TfRLModuleBackendConfig, -) +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig if TYPE_CHECKING: from ray.rllib.algorithms import AlgorithmConfig @@ -128,7 +123,7 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - module_backend_config: Optional[RLModuleBackendConfig] = None, + trainer_scaling_config: Optional[TrainerScalingConfig] = None, trainer_hyperparameters: Optional[HyperparamType] = None, ): # TODO (Kourosh): Having the entire algorithm_config inside trainer may not be @@ -152,8 +147,7 @@ def __init__( self.config = trainer_hyperparameters # pick the configs that we need for the trainer from scaling config - module_backend_config = module_backend_config or RLModuleBackendConfig() - self._distributed = module_backend_config.distributed + self._distributed = trainer_scaling_config.num_workers > 1 # These are the attributes that are set during build self._module: MultiAgentRLModule = None @@ -650,7 +644,7 @@ class RLTrainerSpec: rl_trainer_class: Type["RLTrainer"] module_spec: Union["SingleAgentRLModuleSpec", "MultiAgentRLModuleSpec"] = None module: Optional["RLModule"] = None - module_backend_config: "RLModuleBackendConfig" = None + trainer_scaling_config: Optional[TrainerScalingConfig] = None optimizer_config: Dict[str, Any] = field(default_factory=dict) trainer_hyperparameters: HyperparamType = field(default_factory=dict) @@ -659,25 +653,15 @@ def __post_init__(self): if isinstance(self.trainer_hyperparameters, dict): self.trainer_hyperparameters = Hyperparams(self.trainer_hyperparameters) - # if module_backend_config is not set, we will create a dafault. 
- if self.module_backend_config is None: - if self.module is not None: - if isinstance(self.module, TorchRLModule): - self.module_backend_config = TorchRLModuleBackendConfig() - else: - self.module_backend_config = TfRLModuleBackendConfig() - elif self.module_spec is not None: - if issubclass(self.module_spec.module_class, TorchRLModule): - self.module_backend_config = TorchRLModuleBackendConfig() - else: - self.module_backend_config = TfRLModuleBackendConfig() + if self.trainer_scaling_config is None: + self.trainer_scaling_config = TrainerScalingConfig() def get_params_dict(self) -> Dict[str, Any]: """Returns the parameters than be passed to the RLTrainer constructor.""" return { "module": self.module, "module_spec": self.module_spec, - "module_backend_config": self.module_backend_config, + "trainer_scaling_config": self.trainer_scaling_config, "optimizer_config": self.optimizer_config, "trainer_hyperparameters": self.trainer_hyperparameters, } diff --git a/rllib/core/rl_trainer/tests/test_rl_trainer.py b/rllib/core/rl_trainer/tests/test_rl_trainer.py index 472c1848b011..b33bea2a1cc6 100644 --- a/rllib/core/rl_trainer/tests/test_rl_trainer.py +++ b/rllib/core/rl_trainer/tests/test_rl_trainer.py @@ -11,20 +11,12 @@ from ray.rllib.core.testing.tf.bc_rl_trainer import BCTfRLTrainer from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader -from ray.rllib.core.rl_trainer.scaling_config import TfRLModuleBackendConfig +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig -def get_trainer(distributed=False) -> RLTrainer: +def get_trainer() -> RLTrainer: env = gym.make("CartPole-v1") - # TODO: Another way to make RLTrainer would be to construct the module first - # and then apply trainer to it. We should also allow that. In fact if we figure - # out the serialization of RLModules we can simply pass the module the trainer - # and internally it will serialize and deserialize the module for distributed - # construction. - backend = TfRLModuleBackendConfig(enable_tf_function=False).set_distributed( - distributed - ) trainer = BCTfRLTrainer( module_spec=SingleAgentRLModuleSpec( module_class=DiscreteBCTFModule, @@ -33,7 +25,7 @@ def get_trainer(distributed=False) -> RLTrainer: model_config={"hidden_dim": 32}, ), optimizer_config={"lr": 1e-3}, - module_backend_config=backend, + trainer_scaling_config=TrainerScalingConfig(), ) trainer.build() diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index af7d81304ce1..67a511b6d5bc 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -46,7 +46,7 @@ def test_trainer_runner_local(self): # TODO (Avnish): tf does not clear out the GPU memory footprint, therefore # doing it first before torch will result in OOM. Find a way to clear out the # GPU memory footprint of tf. - fws = ["torch", "tf"] + fws = ["torch"] scaling_modes = ["local-cpu", "local-gpu"] test_iterator = itertools.product(fws, scaling_modes) @@ -94,7 +94,7 @@ def test_update_multigpu(self): # TODO (Avnish): The tf + remote-gpu test is flakey. Removing for now until # investigated. - fws = ["torch", "tf"] + fws = ["torch"] scaling_modes = self.scaling_configs.keys() test_iterator = itertools.product(fws, scaling_modes) @@ -137,7 +137,7 @@ def test_add_remove_module(self): # TODO (Avnish): The tf + remote-gpu test is flakey. Removing for now until # investigated. 
- fws = ["torch", "tf"] + fws = ["torch"] scaling_modes = self.scaling_configs.keys() test_iterator = itertools.product(fws, scaling_modes) diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 49a2c47f480d..119daa6a7e16 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -34,7 +34,7 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict -from ray.rllib.core.rl_trainer.scaling_config import TfRLModuleBackendConfig +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig tf1, tf, tfv = try_import_tf() @@ -97,14 +97,14 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - module_backend_config: Optional[TfRLModuleBackendConfig] = None, + trainer_scaling_config: Optional[TrainerScalingConfig] = None, trainer_hyperparameters: Optional[HyperparamType] = None, ): super().__init__( module_spec=module_spec, module=module, optimizer_config=optimizer_config, - module_backend_config=module_backend_config, + trainer_scaling_config=trainer_scaling_config, trainer_hyperparameters=trainer_hyperparameters, ) @@ -116,8 +116,10 @@ def __init__( # does not mention this as a requirement? tf1.enable_eager_execution() - module_backend_config = module_backend_config or TfRLModuleBackendConfig() - self._enable_tf_function = module_backend_config.enable_tf_function + # TODO (Kourosh): Fix this later + self._enable_tf_function = getattr( + trainer_hyperparameters, "eager_tracing", False + ) if self._enable_tf_function: self._update_fn = tf.function(self._do_update_fn) else: diff --git a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py index a9145fa744fd..9de3c1d47259 100644 --- a/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/tests/test_torch_rl_trainer.py @@ -12,21 +12,12 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.core.rl_trainer.scaling_config import TorchRLModuleBackendConfig +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig -def _get_trainer(distributed: bool = False) -> RLTrainer: +def _get_trainer() -> RLTrainer: env = gym.make("CartPole-v1") - distributed = False - - # TODO: Another way to make RLTrainer would be to construct the module first - # and then apply trainer to it. We should also allow that. In fact if we figure - # out the serialization of RLModules we can simply pass the module the trainer - # and internally it will serialize and deserialize the module for distributed - # construction. 
- backend = ( - TorchRLModuleBackendConfig().set_distributed(distributed).set_use_gpu(False) - ) + trainer = BCTorchRLTrainer( module_spec=SingleAgentRLModuleSpec( module_class=DiscreteBCTorchModule, @@ -34,8 +25,8 @@ def _get_trainer(distributed: bool = False) -> RLTrainer: action_space=env.action_space, model_config={"hidden_dim": 32}, ), - module_backend_config=backend, optimizer_config={"lr": 1e-3}, + trainer_scaling_config=TrainerScalingConfig(), ) trainer.build() diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 2054c22348f2..a4544f63ecc4 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -28,7 +28,7 @@ HyperparamType, ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule -from ray.rllib.core.rl_trainer.scaling_config import TorchRLModuleBackendConfig +from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType @@ -56,20 +56,19 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - module_backend_config: Optional[TorchRLModuleBackendConfig] = None, + trainer_scaling_config: Optional[TrainerScalingConfig] = None, trainer_hyperparameters: Optional[HyperparamType] = None, ): super().__init__( module_spec=module_spec, module=module, optimizer_config=optimizer_config, - module_backend_config=module_backend_config, + trainer_scaling_config=trainer_scaling_config, trainer_hyperparameters=trainer_hyperparameters, ) # pick the stuff that we need from the scaling config - module_backend_config = module_backend_config or TorchRLModuleBackendConfig() - self._use_gpu = module_backend_config.use_gpu + self._use_gpu = trainer_scaling_config.num_gpus_per_worker > 0 # These attributes are set in the `TorchRLModuleBackendConfig self._device = None diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 1a4c5a990f9b..b6889c77d973 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -1,13 +1,5 @@ import math -from typing import ( - Any, - List, - Mapping, - Type, - Optional, - Callable, - Dict, -) +from typing import Any, List, Mapping, Type, Optional, Callable, Dict, TYPE_CHECKING import ray @@ -21,10 +13,27 @@ ParamOptimizerPairs, Optimizer, ) -from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.train._internal.backend_executor import BackendExecutor +if TYPE_CHECKING: + from ray.rllib.core.rl_trainer.rl_trainer import RLTrainer + + +def _get_backend_config(rl_trainer_class: Type["RLTrainer"]) -> str: + if rl_trainer_class.framework == "torch": + from ray.train.torch import TorchConfig + + backend_config = TorchConfig() + elif rl_trainer_class.framework == "tf": + from ray.train.tensorflow import TensorflowConfig + + backend_config = TensorflowConfig() + else: + raise ValueError("framework must be either torch or tf") + + return backend_config + class TrainerRunner: """Coordinator of RLTrainers. 
@@ -53,19 +62,10 @@ class TrainerRunner: def __init__( self, rl_trainer_spec: RLTrainerSpec, - scaling_config: Optional[TrainerScalingConfig] = None, ): - scaling_config = scaling_config or TrainerScalingConfig() + scaling_config = rl_trainer_spec.trainer_scaling_config rl_trainer_class = rl_trainer_spec.rl_trainer_class - # setup wether the worker should use gpu or not - if rl_trainer_class.framework == "torch": - trainer_should_use_gpu = scaling_config.num_gpus_per_worker > 0 - rl_trainer_spec.module_backend_config.set_use_gpu(trainer_should_use_gpu) - else: - # TODO (Avnish) How do I run TF on one GPU? - pass - # TODO (Kourosh): Go with a _remote flag instead of _is_local to be more # explicit self._is_local = scaling_config.num_workers == 0 @@ -73,31 +73,10 @@ def __init__( self._workers = None if self._is_local: - # in local mode the trainer is always not distributed - rl_trainer_spec.module_backend_config.set_distributed(False) self._trainer = rl_trainer_class(**rl_trainer_spec.get_params_dict()) self._trainer.build() else: - # in remote mode the trainer is distributed only if there are more than 1 - # workers - is_trainer_distributed = scaling_config.num_workers > 1 - ( - rl_trainer_spec.module_backend_config.set_distributed( - is_trainer_distributed - ) - ) - - if rl_trainer_class.framework == "torch": - from ray.train.torch import TorchConfig - - backend_config = TorchConfig() - elif rl_trainer_class.framework == "tf": - from ray.train.tensorflow import TensorflowConfig - - backend_config = TensorflowConfig() - else: - raise ValueError("framework must be either torch or tf") - + backend_config = _get_backend_config(rl_trainer_class) backend_executor = BackendExecutor( backend_config=backend_config, num_workers=scaling_config.num_workers, diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index 4a89496ac4e2..41b9e9f755cd 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -69,18 +69,20 @@ def validate(self) -> None: def build(self) -> TrainerRunner: self.validate() + scaling_config = TrainerScalingConfig( + num_workers=self.num_trainer_workers, + num_gpus_per_worker=self.num_gpus_per_trainer_worker, + num_cpus_per_worker=self.num_cpus_per_trainer_worker, + ) rl_trainer_spec = RLTrainerSpec( rl_trainer_class=self.trainer_class, module_spec=self.module_spec, optimizer_config=self.optimizer_config, + trainer_scaling_config=scaling_config, trainer_hyperparameters=self.algorithm_config, ) - scaling_config = TrainerScalingConfig( - num_workers=self.num_trainer_workers, - num_gpus_per_worker=self.num_gpus_per_trainer_worker, - num_cpus_per_worker=self.num_cpus_per_trainer_worker, - ) - return self.trainer_runner_class(rl_trainer_spec, scaling_config) + + return self.trainer_runner_class(rl_trainer_spec) def algorithm( self, algorithm_config: Optional["AlgorithmConfig"] = NotProvided diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 7333c60e8077..01e38f07ad8a 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -103,7 +103,7 @@ def get_rl_trainer( def get_trainer_runner( framework: str, env: "gym.Env", - trainer_runner_scaling_config: TrainerScalingConfig, + scaling_config: TrainerScalingConfig, is_multi_agent: bool = False, ) -> TrainerRunner: @@ -113,8 +113,9 @@ def get_trainer_runner( framework=framework, env=env, is_multi_agent=is_multi_agent ), optimizer_config={"lr": 0.1}, + trainer_scaling_config=scaling_config, 
) - runner = TrainerRunner(rl_trainer_spec, trainer_runner_scaling_config) + runner = TrainerRunner(rl_trainer_spec) return runner From 54eb315d398283f80381989f08315c6c740f064e Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sun, 29 Jan 2023 08:44:55 -0800 Subject: [PATCH 086/112] removed backend class definitions Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/scaling_config.py | 84 ------------------- .../core/rl_trainer/torch/torch_rl_trainer.py | 1 - 2 files changed, 85 deletions(-) diff --git a/rllib/core/rl_trainer/scaling_config.py b/rllib/core/rl_trainer/scaling_config.py index cc892222b316..1c8054ef4468 100644 --- a/rllib/core/rl_trainer/scaling_config.py +++ b/rllib/core/rl_trainer/scaling_config.py @@ -1,90 +1,6 @@ from dataclasses import dataclass -@dataclass -class RLModuleBackendConfig: - """Base class for scaling config relevant to RLTrainer. - - Attributes: - distributed: If True, the rl_trainer will be instantiated in distributed mode. - - Methods: - set_distributed: Set the distributed flag. _distibuted attribute should not be - set to True at the time of constructing the config. The caller should - explicitly decide whether the rl_trainer should be instiantiated in - distributed mode or not. - """ - - def __post_init__(self): - self._distributed: bool = False - - @property - def distributed(self) -> bool: - return self._distributed - - def set_distributed(self, distributed: bool) -> "RLModuleBackendConfig": - """Set the distributed flag. - - _distibuted attribute should not be set directly at the time of constuction, - the caller should explicitly decide whether the rl_trainer should be - instiantiated in distributed mode or not. - - Args: - distributed: If True, the rl trainer will be instantiated in distributed - mode. - """ - self._distributed = distributed - return self - - -@dataclass -class TorchRLModuleBackendConfig(RLModuleBackendConfig): - """Torch-specific scaling config relevant to TorchRLTrainer. - - Attributes: - use_gpu: If True, the torch rl_trainer will be setup to use the gpu. - - Methods: - set_use_gpu: Set the use_gpu flag. _use_gpu attribute should not be set to True - at the time of constructing the config. The caller should explicitly decide - whether the torch rl_trainer should be using gpu or not. - """ - - def __post_init__(self): - super().__post_init__() - self._use_gpu: bool = False - - @property - def use_gpu(self) -> bool: - return self._use_gpu - - def set_use_gpu(self, use_gpu: bool) -> "TorchRLModuleBackendConfig": - """Set the use_gpu flag. - - _use_gpu attribute should not be set directly at the time of constuction, - the caller should explicitly decide whether the torch rl_trainer should be - using gpu or not - - Args: - use_gpu: If True, the rl trainer will be setup to use the gpu. - """ - self._use_gpu = use_gpu - return self - - -@dataclass -class TfRLModuleBackendConfig(RLModuleBackendConfig): - """Tf-specific scaling config relevant to TFRLTrainer. - - Args: - enable_tf_function: If True, the tf.function decorator will be used to - decorate the train_step function. This is recommended to boost performance - via tracing the graph. - """ - - enable_tf_function: bool = True - - @dataclass class TrainerScalingConfig: """Configuratiom for scaling training actors. 
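# For reference, a sketch of how the remaining dataclass maps onto the scaling
# modes exercised in test_trainer_runner (illustrative only, not part of the diff):
#   TrainerScalingConfig(num_workers=0)                          -> local in-process trainer
#   TrainerScalingConfig(num_workers=0, num_gpus_per_worker=0.5) -> local trainer on (at most) one GPU
#   TrainerScalingConfig(num_workers=1)                          -> one remote trainer, no DDP
#   TrainerScalingConfig(num_workers=2, num_gpus_per_worker=1)   -> two remote trainers, DDP on GPUs
#   TrainerScalingConfig(num_workers=2, num_cpus_per_worker=2)   -> two remote trainers, DDP on CPUs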
diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index a4544f63ecc4..adabc42d1064 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -70,7 +70,6 @@ def __init__( # pick the stuff that we need from the scaling config self._use_gpu = trainer_scaling_config.num_gpus_per_worker > 0 - # These attributes are set in the `TorchRLModuleBackendConfig self._device = None @property From 1c826db9bc454cd59b4b424dc5be1190465d119c Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sun, 29 Jan 2023 09:01:32 -0800 Subject: [PATCH 087/112] Removed Hyperparams class Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/rl_trainer.py | 42 +++++++++++-------- rllib/core/rl_trainer/tf/tf_rl_trainer.py | 8 ++-- .../core/rl_trainer/torch/torch_rl_trainer.py | 4 +- rllib/utils/params.py | 12 ------ 4 files changed, 29 insertions(+), 37 deletions(-) delete mode 100644 rllib/utils/params.py diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index be11dc2f51ed..fcbdd2145936 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -15,7 +15,6 @@ Tuple, Type, Union, - TYPE_CHECKING, ) from ray.rllib.utils.framework import try_import_tf, try_import_torch @@ -32,13 +31,9 @@ from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.params import Hyperparams from ray.rllib.utils.typing import TensorType from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig -if TYPE_CHECKING: - from ray.rllib.algorithms import AlgorithmConfig - torch, _ = try_import_torch() tf1, tf, tfv = try_import_tf() @@ -50,7 +45,24 @@ ParamOptimizerPairs = List[Tuple[Sequence[ParamType], Optimizer]] ParamRef = Hashable ParamDictType = Dict[ParamRef, ParamType] -HyperparamType = Union["AlgorithmConfig", Hyperparams] + + +@dataclass +class RLTrainerHPs: + """The hyper-parameters for RLTrainer. + + When creating a new RLTrainer, the new hyper-parameters have to be defined by + subclassing this class and adding the new hyper-parameters as fields. + + Args: + eager_tracing: Whether to trace the model in eager mode. This enables tf + tracing mode by wrapping the loss function computation in a tf.function. + This is useful for speeding up the training loop. However, it is not + compatible with all tf operations. For example, tf.print is not supported + in tf.function. + """ + + eager_tracing: bool = False class RLTrainer: @@ -124,7 +136,7 @@ def __init__( module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, trainer_scaling_config: Optional[TrainerScalingConfig] = None, - trainer_hyperparameters: Optional[HyperparamType] = None, + trainer_hyperparameters: Optional[RLTrainerHPs] = None, ): # TODO (Kourosh): Having the entire algorithm_config inside trainer may not be # the best idea in the world, but it's easy to implement and user will @@ -633,12 +645,10 @@ class RLTrainerSpec: only works if the RLTrainer is not an actor. backend_config: The backend config for properly distributing the RLModule. optimizer_config: The optimizer setting to apply during training. - trainer_hyperparameters: The extra config for the loss/additional update. The - items within this object should be accessible via a dot notation. 
For - example, if the trainer_hyperparameters contains {"coeff": 0.001}, then the - learning rate can be accessed via trainer_hyperparameters.coeff. This is - useful for passing in algorithm config or a HyperParams that contains the - hyper-parameters. + trainer_hyperparameters: The extra config for the loss/additional update. This + should be a subclass of RLTrainerHPs. This is useful for passing in + algorithm configs that contains the hyper-parameters for loss computation, + change of training behaviors, etc. e.g lr, entropy_coeff. """ rl_trainer_class: Type["RLTrainer"] @@ -646,13 +656,9 @@ class RLTrainerSpec: module: Optional["RLModule"] = None trainer_scaling_config: Optional[TrainerScalingConfig] = None optimizer_config: Dict[str, Any] = field(default_factory=dict) - trainer_hyperparameters: HyperparamType = field(default_factory=dict) + trainer_hyperparameters: RLTrainerHPs = field(default_factory=RLTrainerHPs) def __post_init__(self): - # convert to hyper params object if needed - if isinstance(self.trainer_hyperparameters, dict): - self.trainer_hyperparameters = Hyperparams(self.trainer_hyperparameters) - if self.trainer_scaling_config is None: self.trainer_scaling_config = TrainerScalingConfig() diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 119daa6a7e16..0ec49ac07095 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -12,13 +12,13 @@ ) from ray.rllib.core.rl_trainer.rl_trainer import ( + RLTrainerHPs, RLTrainer, ParamOptimizerPairs, ParamRef, Optimizer, ParamType, ParamDictType, - HyperparamType, ) from ray.rllib.core.rl_module.rl_module import ( RLModule, @@ -98,7 +98,7 @@ def __init__( module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, trainer_scaling_config: Optional[TrainerScalingConfig] = None, - trainer_hyperparameters: Optional[HyperparamType] = None, + trainer_hyperparameters: Optional[RLTrainerHPs] = None, ): super().__init__( module_spec=module_spec, @@ -117,9 +117,7 @@ def __init__( tf1.enable_eager_execution() # TODO (Kourosh): Fix this later - self._enable_tf_function = getattr( - trainer_hyperparameters, "eager_tracing", False - ) + self._enable_tf_function = self.config.eager_tracing if self._enable_tf_function: self._update_fn = tf.function(self._do_update_fn) else: diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index adabc42d1064..da7b9aa25f32 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -20,12 +20,12 @@ MultiAgentRLModuleSpec, ) from ray.rllib.core.rl_trainer.rl_trainer import ( + RLTrainerHPs, RLTrainer, ParamOptimizerPairs, Optimizer, ParamType, ParamDictType, - HyperparamType, ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig @@ -57,7 +57,7 @@ def __init__( module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, trainer_scaling_config: Optional[TrainerScalingConfig] = None, - trainer_hyperparameters: Optional[HyperparamType] = None, + trainer_hyperparameters: Optional[RLTrainerHPs] = None, ): super().__init__( module_spec=module_spec, diff --git a/rllib/utils/params.py b/rllib/utils/params.py deleted file mode 100644 index 4574d4bdd61f..000000000000 --- a/rllib/utils/params.py +++ /dev/null @@ -1,12 +0,0 @@ -from ray.rllib.utils.annotations import 
ExperimentalAPI - - -@ExperimentalAPI -class Hyperparams(dict): - """This is an extention of the dict class that allows access via `.` notation.""" - - def __getattr__(self, key): - if key in self: - return self[key] - else: - return super().__getattr__(key) From e8cf7e19a6e9a9dd2b0085bd1cf1484c6b94188d Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sun, 29 Jan 2023 09:44:31 -0800 Subject: [PATCH 088/112] introed FrameworkHPs to differebntiate between tf/torch specific stuff vs. algorithm specific stuff Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm_config.py | 9 +++++ rllib/core/rl_trainer/rl_trainer.py | 29 ++++++++++------ rllib/core/rl_trainer/tf/tf_rl_trainer.py | 28 ++++----------- .../core/rl_trainer/torch/torch_rl_trainer.py | 24 ++----------- .../core/rl_trainer/trainer_runner_config.py | 34 ++++++++++++------- 5 files changed, 58 insertions(+), 66 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 8552f9a39bbd..4978a2819633 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -17,6 +17,7 @@ import ray from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.core.rl_trainer.rl_trainer import RLTrainerHPs from ray.rllib.core.rl_trainer.trainer_runner_config import ( TrainerRunnerConfig, ModuleSpec, @@ -320,6 +321,10 @@ def __init__(self, algo_class=None): self.max_requests_in_flight_per_sampler_worker = 2 self.rl_trainer_class = None self._enable_rl_trainer_api = False + # experimental: this will contain the hyper-parameters that are passed to the + # RLTrainer, for computing loss, etc. New algorithms have to set this to their + # own default. .training() will modify the fields of this object. + self._rl_trainer_hps = RLTrainerHPs() # `self.callbacks()` self.callbacks_class = DefaultCallbacks @@ -445,6 +450,10 @@ def __init__(self, algo_class=None): self.soft_horizon = DEPRECATED_VALUE self.no_done_at_end = DEPRECATED_VALUE + @property + def rl_trainer_hps(self) -> RLTrainerHPs: + return self._rl_trainer_hps + def to_dict(self) -> AlgorithmConfigDict: """Converts all settings into a legacy config dict for backward compatibility. diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index fcbdd2145936..eb8b5f052b6d 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -48,11 +48,8 @@ @dataclass -class RLTrainerHPs: - """The hyper-parameters for RLTrainer. - - When creating a new RLTrainer, the new hyper-parameters have to be defined by - subclassing this class and adding the new hyper-parameters as fields. +class FrameworkHPs: + """The framework specific hyper-parameters. Args: eager_tracing: Whether to trace the model in eager mode. This enables tf @@ -65,6 +62,17 @@ class RLTrainerHPs: eager_tracing: bool = False +@dataclass +class RLTrainerHPs: + """The hyper-parameters for RLTrainer. + + When creating a new RLTrainer, the new hyper-parameters have to be defined by + subclassing this class and adding the new hyper-parameters as fields. + """ + + pass + + class RLTrainer: """Base class for RLlib algorithm trainers. 
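The split above separates framework-level switches from algorithm-level hyper-parameters. A rough sketch of how the two objects are meant to be instantiated (names taken from this diff; a concrete algorithm would subclass RLTrainerHPs rather than use the empty base directly):

    from ray.rllib.core.rl_trainer.rl_trainer import FrameworkHPs, RLTrainerHPs

    # Framework-specific: e.g. wrap the tf update step in tf.function for tracing.
    framework_hps = FrameworkHPs(eager_tracing=True)

    # Algorithm-specific: the base class is intentionally empty; algorithms add
    # their own loss-related fields by subclassing it.
    algo_hps = RLTrainerHPs()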
@@ -137,6 +145,7 @@ def __init__( optimizer_config: Mapping[str, Any] = None, trainer_scaling_config: Optional[TrainerScalingConfig] = None, trainer_hyperparameters: Optional[RLTrainerHPs] = None, + framework_hyperparameters: Optional[FrameworkHPs] = None, ): # TODO (Kourosh): Having the entire algorithm_config inside trainer may not be # the best idea in the world, but it's easy to implement and user will @@ -654,13 +663,12 @@ class RLTrainerSpec: rl_trainer_class: Type["RLTrainer"] module_spec: Union["SingleAgentRLModuleSpec", "MultiAgentRLModuleSpec"] = None module: Optional["RLModule"] = None - trainer_scaling_config: Optional[TrainerScalingConfig] = None + trainer_scaling_config: TrainerScalingConfig = field( + default_factory=TrainerScalingConfig + ) optimizer_config: Dict[str, Any] = field(default_factory=dict) trainer_hyperparameters: RLTrainerHPs = field(default_factory=RLTrainerHPs) - - def __post_init__(self): - if self.trainer_scaling_config is None: - self.trainer_scaling_config = TrainerScalingConfig() + framework_hyperparameters: FrameworkHPs = field(default_factory=FrameworkHPs) def get_params_dict(self) -> Dict[str, Any]: """Returns the parameters than be passed to the RLTrainer constructor.""" @@ -670,6 +678,7 @@ def get_params_dict(self) -> Dict[str, Any]: "trainer_scaling_config": self.trainer_scaling_config, "optimizer_config": self.optimizer_config, "trainer_hyperparameters": self.trainer_hyperparameters, + "framework_hyperparameters": self.framework_hyperparameters, } def build(self) -> "RLTrainer": diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 0ec49ac07095..8f7048a633ae 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -12,7 +12,7 @@ ) from ray.rllib.core.rl_trainer.rl_trainer import ( - RLTrainerHPs, + FrameworkHPs, RLTrainer, ParamOptimizerPairs, ParamRef, @@ -25,16 +25,12 @@ ModuleID, SingleAgentRLModuleSpec, ) -from ray.rllib.core.rl_module.marl_module import ( - MultiAgentRLModule, - MultiAgentRLModuleSpec, -) +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict -from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig tf1, tf, tfv = try_import_tf() @@ -92,21 +88,10 @@ class TfRLTrainer(RLTrainer): def __init__( self, *, - module_spec: Optional[ - Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] - ] = None, - module: Optional[RLModule] = None, - optimizer_config: Mapping[str, Any] = None, - trainer_scaling_config: Optional[TrainerScalingConfig] = None, - trainer_hyperparameters: Optional[RLTrainerHPs] = None, + framework_hyperparameters: Optional[FrameworkHPs] = None, + **kwargs, ): - super().__init__( - module_spec=module_spec, - module=module, - optimizer_config=optimizer_config, - trainer_scaling_config=trainer_scaling_config, - trainer_hyperparameters=trainer_hyperparameters, - ) + super().__init__(framework_hyperparameters=framework_hyperparameters, **kwargs) # TODO (Kourosh): This is required to make sure tf computes the values in the # end. Two question remains: @@ -116,8 +101,7 @@ def __init__( # does not mention this as a requirement? 
tf1.enable_eager_execution() - # TODO (Kourosh): Fix this later - self._enable_tf_function = self.config.eager_tracing + self._enable_tf_function = framework_hyperparameters.eager_tracing if self._enable_tf_function: self._update_fn = tf.function(self._do_update_fn) else: diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index da7b9aa25f32..393bedd579b8 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -15,12 +15,8 @@ ModuleID, SingleAgentRLModuleSpec, ) -from ray.rllib.core.rl_module.marl_module import ( - MultiAgentRLModule, - MultiAgentRLModuleSpec, -) +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule from ray.rllib.core.rl_trainer.rl_trainer import ( - RLTrainerHPs, RLTrainer, ParamOptimizerPairs, Optimizer, @@ -49,23 +45,9 @@ class TorchRLTrainer(RLTrainer): framework: str = "torch" def __init__( - self, - *, - module_spec: Optional[ - Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] - ] = None, - module: Optional[RLModule] = None, - optimizer_config: Mapping[str, Any] = None, - trainer_scaling_config: Optional[TrainerScalingConfig] = None, - trainer_hyperparameters: Optional[RLTrainerHPs] = None, + self, *, trainer_scaling_config: Optional[TrainerScalingConfig] = None, **kwargs ): - super().__init__( - module_spec=module_spec, - module=module, - optimizer_config=optimizer_config, - trainer_scaling_config=trainer_scaling_config, - trainer_hyperparameters=trainer_hyperparameters, - ) + super().__init__(trainer_scaling_config=trainer_scaling_config, **kwargs) # pick the stuff that we need from the scaling config self._use_gpu = trainer_scaling_config.num_gpus_per_worker > 0 diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index 41b9e9f755cd..4332c315b162 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -4,12 +4,15 @@ from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.core.rl_trainer.trainer_runner import TrainerRunner from ray.rllib.core.rl_trainer.scaling_config import TrainerScalingConfig -from ray.rllib.core.rl_trainer.rl_trainer import RLTrainerSpec +from ray.rllib.core.rl_trainer.rl_trainer import ( + RLTrainerSpec, + RLTrainerHPs, + FrameworkHPs, +) from ray.rllib.utils.from_config import NotProvided if TYPE_CHECKING: - from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.core.rl_trainer import RLTrainer ModuleSpec = Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] @@ -30,16 +33,16 @@ def __init__(self, cls: Type[TrainerRunner] = None) -> None: # `self.trainer()` self.trainer_class = None - self.eager_tracing = True self.optimizer_config = None + self.rl_trainer_hps = RLTrainerHPs() # `self.resources()` self.num_gpus_per_trainer_worker = 0 self.num_cpus_per_trainer_worker = 1 self.num_trainer_workers = 1 - # `self.algorithm()` - self.algorithm_config = None + # `self.framework()` + self.eager_tracing = False def validate(self) -> None: @@ -74,21 +77,26 @@ def build(self) -> TrainerRunner: num_gpus_per_worker=self.num_gpus_per_trainer_worker, num_cpus_per_worker=self.num_cpus_per_trainer_worker, ) + + framework_hps = FrameworkHPs(eager_tracing=self.eager_tracing) + rl_trainer_spec = RLTrainerSpec( rl_trainer_class=self.trainer_class, module_spec=self.module_spec, optimizer_config=self.optimizer_config, 
trainer_scaling_config=scaling_config, - trainer_hyperparameters=self.algorithm_config, + trainer_hyperparameters=self.rl_trainer_hps, + framework_hyperparameters=framework_hps, ) return self.trainer_runner_class(rl_trainer_spec) - def algorithm( - self, algorithm_config: Optional["AlgorithmConfig"] = NotProvided + def framework( + self, eager_tracing: Optional[bool] = NotProvided ) -> "TrainerRunnerConfig": - if algorithm_config is not NotProvided: - self.algorithm_config = algorithm_config + + if eager_tracing is not NotProvided: + self.eager_tracing = eager_tracing return self def module( @@ -121,15 +129,15 @@ def trainer( self, *, trainer_class: Optional[Type["RLTrainer"]] = NotProvided, - eager_tracing: Optional[bool] = NotProvided, optimizer_config: Optional[Dict] = NotProvided, + rl_trainer_hps: Optional[RLTrainerHPs] = NotProvided, ) -> "TrainerRunnerConfig": if trainer_class is not NotProvided: self.trainer_class = trainer_class - if eager_tracing is not NotProvided: - self.eager_tracing = eager_tracing if optimizer_config is not NotProvided: self.optimizer_config = optimizer_config + if rl_trainer_hps is not NotProvided: + self.rl_trainer_hps = rl_trainer_hps return self From 2ea3a4d56f9310173e193d1fd3d18b6957c4c8fc Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sun, 29 Jan 2023 10:09:22 -0800 Subject: [PATCH 089/112] the unittests pass Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm_config.py | 4 ++-- rllib/core/rl_trainer/rl_trainer.py | 6 +++--- rllib/core/rl_trainer/tests/test_trainer_runner_config.py | 1 - rllib/core/rl_trainer/tf/tf_rl_trainer.py | 2 +- rllib/core/rl_trainer/torch/torch_rl_trainer.py | 5 ++++- rllib/core/rl_trainer/trainer_runner_config.py | 6 ------ 6 files changed, 10 insertions(+), 14 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 4978a2819633..7605f73a07b4 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -2669,16 +2669,16 @@ def get_trainer_runner_config( .module(module_spec) .trainer( trainer_class=self.rl_trainer_class, - eager_tracing=self.eager_tracing, # TODO (Kourosh): optimizer config can now be more complicated. 
optimizer_config={"lr": self.lr}, + rl_trainer_hps=self.rl_trainer_hps, ) .resources( num_trainer_workers=self.num_trainer_workers, num_cpus_per_trainer_worker=self.num_cpus_per_trainer_worker, num_gpus_per_trainer_worker=self.num_gpus_per_trainer_worker, ) - .algorithm(algorithm_config=self) + .framework(eager_tracing=self.eager_tracing) ) return config diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index eb8b5f052b6d..ca78f7a096f7 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -143,9 +143,9 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - trainer_scaling_config: Optional[TrainerScalingConfig] = None, - trainer_hyperparameters: Optional[RLTrainerHPs] = None, - framework_hyperparameters: Optional[FrameworkHPs] = None, + trainer_scaling_config: TrainerScalingConfig = TrainerScalingConfig(), + trainer_hyperparameters: Optional[RLTrainerHPs] = RLTrainerHPs(), + framework_hyperparameters: Optional[FrameworkHPs] = FrameworkHPs(), ): # TODO (Kourosh): Having the entire algorithm_config inside trainer may not be # the best idea in the world, but it's easy to implement and user will diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner_config.py b/rllib/core/rl_trainer/tests/test_trainer_runner_config.py index acdc67731337..46e215acd86d 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner_config.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner_config.py @@ -31,7 +31,6 @@ def test_trainer_runner_build(self): .trainer( trainer_class=BCTfRLTrainer, ) - .algorithm(algorithm_config=AlgorithmConfig()) ) config.build() diff --git a/rllib/core/rl_trainer/tf/tf_rl_trainer.py b/rllib/core/rl_trainer/tf/tf_rl_trainer.py index 8f7048a633ae..0455a0e772e0 100644 --- a/rllib/core/rl_trainer/tf/tf_rl_trainer.py +++ b/rllib/core/rl_trainer/tf/tf_rl_trainer.py @@ -88,7 +88,7 @@ class TfRLTrainer(RLTrainer): def __init__( self, *, - framework_hyperparameters: Optional[FrameworkHPs] = None, + framework_hyperparameters: Optional[FrameworkHPs] = FrameworkHPs(), **kwargs, ): super().__init__(framework_hyperparameters=framework_hyperparameters, **kwargs) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 393bedd579b8..8fe28c662718 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -45,7 +45,10 @@ class TorchRLTrainer(RLTrainer): framework: str = "torch" def __init__( - self, *, trainer_scaling_config: Optional[TrainerScalingConfig] = None, **kwargs + self, + *, + trainer_scaling_config: TrainerScalingConfig = TrainerScalingConfig(), + **kwargs, ): super().__init__(trainer_scaling_config=trainer_scaling_config, **kwargs) diff --git a/rllib/core/rl_trainer/trainer_runner_config.py b/rllib/core/rl_trainer/trainer_runner_config.py index 4332c315b162..d8de08c05328 100644 --- a/rllib/core/rl_trainer/trainer_runner_config.py +++ b/rllib/core/rl_trainer/trainer_runner_config.py @@ -58,12 +58,6 @@ def validate(self) -> None: "the RLTrainer class with .trainer(trainer_class=MyTrainerClass)." ) - if self.algorithm_config is None: - raise ValueError( - "Must provide algorithm_config for RLTrainer. Use " - ".algorithm(algorithm_config=MyConfig)." - ) - if self.optimizer_config is None: # get the default optimizer config if it's not provided # TODO (Kourosh): Change the optimizer config to a dataclass object. 
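Taken together, patches 088 and 089 leave the runner configuration looking roughly like the sketch below. The module spec, trainer class and hyper-parameter object are placeholders (any SingleAgentRLModuleSpec, RLTrainer subclass and RLTrainerHPs subclass would do); the call chain itself mirrors AlgorithmConfig.get_trainer_runner_config():

    from ray.rllib.core.rl_trainer.trainer_runner_config import TrainerRunnerConfig

    config = (
        TrainerRunnerConfig()
        .module(module_spec)                  # placeholder SingleAgentRLModuleSpec
        .trainer(
            trainer_class=MyRLTrainer,        # placeholder RLTrainer subclass
            optimizer_config={"lr": 1e-3},
            rl_trainer_hps=MyRLTrainerHPs(),  # placeholder RLTrainerHPs subclass
        )
        .resources(
            num_trainer_workers=2,
            num_cpus_per_trainer_worker=1,
            num_gpus_per_trainer_worker=0,
        )
        .framework(eager_tracing=True)
    )
    runner = config.build()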
From fd84f7ac0889a7a44d093833441c6e655ba01465 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sun, 29 Jan 2023 23:04:51 -0800 Subject: [PATCH 090/112] addressed comments and fixed some introduced bug Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm.py | 4 ++-- rllib/algorithms/ppo/ppo.py | 16 ++++++++++++---- .../ppo/torch/ppo_torch_rl_trainer.py | 19 ++++++++++++++++++- rllib/core/rl_trainer/rl_trainer.py | 2 ++ .../core/rl_trainer/torch/torch_rl_trainer.py | 4 ++-- rllib/core/rl_trainer/trainer_runner.py | 17 +++++++++-------- rllib/evaluation/worker_set.py | 17 +++++++++-------- rllib/policy/policy.py | 2 +- 8 files changed, 55 insertions(+), 26 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index f69e58220fd2..e818dfd1c162 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -695,7 +695,7 @@ def setup(self, config: AlgorithmConfig) -> None: # TODO (Kourosh): This is an interim solution where policies and modules # co-exist. In this world we have both policy_map and MARLModule that need # to be consistent with one another. To make a consistent parity between - # the two we need to loop throught the policy modules and create a simple + # the two we need to loop through the policy modules and create a simple # MARLModule from the RLModule within each policy. local_worker = self.workers.local_worker() module_specs = {} @@ -715,7 +715,7 @@ def setup(self, config: AlgorithmConfig) -> None: trainer_runner_config = self.config.get_trainer_runner_config(module_spec) self.trainer_runner = trainer_runner_config.build() - # sync the weights from rollout workers to trainers + # sync the weights from local rollout worker to trainers weights = local_worker.get_weights() self.trainer_runner.set_weights(weights) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 2f481e2c2e44..eaf4b4f9cab9 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -382,7 +382,7 @@ def training_step(self) -> ResultDict: # TODO (Kourosh) Clearly define what train_batch_size # vs. sgd_minibatch_size and num_sgd_iter is in the config. # TODO (Kourosh) Do this inside the RL Trainer so - # that we don't have to this back and forth + # that we don't have to do this back and forth # communication between driver and the remote # trainer workers @@ -398,6 +398,11 @@ def training_step(self) -> ResultDict: train_results = multi_gpu_train_one_step(self, train_batch) if self.config._enable_rl_trainer_api: + # the train results's loss keys are pids to their loss values. But we also + # return a total_loss key at the same level as the pid keys. So we need to + # subtract that to get the total set of pids to update. + # TODO (Kourosh): We need to make a better design for the hierarchy of the + # train results, so that all the policy ids end up in the same level. policies_to_update = set(train_results["loss"].keys()) - {"total_loss"} else: policies_to_update = list(train_results.keys()) @@ -416,12 +421,12 @@ def training_step(self) -> ResultDict: # workers. 
with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: if self.workers.num_remote_workers() > 0: - from_worker = None + from_worker_or_trainer = None if self.config._enable_rl_trainer_api: # sync weights from trainer_runner to all rollout workers - from_worker = self.trainer_runner + from_worker_or_trainer = self.trainer_runner self.workers.sync_weights( - from_worker=from_worker, + from_worker=from_worker_or_trainer, policies=policies_to_update, global_vars=global_vars, ) @@ -431,6 +436,9 @@ def training_step(self) -> ResultDict: if self.config._enable_rl_trainer_api: kl_dict = { + # TODO (Kourosh): Train results don't match the old format. The thing + # that used to be under `kl` is now under `mean_kl_loss`. Fix this. Do + # we need get here? pid: train_results["loss"][pid].get("mean_kl_loss") for pid in policies_to_update } diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index adc5dbd9b932..ab65abf22eb8 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -8,6 +8,7 @@ from ray.rllib.utils.torch_utils import ( explained_variance, ) +from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorType torch, nn = try_import_torch() @@ -16,6 +17,13 @@ class PPOTorchRLTrainer(TorchRLTrainer): + """Implements PPO loss / update logic on top of TorchRLTrainer. + + This class implements the ppo loss under `_compute_loss_per_module()` and the + additional non-gradient based updates such as KL-coeff and learning rate updates + under `_additional_update_per_module()`. + """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -37,6 +45,7 @@ def __init__(self, *args, **kwargs): self.kl_coeff = self.config.kl_coeff self.kl_target = self.config.kl_target + @override(TorchRLTrainer) def _compute_loss_per_module( self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> TensorType: @@ -62,7 +71,14 @@ def _compute_loss_per_module( mean_kl_loss = torch.mean(action_kl) if mean_kl_loss.isinf(): logger.warning( - "KL divergence is non-finite, this will likely destabilize your model and the training process. Action(s) in a specific state have near-zero probability. This can happen naturally in deterministic environments where the optimal policy has zero mass for a specific action. To fix this issue, consider setting the coefficient for the KL loss term to zero or increasing policy entropy." + "KL divergence is non-finite, this will likely destabilize " + "your model and the training process. Action(s) in a " + "specific state have near-zero probability. " + "This can happen naturally in deterministic " + "environments where the optimal policy has zero mass " + "for a specific action. To fix this issue, consider " + "setting the coefficient for the KL loss term to " + "zero or increasing policy entropy." ) else: mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device) @@ -111,6 +127,7 @@ def _compute_loss_per_module( "mean_kl_loss": mean_kl_loss, } + @override(TorchRLTrainer) def _additional_update_per_module( self, module_id: str, sampled_kl_values: dict, timestep: int ) -> Mapping[str, Any]: diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index 9cd8a75c58f1..fd383e656128 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -455,6 +455,8 @@ def set_state(self, state: Mapping[str, Any]) -> None: from `get_state`. 
""" + # TODO (Kourosh): We have both get(set)_state and get(set)_weights. I think + # having both can become confusing. Can we simplify this API requirement? self.__check_if_build_called() # TODO: once we figure out the optimizer format, we can set/get the state self._module.set_state(state.get("module_state", {})) diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index b3bbfb64897f..7744e0f0b0d8 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -8,7 +8,7 @@ Hashable, Optional, Callable, - Set + Set, ) from ray.rllib.core.rl_module.rl_module import ( @@ -155,7 +155,7 @@ def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any module_weights = self._module.get_state() if module_ids is None: return module_weights - + return {k: v for k, v in module_weights.items() if k in module_ids} def set_weights(self, weights: Mapping[str, Any]) -> None: diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index dd22d5502dba..1a930b488e16 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -108,8 +108,8 @@ def fit( """Do `num_iters` minibatch updates given the original batch. Given a batch of episodes you can use this method to take more - than one backward pass on the batch. The same minibatch_size and num_iters gets - will be used for all module ids (previously known as policies) in the + than one backward pass on the batch. The same minibatch_size and num_iters gets + will be used for all module ids (previously known as policies) in the multiagent batch Args: @@ -125,7 +125,8 @@ def fit( start = {mid: 0 for mid in batch.policy_batches.keys()} num_covered_epochs = {mid: 0 for mid in batch.policy_batches.keys()} results = [] - # loop until the number of passes through all modules batches reaches the num_iters + # loop until the number of passes through all modules batches reaches the + # num_iters while min(num_covered_epochs.values()) < num_iters: minibatch = {} for module_id, module_batch in batch.policy_batches.items(): @@ -155,6 +156,9 @@ def fit( results.append(self.update(minibatch)) # return the average of the results using tree map + # TODO (Kourosh): There should be system for reporting back metrics from + # RLTrainers. Some metrics should be averaged, while some should be just + # concatenated. return tree.map_structure(lambda *x: np.mean(x), *results) def update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: @@ -289,7 +293,6 @@ def set_weights(self, weights) -> None: self._trainer.set_weights(weights) else: ray.get([worker.set_weights.remote(weights) for worker in self._workers]) - def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any]: if self.is_local: @@ -303,10 +306,8 @@ def get_state(self) -> List[Mapping[ModuleID, Mapping[str, Any]]]: if self.is_local: return self._trainer.get_state() else: - refs = [] - for worker in self._workers: - refs.append(worker.get_state.remote()) - return ray.get(refs)[0] + worker = next(iter(self._workers)) + return ray.get(worker.get_state.remote()) def set_state(self, state: List[Mapping[ModuleID, Mapping[str, Any]]]) -> None: """Sets the states of the RLTrainers. 
diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index fc6c85830dbe..74af3218b47c 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -382,7 +382,7 @@ def num_remote_worker_restarts(self) -> int: def sync_weights( self, policies: Optional[List[PolicyID]] = None, - from_worker: Optional[Union[RolloutWorker, TrainerRunner]] = None, + from_worker_or_trainer: Optional[Union[RolloutWorker, TrainerRunner]] = None, to_worker_indices: Optional[List[int]] = None, global_vars: Optional[Dict[str, TensorType]] = None, timeout_seconds: Optional[int] = 0, @@ -392,9 +392,9 @@ def sync_weights( Args: policies: Optional list of PolicyIDs to sync weights for. If None (default), sync weights to/from all policies. - from_worker: Optional local RolloutWorker instance or TrainerRunner - instance to sync from. - If None (default), sync from this WorkerSet's local worker. + from_worker_or_trainer: Optional local RolloutWorker instance or + TrainerRunner instance to sync from. If None (default), + sync from this WorkerSet's local worker. to_worker_indices: Optional list of worker indices to sync the weights to. If None (default), sync to all remote workers. global_vars: An optional global vars dict to set this @@ -404,16 +404,17 @@ def sync_weights( for any sync calls to finish). This significantly improves algorithm performance. """ - if self.local_worker() is None and from_worker is None: + if self.local_worker() is None and from_worker_or_trainer is None: raise TypeError( "No `local_worker` in WorkerSet, must provide `from_worker` " "arg in `sync_weights()`!" ) - # Only sync if we have remote workers or `from_worker` is provided. + # Only sync if we have remote workers or `from_worker_or_trainer` is provided. weights = None - if self.num_remote_workers() or from_worker is not None: - worker_or_trainer = from_worker or self.local_worker() + worker_or_trainer = None + if self.num_remote_workers() or from_worker_or_trainer is not None: + worker_or_trainer = from_worker_or_trainer or self.local_worker() weights = worker_or_trainer.get_weights(policies) def set_weight(w): diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 667d116698f1..2ea7896e04f0 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1201,7 +1201,7 @@ def _get_num_gpus_for_policy(self) -> int: # If in local debugging mode, and _fake_gpus is not on. num_gpus = 0 elif worker_idx == 0: - # if we are in the new trainer wold num_gpus is only for trainer + # if we are in the new rl trainer world num_gpus is deprecated. # so use num_gpus_per_worker for policy sampling if self.config["_enable_rl_trainer_api"]: num_gpus = self.config["num_gpus_per_worker"] From d40c48de98a2a24041eabb56d7fe856f9121f67a Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jan 2023 12:54:13 -0800 Subject: [PATCH 091/112] fix from_worker_or_trainer renaming issue Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm.py | 8 ++++---- rllib/algorithms/ppo/ppo.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 2cbe33c5a041..24cb4f2b7671 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -862,7 +862,7 @@ def evaluate( # Sync weights to the evaluation WorkerSet. 
if self.evaluation_workers is not None: self.evaluation_workers.sync_weights( - from_worker=self.workers.local_worker() + from_worker_or_trainer=self.workers.local_worker() ) self._sync_filters_if_needed( from_worker=self.workers.local_worker(), @@ -1380,11 +1380,11 @@ def training_step(self) -> ResultDict: # TODO (Avnish): Implement this on trainer_runner.get_weights(). # TODO (Kourosh): figure out how we are going to sync MARLModule # weights to MARLModule weights under the policy_map objects? - from_worker = None + from_worker_or_trainer = None if self.config._enable_rl_trainer_api: - from_worker = self.trainer_runner + from_worker_or_trainer = self.trainer_runner self.workers.sync_weights( - from_worker=from_worker, + from_worker_or_trainer=from_worker_or_trainer, policies=list(train_results.keys()), global_vars=global_vars, ) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index eaf4b4f9cab9..6bd9b7440c08 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -426,7 +426,7 @@ def training_step(self) -> ResultDict: # sync weights from trainer_runner to all rollout workers from_worker_or_trainer = self.trainer_runner self.workers.sync_weights( - from_worker=from_worker_or_trainer, + from_worker_or_trainer=from_worker_or_trainer, policies=policies_to_update, global_vars=global_vars, ) From b0fed291989f74348d671513a30f839e2f4d7bac Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jan 2023 15:38:20 -0800 Subject: [PATCH 092/112] fixed tests Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 8 +++ rllib/algorithms/ppo/ppo.py | 12 ++++ rllib/algorithms/ppo/ppo_rl_trainer_config.py | 21 +++++++ .../ppo/torch/ppo_torch_rl_trainer.py | 1 + .../core/rl_trainer/reduce_result_dict_fn.py | 17 +++++ rllib/core/rl_trainer/rl_trainer.py | 4 ++ .../rl_trainer/tests/test_trainer_runner.py | 23 +++---- rllib/core/rl_trainer/trainer_runner.py | 62 ++++++++++++++----- rllib/policy/policy.py | 3 +- 9 files changed, 124 insertions(+), 27 deletions(-) create mode 100644 rllib/algorithms/ppo/ppo_rl_trainer_config.py create mode 100644 rllib/core/rl_trainer/reduce_result_dict_fn.py diff --git a/rllib/BUILD b/rllib/BUILD index 0d3ef01f0019..828e8d3e8a1d 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1133,6 +1133,14 @@ py_test( srcs = ["algorithms/ppo/tests/test_ppo_rl_module.py"] ) + +py_test( + name = "test_ppo_rl_trainer", + tags = ["team:rllib", "algorithms_dir"], + size = "medium", + srcs = ["algorithms/ppo/tests/test_ppo_rl_trainer.py"] +) + # PPO Reproducibility py_test( name = "test_repro_ppo", diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 6bd9b7440c08..e9fd7bac8ea4 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -16,6 +16,7 @@ from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided from ray.rllib.algorithms.pg import PGConfig +from ray.rllib.algorithms.ppo.ppo_rl_trainer_config import PPORLTrainerHPs from ray.rllib.execution.rollout_ops import ( standardize_fields, ) @@ -91,6 +92,7 @@ def __init__(self, algo_class=None): # fmt: off # __sphinx_doc_begin__ # PPO specific settings: + self._rl_trainer_hps = PPORLTrainerHPs() self.use_critic = True self.use_gae = True self.lambda_ = 1.0 @@ -214,12 +216,16 @@ def training( self.lr_schedule = lr_schedule if use_critic is not NotProvided: self.use_critic = use_critic + # TODO (Kourosh) This is experimental. Set rl_trainer_hps parameters as + # well. 
Don't forget to remote .use_critic from algorithm config. + self._rl_trainer_hps.use_critic = use_critic if use_gae is not NotProvided: self.use_gae = use_gae if lambda_ is not NotProvided: self.lambda_ = lambda_ if kl_coeff is not NotProvided: self.kl_coeff = kl_coeff + self._rl_trainer_hps.kl_coeff = kl_coeff if sgd_minibatch_size is not NotProvided: self.sgd_minibatch_size = sgd_minibatch_size if num_sgd_iter is not NotProvided: @@ -228,18 +234,24 @@ def training( self.shuffle_sequences = shuffle_sequences if vf_loss_coeff is not NotProvided: self.vf_loss_coeff = vf_loss_coeff + self._rl_trainer_hps.vf_loss_coeff = vf_loss_coeff if entropy_coeff is not NotProvided: self.entropy_coeff = entropy_coeff + self._rl_trainer_hps.entropy_coeff = entropy_coeff if entropy_coeff_schedule is not NotProvided: self.entropy_coeff_schedule = entropy_coeff_schedule + self._rl_trainer_hps.entropy_coeff_schedule = entropy_coeff_schedule if clip_param is not NotProvided: self.clip_param = clip_param + self._rl_trainer_hps.clip_param = clip_param if vf_clip_param is not NotProvided: self.vf_clip_param = vf_clip_param + self._rl_trainer_hps.vf_clip_param = vf_clip_param if grad_clip is not NotProvided: self.grad_clip = grad_clip if kl_target is not NotProvided: self.kl_target = kl_target + self._rl_trainer_hps.kl_target = kl_target return self diff --git a/rllib/algorithms/ppo/ppo_rl_trainer_config.py b/rllib/algorithms/ppo/ppo_rl_trainer_config.py new file mode 100644 index 000000000000..228157783378 --- /dev/null +++ b/rllib/algorithms/ppo/ppo_rl_trainer_config.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +from ray.rllib.core.rl_trainer.rl_trainer import RLTrainerHPs + +@dataclass +class PPORLTrainerHPs(RLTrainerHPs): + """Hyperparameters for the PPO RL Trainer""" + + kl_coeff: float = 0.2 + kl_target: float = 0.01 + use_critic: bool = True + clip_param: float = 0.3 + vf_clip_param: float = 10.0 + entropy_coeff: float = 0.0 + vf_loss_coeff: float = 1.0 + + # experimental placeholder for things that could be part of the base RLTrainerHPs + lr_schedule: Optional[List[List[Union[int, float]]]] = None + entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = None + \ No newline at end of file diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index ab65abf22eb8..60ee2726afb3 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -16,6 +16,7 @@ logger = logging.getLogger(__name__) + class PPOTorchRLTrainer(TorchRLTrainer): """Implements PPO loss / update logic on top of TorchRLTrainer. 
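The new PPORLTrainerHPs dataclass above is what PPOConfig.training() now writes into: each loss-related setter mirrors its value onto self._rl_trainer_hps. A small sketch of direct usage (defaults are the ones declared in ppo_rl_trainer_config.py):

    from ray.rllib.algorithms.ppo.ppo_rl_trainer_config import PPORLTrainerHPs

    # Override only the coefficients of interest; unspecified fields keep their
    # dataclass defaults (e.g. vf_clip_param=10.0, use_critic=True).
    hps = PPORLTrainerHPs(kl_coeff=0.3, clip_param=0.2, entropy_coeff=0.01)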
diff --git a/rllib/core/rl_trainer/reduce_result_dict_fn.py b/rllib/core/rl_trainer/reduce_result_dict_fn.py new file mode 100644 index 000000000000..f5c3929dbb8b --- /dev/null +++ b/rllib/core/rl_trainer/reduce_result_dict_fn.py @@ -0,0 +1,17 @@ +"""The following is set of default rllib reduction methods for ResultDicts""" + +from typing import List +import numpy as np +import tree # pip install dm-tree +from ray.rllib.utils.typing import ResultDict + +def _reduce_mean_results(results: List[ResultDict]) -> ResultDict: + """Takes the average of all the leaves in the result dict + + Args: + results: list of result dicts to average + + Returns: + Averaged result dict + """ + return tree.map_structure(lambda *x: np.mean(x), *results) diff --git a/rllib/core/rl_trainer/rl_trainer.py b/rllib/core/rl_trainer/rl_trainer.py index fd383e656128..d8b9d1cc3ff6 100644 --- a/rllib/core/rl_trainer/rl_trainer.py +++ b/rllib/core/rl_trainer/rl_trainer.py @@ -69,6 +69,10 @@ class RLTrainerHPs: When creating a new RLTrainer, the new hyper-parameters have to be defined by subclassing this class and adding the new hyper-parameters as fields. + + # TODO (Kourosh): The things that could be part of the base class: + - lr_schedule + - grad_clip """ pass diff --git a/rllib/core/rl_trainer/tests/test_trainer_runner.py b/rllib/core/rl_trainer/tests/test_trainer_runner.py index 67a511b6d5bc..ca569b3bdf3b 100644 --- a/rllib/core/rl_trainer/tests/test_trainer_runner.py +++ b/rllib/core/rl_trainer/tests/test_trainer_runner.py @@ -60,12 +60,12 @@ def test_trainer_runner_local(self): local_trainer.build() # make the state of the trainer and the local runner identical - local_trainer.set_state(runner.get_state()[0]) + local_trainer.set_state(runner.get_state()) - reader = get_cartpole_dataset_reader(batch_size=500) + reader = get_cartpole_dataset_reader(batch_size=512) batch = reader.next() batch = batch.as_multi_agent() - check(local_trainer.update(batch), runner.update(batch)[0]) + check(local_trainer.update(batch), runner.update(batch)) new_module_id = "test_module" @@ -73,16 +73,16 @@ def test_trainer_runner_local(self): add_module_to_runner_or_trainer(fw, env, new_module_id, local_trainer) # make the state of the trainer and the local runner identical - local_trainer.set_state(runner.get_state()[0]) + local_trainer.set_state(runner.get_state()) # do another update batch = reader.next() ma_batch = MultiAgentBatch( {new_module_id: batch, DEFAULT_POLICY_ID: batch}, env_steps=batch.count ) - check(local_trainer.update(ma_batch), runner.update(ma_batch)[0]) + check(local_trainer.update(ma_batch), runner.update(ma_batch)) - check(local_trainer.get_state(), runner.get_state()[0]) + check(local_trainer.get_state(), runner.get_state()) # make sure the runner resources are freed up so that we don't autoscale del runner @@ -110,7 +110,7 @@ def test_update_multigpu(self): min_loss = float("inf") for iter_i in range(1000): batch = reader.next() - results = runner.update(batch.as_multi_agent()) + results = runner.update(batch.as_multi_agent(), reduce_fn=None) loss = np.mean([res["loss"]["total_loss"] for res in results]) min_loss = min(loss, min_loss) @@ -147,11 +147,11 @@ def test_add_remove_module(self): env = gym.make("CartPole-v1") scaling_config = self.scaling_configs[scaling_mode] runner = get_trainer_runner(fw, env, scaling_config) - reader = get_cartpole_dataset_reader(batch_size=500) + reader = get_cartpole_dataset_reader(batch_size=512) batch = reader.next() # update once with the default policy - results = 
runner.update(batch.as_multi_agent()) + results = runner.update(batch.as_multi_agent(), reduce_fn=None) module_ids_before_add = {DEFAULT_POLICY_ID} new_module_id = "test_module" @@ -162,7 +162,8 @@ def test_add_remove_module(self): results = runner.update( MultiAgentBatch( {new_module_id: batch, DEFAULT_POLICY_ID: batch}, batch.count - ) + ), + reduce_fn=None, ) # check that module weights are updated across workers and synchronized @@ -185,7 +186,7 @@ def test_add_remove_module(self): runner.remove_module(module_id=new_module_id) # run training without the test_module - results = runner.update(batch.as_multi_agent()) + results = runner.update(batch.as_multi_agent(), reduce_fn=None) # check that module weights are updated across workers and synchronized for i in range(1, len(results)): diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 1a930b488e16..a5013527af7e 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -5,6 +5,8 @@ import ray +from ray.rllib.utils.typing import ResultDict +from ray.rllib.core.rl_trainer.reduce_result_dict_fn import _reduce_mean_results from ray.rllib.policy.sample_batch import concat_samples from ray.rllib.core.rl_module.rl_module import ( RLModule, @@ -103,7 +105,12 @@ def is_local(self) -> bool: return self._is_local def fit( - self, batch: MultiAgentBatch, minibatch_size: int, num_iters: int + self, + batch: MultiAgentBatch, + *, + minibatch_size: int, + num_iters: int, + reduce_fn: Optional[Callable[[ResultDict], ResultDict]] = _reduce_mean_results ) -> Mapping[str, Any]: """Do `num_iters` minibatch updates given the original batch. @@ -117,6 +124,7 @@ def fit( minibatch_size: The size of the minibatch to use for each update. num_iters: The number of complete passes over all the sub-batches in the input multi-agent batch. + reduce_fn: See `update()` documenation for more details. Returns: A dictionary of results summarizing the statistics of the updates. @@ -153,7 +161,7 @@ def fit( # clear what the correct value should be. Since training does not depend on # this it will be fine for now. minibatch = MultiAgentBatch(minibatch, len(batch)) - results.append(self.update(minibatch)) + results.append(self.update(minibatch, reduce_fn=reduce_fn)) # return the average of the results using tree map # TODO (Kourosh): There should be system for reporting back metrics from @@ -161,19 +169,34 @@ def fit( # concatenated. return tree.map_structure(lambda *x: np.mean(x), *results) - def update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: - """Do one gradient based update to the RLTrainer(s) maintained by this TrainerRunner. + def update(self, + batch: MultiAgentBatch, + *, + reduce_fn: Optional[Callable[[ResultDict], ResultDict]] = _reduce_mean_results + ) -> List[Mapping[str, Any]]: + """Do one gradient based update to the RLTrainer(s). Args: batch: The data to use for the update. + reduce_fn: A function to reduce the results from a list of RLTrainer Actors + into a single result. This can be any arbitrary function that takes a + list of dictionaries and returns a single dictionary. For example you + can either take an average (default) or concatenate the results (for + example for metrics) or be more selective about you want to report back + to the algorithm's training_step. If None is passed, the results will + not get reduced. 
Returns: A list of dictionaries of results from the updates from the RLTrainer(s) """ if self.is_local: - return [self._trainer.update(batch)] + results = [self._trainer.update(batch)] else: - return self._distributed_update(batch) + results = self._distributed_update(batch) + + if reduce_fn is None: + return results + return reduce_fn(results) def _distributed_update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]]: """Do a gradient based update to the RLTrainers using DDP training. @@ -203,10 +226,13 @@ def _distributed_update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]] refs.append(worker.update.remote(new_batch)) results = ray.get(refs) - # take an average across the result of all actors - return tree.map_structure(lambda *x: np.mean(x), *results) + return results - def additional_update(self, *args, **kwargs) -> List[Mapping[str, Any]]: + def additional_update(self, + *, + reduce_fn: Optional[Callable[[ResultDict], ResultDict]] = _reduce_mean_results, + **kwargs + ) -> List[Mapping[str, Any]]: """Apply additional non-gradient based updates to the RLTrainers. For example, this could be used to do a polyak averaging update @@ -215,7 +241,7 @@ def additional_update(self, *args, **kwargs) -> List[Mapping[str, Any]]: By default this is a pass through that calls `RLTrainer.additional_update` Args: - *args: Arguments to pass to each RLTrainer. + reduce_fn: See `update()` documentation for more details. **kwargs: Keyword arguments to pass to each RLTrainer. Returns: @@ -223,12 +249,15 @@ def additional_update(self, *args, **kwargs) -> List[Mapping[str, Any]]: """ if self.is_local: - return [self._trainer.additional_update(*args, **kwargs)] + results = [self._trainer.additional_update(**kwargs)] else: refs = [] for worker in self._workers: - refs.append(worker.additional_update.remote(*args, **kwargs)) - return ray.get(refs) + refs.append(worker.additional_update.remote(**kwargs)) + results = ray.get(refs) + if reduce_fn is None: + return results + return reduce_fn(results) def add_module( self, @@ -301,8 +330,11 @@ def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any worker = next(iter(self._workers)) return ray.get(worker.get_weights.remote(module_ids)) - def get_state(self) -> List[Mapping[ModuleID, Mapping[str, Any]]]: - """Get the states of the RLTrainers""" + def get_state(self) -> Mapping[ModuleID, Mapping[str, Any]]: + """Get the states of the first RLTrainers. + + This should be the same across RLTrainers + """ if self.is_local: return self._trainer.get_state() else: diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 2ea7896e04f0..b49a08ec8eef 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1203,7 +1203,8 @@ def _get_num_gpus_for_policy(self) -> int: elif worker_idx == 0: # if we are in the new rl trainer world num_gpus is deprecated. # so use num_gpus_per_worker for policy sampling - if self.config["_enable_rl_trainer_api"]: + # we need this .get() syntax here to ensure backwards compatibility. + if self.config.get("_enable_rl_trainer_api", False): num_gpus = self.config["num_gpus_per_worker"] else: # If head node, take num_gpus. 
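The reduce_fn hook introduced in this patch decides how the per-RLTrainer result dicts are folded into the single dict that training_step() sees. A usage sketch (runner is assumed to be a built TrainerRunner and ma_batch a MultiAgentBatch; the custom reducer is hypothetical):

    # Default: results from all RLTrainer actors are mean-reduced into one dict.
    avg_results = runner.update(ma_batch)

    # reduce_fn=None returns the raw list of per-worker result dicts, as the
    # multi-GPU tests above do when they inspect each worker individually.
    raw_results = runner.update(ma_batch, reduce_fn=None)

    # Any callable taking a list of result dicts and returning one dict works,
    # e.g. collecting metrics into lists instead of averaging them:
    def concat_metrics(results):
        merged = {}
        for result in results:
            for key, value in result.items():
                merged.setdefault(key, []).append(value)
        return merged

    concatenated = runner.update(ma_batch, reduce_fn=concat_metrics)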
From b92eee9c78889c764153c19c8e0293ce4cd23a41 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jan 2023 15:40:03 -0800 Subject: [PATCH 093/112] fixed test_ppo_rl_trainer.py Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py index ee5362ca06bf..a7eb642a6697 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py @@ -97,7 +97,7 @@ def test_loss(self): trainer_runner.set_state(state_dict) results = trainer_runner.update(train_batch.as_multi_agent()) - trainer_runner_loss = results[0]["loss"]["total_loss"] + trainer_runner_loss = results["loss"]["total_loss"] check(trainer_runner_loss, policy_loss) From 1c3ccb562be743d6ca1de714ed401e5ced50fb2d Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jan 2023 17:17:45 -0800 Subject: [PATCH 094/112] rerunning ci Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/trainer_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index a5013527af7e..d053869cc208 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -194,6 +194,7 @@ def update(self, else: results = self._distributed_update(batch) + # TODO (Kourosh): Maybe we should use LearnerInfoBuilder() here? if reduce_fn is None: return results return reduce_fn(results) From 37c9fca716c3109ab3c96ca60b6f44b1b069e0e4 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jan 2023 17:18:24 -0800 Subject: [PATCH 095/112] lint Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/ppo/ppo.py | 2 +- rllib/algorithms/ppo/ppo_rl_trainer_config.py | 2 +- .../ppo/torch/ppo_torch_rl_trainer.py | 1 - .../core/rl_trainer/reduce_result_dict_fn.py | 5 +-- rllib/core/rl_trainer/trainer_runner.py | 34 ++++++++++--------- 5 files changed, 23 insertions(+), 21 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index e9fd7bac8ea4..22e4eaae9692 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -216,7 +216,7 @@ def training( self.lr_schedule = lr_schedule if use_critic is not NotProvided: self.use_critic = use_critic - # TODO (Kourosh) This is experimental. Set rl_trainer_hps parameters as + # TODO (Kourosh) This is experimental. Set rl_trainer_hps parameters as # well. Don't forget to remote .use_critic from algorithm config. 
self._rl_trainer_hps.use_critic = use_critic if use_gae is not NotProvided: diff --git a/rllib/algorithms/ppo/ppo_rl_trainer_config.py b/rllib/algorithms/ppo/ppo_rl_trainer_config.py index 228157783378..2f616ca45787 100644 --- a/rllib/algorithms/ppo/ppo_rl_trainer_config.py +++ b/rllib/algorithms/ppo/ppo_rl_trainer_config.py @@ -3,6 +3,7 @@ from ray.rllib.core.rl_trainer.rl_trainer import RLTrainerHPs + @dataclass class PPORLTrainerHPs(RLTrainerHPs): """Hyperparameters for the PPO RL Trainer""" @@ -18,4 +19,3 @@ class PPORLTrainerHPs(RLTrainerHPs): # experimental placeholder for things that could be part of the base RLTrainerHPs lr_schedule: Optional[List[List[Union[int, float]]]] = None entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = None - \ No newline at end of file diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py index 60ee2726afb3..ab65abf22eb8 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_trainer.py @@ -16,7 +16,6 @@ logger = logging.getLogger(__name__) - class PPOTorchRLTrainer(TorchRLTrainer): """Implements PPO loss / update logic on top of TorchRLTrainer. diff --git a/rllib/core/rl_trainer/reduce_result_dict_fn.py b/rllib/core/rl_trainer/reduce_result_dict_fn.py index f5c3929dbb8b..47ca597d2b11 100644 --- a/rllib/core/rl_trainer/reduce_result_dict_fn.py +++ b/rllib/core/rl_trainer/reduce_result_dict_fn.py @@ -2,15 +2,16 @@ from typing import List import numpy as np -import tree # pip install dm-tree +import tree # pip install dm-tree from ray.rllib.utils.typing import ResultDict + def _reduce_mean_results(results: List[ResultDict]) -> ResultDict: """Takes the average of all the leaves in the result dict Args: results: list of result dicts to average - + Returns: Averaged result dict """ diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index d053869cc208..21aa44ecf448 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -105,12 +105,12 @@ def is_local(self) -> bool: return self._is_local def fit( - self, - batch: MultiAgentBatch, + self, + batch: MultiAgentBatch, *, - minibatch_size: int, + minibatch_size: int, num_iters: int, - reduce_fn: Optional[Callable[[ResultDict], ResultDict]] = _reduce_mean_results + reduce_fn: Optional[Callable[[ResultDict], ResultDict]] = _reduce_mean_results, ) -> Mapping[str, Any]: """Do `num_iters` minibatch updates given the original batch. @@ -169,21 +169,22 @@ def fit( # concatenated. return tree.map_structure(lambda *x: np.mean(x), *results) - def update(self, + def update( + self, batch: MultiAgentBatch, *, - reduce_fn: Optional[Callable[[ResultDict], ResultDict]] = _reduce_mean_results + reduce_fn: Optional[Callable[[ResultDict], ResultDict]] = _reduce_mean_results, ) -> List[Mapping[str, Any]]: """Do one gradient based update to the RLTrainer(s). Args: batch: The data to use for the update. - reduce_fn: A function to reduce the results from a list of RLTrainer Actors - into a single result. This can be any arbitrary function that takes a - list of dictionaries and returns a single dictionary. For example you - can either take an average (default) or concatenate the results (for - example for metrics) or be more selective about you want to report back - to the algorithm's training_step. 
If None is passed, the results will + reduce_fn: A function to reduce the results from a list of RLTrainer Actors + into a single result. This can be any arbitrary function that takes a + list of dictionaries and returns a single dictionary. For example you + can either take an average (default) or concatenate the results (for + example for metrics) or be more selective about you want to report back + to the algorithm's training_step. If None is passed, the results will not get reduced. Returns: @@ -194,7 +195,7 @@ def update(self, else: results = self._distributed_update(batch) - # TODO (Kourosh): Maybe we should use LearnerInfoBuilder() here? + # TODO (Kourosh): Maybe we should use LearnerInfoBuilder() here? if reduce_fn is None: return results return reduce_fn(results) @@ -229,10 +230,11 @@ def _distributed_update(self, batch: MultiAgentBatch) -> List[Mapping[str, Any]] results = ray.get(refs) return results - def additional_update(self, + def additional_update( + self, *, reduce_fn: Optional[Callable[[ResultDict], ResultDict]] = _reduce_mean_results, - **kwargs + **kwargs, ) -> List[Mapping[str, Any]]: """Apply additional non-gradient based updates to the RLTrainers. @@ -333,7 +335,7 @@ def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any def get_state(self) -> Mapping[ModuleID, Mapping[str, Any]]: """Get the states of the first RLTrainers. - + This should be the same across RLTrainers """ if self.is_local: From 168724155e4533f7e3d710810dfc398a2921f524 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jan 2023 17:31:55 -0800 Subject: [PATCH 096/112] added TODO Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/trainer_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 21aa44ecf448..ed1400765980 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -133,6 +133,8 @@ def fit( start = {mid: 0 for mid in batch.policy_batches.keys()} num_covered_epochs = {mid: 0 for mid in batch.policy_batches.keys()} results = [] + # TODO (Kourosh): One data transfer is probably better than many for each mini + # batch. How should we do this? 
# loop until the number of passes through all modules batches reaches the # num_iters while min(num_covered_epochs.values()) < num_iters: From d3ee81a1c9a6837ee56d937e1efecc5479350abd Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 31 Jan 2023 08:48:22 -0800 Subject: [PATCH 097/112] empty commit Signed-off-by: Kourosh Hakhamaneshi From 3ff0668ddf4a705c2579889ef395474db571de58 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 31 Jan 2023 12:56:29 -0800 Subject: [PATCH 098/112] fixed weights to numpy Signed-off-by: Kourosh Hakhamaneshi --- rllib/core/rl_trainer/trainer_runner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index ed1400765980..b0125669b41e 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -6,6 +6,7 @@ import ray from ray.rllib.utils.typing import ResultDict +from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.core.rl_trainer.reduce_result_dict_fn import _reduce_mean_results from ray.rllib.policy.sample_batch import concat_samples from ray.rllib.core.rl_module.rl_module import ( @@ -330,10 +331,12 @@ def set_weights(self, weights) -> None: def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any]: if self.is_local: - return self._trainer.get_weights(module_ids) + weights = self._trainer.get_weights(module_ids) else: worker = next(iter(self._workers)) - return ray.get(worker.get_weights.remote(module_ids)) + weights = ray.get(worker.get_weights.remote(module_ids)) + + return convert_to_numpy(weights) def get_state(self) -> Mapping[ModuleID, Mapping[str, Any]]: """Get the states of the first RLTrainers. From d91eca528c5f795917e7e186a02f011e4e7c1f7c Mon Sep 17 00:00:00 2001 From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Date: Tue, 31 Jan 2023 08:22:28 -0800 Subject: [PATCH 099/112] [release] minor fix to pytorch_pbt_failure test when using gpu. (#32070) Signed-off-by: xwjiang2010 --- .../train/examples/pytorch/tune_cifar_torch_pbt_example.py | 7 +++++-- rllib/env/multi_agent_env.py | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py index 90846eb84824..196051971129 100644 --- a/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py +++ b/python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py @@ -70,6 +70,10 @@ def train_func(config): model = resnet18() + # Note that `prepare_model` needs to be called before setting optimizer. + if not session.get_checkpoint(): # fresh start + model = train.torch.prepare_model(model) + # Create optimizer. optimizer_config = { "lr": config.get("lr"), @@ -84,6 +88,7 @@ def train_func(config): # Load in model model_state = checkpoint_dict["model"] model.load_state_dict(model_state) + model = train.torch.prepare_model(model) # Load in optimizer optimizer_state = checkpoint_dict["optimizer_state_dict"] @@ -97,8 +102,6 @@ def train_func(config): checkpoint_epoch = checkpoint_dict["epoch"] starting_epoch = checkpoint_epoch + 1 - model = train.torch.prepare_model(model) - # Load in training and validation data. 
transform_train = transforms.Compose( [ diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py index dd78c4536f05..0cc307075771 100644 --- a/rllib/env/multi_agent_env.py +++ b/rllib/env/multi_agent_env.py @@ -536,6 +536,12 @@ def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): @override(MultiAgentEnv) def step(self, action_dict): obs, rew, terminated, truncated, info = {}, {}, {}, {}, {} + + # the environment is expecting actions for all sub-envs + if set(action_dict) != self._agent_ids: + missings = self._agent_ids - set(action_dict) + raise ValueError(f"Missing actions for agent ids: {missings}") + for i, action in action_dict.items(): obs[i], rew[i], terminated[i], truncated[i], info[i] = self.envs[ i From e29021de8dbfdf4f344de387f221ab877ecaa7d6 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 31 Jan 2023 14:03:19 -0800 Subject: [PATCH 100/112] error out when no agent is passed in in the indepenent MARL case Signed-off-by: Kourosh Hakhamaneshi --- rllib/env/multi_agent_env.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py index 0cc307075771..a65b8c316073 100644 --- a/rllib/env/multi_agent_env.py +++ b/rllib/env/multi_agent_env.py @@ -537,10 +537,11 @@ def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): def step(self, action_dict): obs, rew, terminated, truncated, info = {}, {}, {}, {}, {} - # the environment is expecting actions for all sub-envs - if set(action_dict) != self._agent_ids: - missings = self._agent_ids - set(action_dict) - raise ValueError(f"Missing actions for agent ids: {missings}") + # the environment is expecting action for at least one agent + if len(action_dict) == 0: + raise ValueError( + "The environment is expecting action for at least one agent." + ) for i, action in action_dict.items(): obs[i], rew[i], terminated[i], truncated[i], info[i] = self.envs[ From f28a3857b506e4dc9fc665a11f9b3299e2e17691 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 31 Jan 2023 15:01:08 -0800 Subject: [PATCH 101/112] 1. set resources for trainable 2. convert_to_numpy weights on RLTrainer get_weights() Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/algorithm.py | 33 ++++- .../core/rl_trainer/torch/torch_rl_trainer.py | 5 +- rllib/core/rl_trainer/trainer_runner.py | 2 +- .../rl_trainer/multi_agent_cartpole_ppo.py | 122 ++++++++++++++++++ 4 files changed, 156 insertions(+), 6 deletions(-) create mode 100644 rllib/examples/rl_trainer/multi_agent_cartpole_ppo.py diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 24cb4f2b7671..7eaf7a5e9a53 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -2136,10 +2136,13 @@ def default_resource_request( eval_cf.freeze() # resources for local worker - local_worker = { - "CPU": cf.num_cpus_for_local_worker, - "GPU": 0 if cf._fake_gpus else cf.num_gpus, - } + if cf._enable_rl_trainer_api: + local_worker = {"CPU": cf.num_cpus_for_local_worker, "GPU": 0} + else: + local_worker = { + "CPU": cf.num_cpus_for_local_worker, + "GPU": 0 if cf._fake_gpus else cf.num_gpus, + } bundles = [local_worker] @@ -2183,6 +2186,28 @@ def default_resource_request( bundles += rollout_workers + evaluation_bundle + if cf._enable_rl_trainer_api: + # resources for the trainer + if cf.num_trainer_workers == 0: + # if num_trainer_workers is 0, then we need to allocate one gpu if + # num_gpus_per_trainer_worker is greater than 0. 
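                # Worked example (illustrative only, with assumed values): for
                # num_cpus_per_trainer_worker=4 and num_gpus_per_trainer_worker=1,
                # this local-mode branch produces the single bundle
                # [{"CPU": 4, "GPU": 1}], whereas num_trainer_workers=2 falls through
                # to the else-branch below and appends one {"CPU": 4, "GPU": 1} bundle
                # per trainer worker.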
+ trainer_bundle = [ + { + "CPU": cf.num_cpus_per_trainer_worker, + "GPU": int(cf.num_gpus_per_trainer_worker > 0), + } + ] + else: + trainer_bundle = [ + { + "CPU": cf.num_cpus_per_trainer_worker, + "GPU": cf.num_gpus_per_trainer_worker, + } + for _ in range(cf.num_trainer_workers) + ] + + bundles += trainer_bundle + # Return PlacementGroupFactory containing all needed resources # (already properly defined as device bundles). return PlacementGroupFactory( diff --git a/rllib/core/rl_trainer/torch/torch_rl_trainer.py b/rllib/core/rl_trainer/torch/torch_rl_trainer.py index 7744e0f0b0d8..1ce1762efe74 100644 --- a/rllib/core/rl_trainer/torch/torch_rl_trainer.py +++ b/rllib/core/rl_trainer/torch/torch_rl_trainer.py @@ -29,6 +29,7 @@ from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.framework import try_import_torch @@ -156,7 +157,9 @@ def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any if module_ids is None: return module_weights - return {k: v for k, v in module_weights.items() if k in module_ids} + return convert_to_numpy( + {k: v for k, v in module_weights.items() if k in module_ids} + ) def set_weights(self, weights: Mapping[str, Any]) -> None: """Sets the state of the underlying MultiAgentRLModule""" diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index b0125669b41e..7e1841b5cb6e 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -335,7 +335,7 @@ def get_weights(self, module_ids: Optional[Set[str]] = None) -> Mapping[str, Any else: worker = next(iter(self._workers)) weights = ray.get(worker.get_weights.remote(module_ids)) - + return convert_to_numpy(weights) def get_state(self) -> Mapping[ModuleID, Mapping[str, Any]]: diff --git a/rllib/examples/rl_trainer/multi_agent_cartpole_ppo.py b/rllib/examples/rl_trainer/multi_agent_cartpole_ppo.py new file mode 100644 index 000000000000..08172bcea1c3 --- /dev/null +++ b/rllib/examples/rl_trainer/multi_agent_cartpole_ppo.py @@ -0,0 +1,122 @@ +"""Simple example of setting up a multi-agent policy mapping. + +Control the number of agents and policies via --num-agents and --num-policies. + +This works with hundreds of agents and policies, but note that initializing +many TF policies will take some time. + +Also, TF evals might slow down with large numbers of policies. To debug TF +execution, set the TF_TIMELINE_DIR environment variable. 
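A typical invocation (illustrative only; the flags are defined by this script's
argparse section below) might look like:

    python multi_agent_cartpole_ppo.py --framework=torch --num-agents=4 --num-policies=2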
+""" + +import argparse +import os +import random + +import ray +from ray import tune, air +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.examples.env.multi_agent import MultiAgentCartPole +from ray.rllib.policy.policy import PolicySpec +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.test_utils import check_learning_achieved + + +tf1, tf, tfv = try_import_tf() + +parser = argparse.ArgumentParser() + +parser.add_argument("--num-agents", type=int, default=4) +parser.add_argument("--num-policies", type=int, default=2) +parser.add_argument( + "--framework", + choices=["tf2", "torch"], # tf will be deprecated with the new RLTrainer stack + default="torch", + help="The DL framework specifier.", +) + +parser.add_argument( + "--num-gpus", + type=int, + default=int(os.environ.get("RLLIB_NUM_GPUS", "0")), + help="Number of GPUs to use for training.", +) + +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) + +parser.add_argument( + "--stop-iters", type=int, default=20, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=50000, help="Number of timesteps to train." +) + +parser.add_argument( + "--stop-reward-per-agent", + type=float, + default=150.0, + help="Min. reward per agent at which we stop training.", +) + +if __name__ == "__main__": + args = parser.parse_args() + + ray.init() + + # Each policy can have a different configuration (including custom model). + def gen_policy(i): + gammas = [0.95, 0.99] + # just change the gammas between the two policies. + # changing the module is not a critical part of this example. + # the important part is that the policies are different. + config = { + "gamma": gammas[i % len(gammas)], + } + + return PolicySpec(config=config) + + # Setup PPO with an ensemble of `num_policies` different policies. 
+ policies = {"policy_{}".format(i): gen_policy(i) for i in range(args.num_policies)} + policy_ids = list(policies.keys()) + + def policy_mapping_fn(agent_id, episode, worker, **kwargs): + pol_id = random.choice(policy_ids) + return pol_id + + scaling_config = { + "num_trainer_workers": args.num_gpus, + "num_gpus_per_trainer_worker": int(args.num_gpus > 0), + } + + config = ( + PPOConfig() + .rollouts(rollout_fragment_length=500) + .environment(MultiAgentCartPole, env_config={"num_agents": args.num_agents}) + .framework(args.framework) + .training(num_sgd_iter=10) + .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) + .rl_module(_enable_rl_module_api=True) + .training(_enable_rl_trainer_api=True) + .resources(**scaling_config) + ) + + stop = { + "episode_reward_mean": args.stop_reward_per_agent * args.num_agents, + "timesteps_total": args.stop_timesteps, + "training_iteration": args.stop_iters, + } + + results = tune.Tuner( + "PPO", + param_space=config.to_dict(), + run_config=air.RunConfig(stop=stop, verbose=1), + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() From 993932fe627df0c4bc2b6ef8894eedd08cda768d Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 31 Jan 2023 15:29:40 -0800 Subject: [PATCH 102/112] added examples as a unittest to BUILD kite Signed-off-by: Kourosh Hakhamaneshi --- .buildkite/pipeline.ml.yml | 2 +- rllib/BUILD | 39 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index 7db645a5b83e..3d095e7a932b 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -177,7 +177,7 @@ - RLLIB_TESTING=1 ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_tag_filters=examples,-multi_gpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + --test_tag_filters=examples,-multi_gpu,-gpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... - label: ":brain: RLlib: tests/ dir" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_DIRECTLY_AFFECTED"] diff --git a/rllib/BUILD b/rllib/BUILD index d622d0fad4d5..110dc8bda2f8 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -3848,6 +3848,45 @@ py_test( ] ) +# -------------------------------------------------------------------- +# examples/rl_trainer directory +# +# Tag: rlm +# +# Description: These are RLlib tests for the new multi-gpu enabled +# training stack via RLTrainers. +# +# NOTE: Add tests alphabetically to this list. 
+# -------------------------------------------------------------------- + +py_test( + name = "examples/rl_trainer/multi_agent_cartpole_ppo_torch", + main = "examples/rl_trainer/multi_agent_cartpole_ppo.py", + tags = ["team:rllib", "exclusive", "examples", "no-gpu"], + size = "medium", + srcs = ["examples/rl_trainer/multi_agent_cartpole_ppo.py"], + args = ["--as-test", "--framework=torch", "--num-gpus=0"] +) + +py_test( + name = "examples/rl_trainer/multi_agent_cartpole_ppo_torch_gpu", + main = "examples/rl_trainer/multi_agent_cartpole_ppo.py", + tags = ["team:rllib", "exclusive", "examples", "gpu"], + size = "medium", + srcs = ["examples/rl_trainer/multi_agent_cartpole_ppo.py"], + args = ["--as-test", "--framework=torch", "--num-gpus=1"] +) + + +py_test( + name = "examples/rl_trainer/multi_agent_cartpole_ppo_torch_gpu", + main = "examples/rl_trainer/multi_agent_cartpole_ppo.py", + tags = ["team:rllib", "exclusive", "examples", "multi-gpu"], + size = "medium", + srcs = ["examples/rl_trainer/multi_agent_cartpole_ppo.py"], + args = ["--as-test", "--framework=torch", "--num-gpus=2"] +) + # -------------------------------------------------------------------- # examples/documentation directory # From 05c82977a9fb32b38cb79e9a5cbbe2d0470c8bdc Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 31 Jan 2023 15:58:55 -0800 Subject: [PATCH 103/112] fixed test name conflict Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/BUILD b/rllib/BUILD index 110dc8bda2f8..adf6fd6648e8 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -3879,7 +3879,7 @@ py_test( py_test( - name = "examples/rl_trainer/multi_agent_cartpole_ppo_torch_gpu", + name = "examples/rl_trainer/multi_agent_cartpole_ppo_torch_multi_gpu", main = "examples/rl_trainer/multi_agent_cartpole_ppo.py", tags = ["team:rllib", "exclusive", "examples", "multi-gpu"], size = "medium", From 839ff90bf666cb92f707069438f0eeb362b316dd Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 31 Jan 2023 16:00:57 -0800 Subject: [PATCH 104/112] removed the wrong tag from docs Signed-off-by: Kourosh Hakhamaneshi --- rllib/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/rllib/BUILD b/rllib/BUILD index adf6fd6648e8..a6decf28caf6 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -3851,7 +3851,6 @@ py_test( # -------------------------------------------------------------------- # examples/rl_trainer directory # -# Tag: rlm # # Description: These are RLlib tests for the new multi-gpu enabled # training stack via RLTrainers. 
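Note on running the targets above: they are selected by their tags rather than by
name in CI. Assuming the tag filters shown in the .buildkite change earlier in this
series, a GPU-only pass would look roughly like:

    bazel test --config=ci --build_tests_only --test_tag_filters=gpu rllib/...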
From 8466fc8a2ec4eb9d7433a008c3731522deb4fb1c Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 31 Jan 2023 20:31:57 -0800 Subject: [PATCH 105/112] fixed as test flag Signed-off-by: Kourosh Hakhamaneshi --- rllib/examples/rl_trainer/multi_agent_cartpole_ppo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rllib/examples/rl_trainer/multi_agent_cartpole_ppo.py b/rllib/examples/rl_trainer/multi_agent_cartpole_ppo.py index 08172bcea1c3..319b0b432f83 100644 --- a/rllib/examples/rl_trainer/multi_agent_cartpole_ppo.py +++ b/rllib/examples/rl_trainer/multi_agent_cartpole_ppo.py @@ -105,8 +105,9 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): .resources(**scaling_config) ) + stop_reward = args.stop_reward_per_agent * args.num_agents stop = { - "episode_reward_mean": args.stop_reward_per_agent * args.num_agents, + "episode_reward_mean": stop_reward, "timesteps_total": args.stop_timesteps, "training_iteration": args.stop_iters, } @@ -118,5 +119,5 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): ).fit() if args.as_test: - check_learning_achieved(results, args.stop_reward) + check_learning_achieved(results, stop_reward) ray.shutdown() From d78f3d53d9d97797243f50199c0c893db0f398e4 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 6 Feb 2023 10:13:16 -0800 Subject: [PATCH 106/112] made the sync_weights equivalent to the implementation before this PR Signed-off-by: Kourosh Hakhamaneshi --- rllib/evaluation/worker_set.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 74af3218b47c..e5e7243df6d3 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -387,12 +387,15 @@ def sync_weights( global_vars: Optional[Dict[str, TensorType]] = None, timeout_seconds: Optional[int] = 0, ) -> None: - """Syncs model weights from the local worker to all remote workers. + """Syncs model weights from the given weight source to all remote workers. + + Weight source can be either a (local) rollout worker or a trainer runner. It + should just implement a `get_weights` method. Args: policies: Optional list of PolicyIDs to sync weights for. If None (default), sync weights to/from all policies. - from_worker_or_trainer: Optional local RolloutWorker instance or + from_worker_or_trainer: Optional (local) RolloutWorker instance or TrainerRunner instance to sync from. If None (default), sync from this WorkerSet's local worker. to_worker_indices: Optional list of worker indices to sync the @@ -412,10 +415,15 @@ def sync_weights( # Only sync if we have remote workers or `from_worker_or_trainer` is provided. weights = None - worker_or_trainer = None if self.num_remote_workers() or from_worker_or_trainer is not None: - worker_or_trainer = from_worker_or_trainer or self.local_worker() - weights = worker_or_trainer.get_weights(policies) + weights_src = from_worker_or_trainer or self.local_worker() + + if weights_src is None: + raise ValueError( + "`from_worker_or_trainer` is None. In this case, workerset " + "should have local_worker. But local_worker is also None." + ) + weights = weights_src.get_weights(policies) def set_weight(w): w.set_weights(weights, global_vars) @@ -435,7 +443,7 @@ def set_weight(w): # If `from_worker` is provided, also sync to this WorkerSet's # local worker. 
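        # Illustrative usage of the broadened signature documented above (sketch
        # only, not part of this patch): after a TrainerRunner update, the algorithm
        # can push fresh weights to all rollout workers with
        #
        #     workers.sync_weights(from_worker_or_trainer=trainer_runner)
        #
        # because the runner exposes the same get_weights() interface as a local
        # RolloutWorker.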
        if self.local_worker() is not None:
-            if worker_or_trainer is not None:
+            if from_worker_or_trainer is not None:
                self.local_worker().set_weights(weights, global_vars=global_vars)
        # If `global_vars` is provided and local worker exists -> Update its
        # global_vars.

From 7a68bb432d9dcdc18093d48c396789fe7d920d4c Mon Sep 17 00:00:00 2001
From: Kourosh Hakhamaneshi
Date: Mon, 6 Feb 2023 16:57:10 -0800
Subject: [PATCH 107/112] addressed Jun's comments, created a
 MiniBatchCyclicIterator

Signed-off-by: Kourosh Hakhamaneshi
---
 rllib/algorithms/algorithm.py           |  2 +-
 rllib/algorithms/algorithm_config.py    |  7 +++
 rllib/algorithms/ppo/ppo.py             |  5 +-
 rllib/core/rl_trainer/trainer_runner.py | 33 ++-----
 rllib/utils/minibatch_utils.py          | 61 +++++++++++++++++++++++++
 5 files changed, 76 insertions(+), 32 deletions(-)
 create mode 100644 rllib/utils/minibatch_utils.py

diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py
index 7eaf7a5e9a53..4f835a151969 100644
--- a/rllib/algorithms/algorithm.py
+++ b/rllib/algorithms/algorithm.py
@@ -2194,7 +2194,7 @@ def default_resource_request(
             trainer_bundle = [
                 {
                     "CPU": cf.num_cpus_per_trainer_worker,
-                    "GPU": int(cf.num_gpus_per_trainer_worker > 0),
+                    "GPU": cf.num_gpus_per_trainer_worker,
                 }
             ]
         else:
diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py
index 63f6541ff140..96e777b458b5 100644
--- a/rllib/algorithms/algorithm_config.py
+++ b/rllib/algorithms/algorithm_config.py
@@ -883,6 +883,13 @@ def validate(self) -> None:
             rl_module_class_path = self.get_default_rl_module_class()
             self.rl_module_class = _resolve_class_path(rl_module_class_path)
 
+        # make sure the resource requirements for the trainer runner are valid
+        if self.num_trainer_workers == 0 and self.num_gpus_per_worker > 1:
+            raise ValueError(
+                "num_gpus_per_worker must be 0 (cpu) or 1 (gpu) when using local mode "
+                "(i.e. num_trainer_workers = 0)"
+            )
+
         # resolve rl_trainer class
         if self._enable_rl_trainer_api and self.rl_trainer_class is None:
             rl_trainer_class_path = self.get_default_rl_trainer_class()
diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py
index 22e4eaae9692..7108241fd68c 100644
--- a/rllib/algorithms/ppo/ppo.py
+++ b/rllib/algorithms/ppo/ppo.py
@@ -217,7 +217,7 @@ def training(
         if use_critic is not NotProvided:
             self.use_critic = use_critic
             # TODO (Kourosh) This is experimental. Set rl_trainer_hps parameters as
-            # well. Don't forget to remote .use_critic from algorithm config.
+            # well. Don't forget to remove .use_critic from algorithm config.
             self._rl_trainer_hps.use_critic = use_critic
         if use_gae is not NotProvided:
             self.use_gae = use_gae
@@ -415,6 +415,9 @@ def training_step(self) -> ResultDict:
             # subtract that to get the total set of pids to update.
             # TODO (Kourosh): We need to make a better design for the hierarchy of the
             # train results, so that all the policy ids end up in the same level.
+            # TODO (Kourosh): We should also not be using train_results as a message
+            # passing medium to infer which policies to update. We could use the
+            # policies_to_train variable that is given by the user to infer this.
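            # Worked example (illustrative only): if train_results["loss"] were
            # {"total_loss": 0.5, "policy_0": 0.2, "policy_1": 0.3}, the line below
            # would yield policies_to_update == {"policy_0", "policy_1"}.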
policies_to_update = set(train_results["loss"].keys()) - {"total_loss"} else: policies_to_update = list(train_results.keys()) diff --git a/rllib/core/rl_trainer/trainer_runner.py b/rllib/core/rl_trainer/trainer_runner.py index 7e1841b5cb6e..5278971942fa 100644 --- a/rllib/core/rl_trainer/trainer_runner.py +++ b/rllib/core/rl_trainer/trainer_runner.py @@ -7,8 +7,8 @@ from ray.rllib.utils.typing import ResultDict from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.minibatch_utils import MiniBatchCyclicIterator from ray.rllib.core.rl_trainer.reduce_result_dict_fn import _reduce_mean_results -from ray.rllib.policy.sample_batch import concat_samples from ray.rllib.core.rl_module.rl_module import ( RLModule, ModuleID, @@ -131,39 +131,12 @@ def fit( A dictionary of results summarizing the statistics of the updates. """ - start = {mid: 0 for mid in batch.policy_batches.keys()} - num_covered_epochs = {mid: 0 for mid in batch.policy_batches.keys()} - results = [] # TODO (Kourosh): One data transfer is probably better than many for each mini # batch. How should we do this? # loop until the number of passes through all modules batches reaches the # num_iters - while min(num_covered_epochs.values()) < num_iters: - minibatch = {} - for module_id, module_batch in batch.policy_batches.items(): - s = start[module_id] # start - e = s + minibatch_size # end - - samples_to_concat = [] - # cycle through the batch until we have enough samples - while e >= len(module_batch): - samples_to_concat.append(module_batch[s:]) - e = minibatch_size - len(module_batch[s:]) - s = 0 - num_covered_epochs[module_id] += 1 - - samples_to_concat.append(module_batch[s:e]) - - # concatenate all the samples, we should have minibatch_size of sample - # after this step - minibatch[module_id] = concat_samples(samples_to_concat) - # roll miniback to zero when we reach the end of the batch - start[module_id] = e - - # TODO (Kourosh): len(batch) is not correct here. However it's also not - # clear what the correct value should be. Since training does not depend on - # this it will be fine for now. - minibatch = MultiAgentBatch(minibatch, len(batch)) + results = [] + for minibatch in MiniBatchCyclicIterator(batch, minibatch_size, num_iters): results.append(self.update(minibatch, reduce_fn=reduce_fn)) # return the average of the results using tree map diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py new file mode 100644 index 000000000000..19b732ffdf63 --- /dev/null +++ b/rllib/utils/minibatch_utils.py @@ -0,0 +1,61 @@ +from ray.rllib.policy.sample_batch import MultiAgentBatch, concat_samples + + +class MiniBatchCyclicIterator: + """This implements a simple multi-agent minibatch iterator. + + + This iterator will split the input multi-agent batch into minibatches where the + size of batch for each module_id (aka policy_id) is equal to minibatch_size. If the + input batch is smaller than minibatch_size, then the iterator will cycle through + the batch until it has covered num_iters epochs. + + Args: + batch: The input multi-agent batch. + minibatch_size: The size of the minibatch for each module_id. + num_iters: The number of epochs to cover. If the input batch is smaller than + minibatch_size, then the iterator will cycle through the batch until it + has covered num_iters epochs. 
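    Example (sketch only; this mirrors how TrainerRunner.fit() drives the iterator,
    with `runner` and `ma_batch` standing in for a TrainerRunner and a
    MultiAgentBatch):

        iterator = MiniBatchCyclicIterator(ma_batch, minibatch_size=128, num_iters=2)
        results = []
        for minibatch in iterator:
            results.append(runner.update(minibatch))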
+    """
+
+    def __init__(
+        self, batch: MultiAgentBatch, minibatch_size: int, num_iters: int = 1
+    ) -> None:
+        self._batch = batch
+        self._minibatch_size = minibatch_size
+        self._num_iters = num_iters
+
+        # mapping from module_id to the start index of the batch
+        self._start = {mid: 0 for mid in batch.policy_batches.keys()}
+        # mapping from module_id to the number of epochs covered for each module_id
+        self._num_covered_epochs = {mid: 0 for mid in batch.policy_batches.keys()}
+
+    def __iter__(self):
+
+        while min(self._num_covered_epochs.values()) < self._num_iters:
+            minibatch = {}
+            for module_id, module_batch in self._batch.policy_batches.items():
+                s = self._start[module_id]  # start
+                e = s + self._minibatch_size  # end
+
+                samples_to_concat = []
+                # cycle through the batch until we have enough samples
+                while e >= len(module_batch):
+                    samples_to_concat.append(module_batch[s:])
+                    e = self._minibatch_size - len(module_batch[s:])
+                    s = 0
+                    self._num_covered_epochs[module_id] += 1
+
+                samples_to_concat.append(module_batch[s:e])
+
+                # concatenate all the samples, we should have minibatch_size samples
+                # after this step
+                minibatch[module_id] = concat_samples(samples_to_concat)
+                # roll the start index back to zero when we reach the end of the batch
+                self._start[module_id] = e
+
+            # TODO (Kourosh): len(batch) is not correct here. However it's also not
+            # clear what the correct value should be. Since training does not depend on
+            # this it will be fine for now.
+            minibatch = MultiAgentBatch(minibatch, len(self._batch))
+            yield minibatch

From edbf081e0ca2e9220fef2a44156d5666ca942a57 Mon Sep 17 00:00:00 2001
From: Kourosh Hakhamaneshi
Date: Mon, 6 Feb 2023 18:05:00 -0800
Subject: [PATCH 108/112] added annotations

Signed-off-by: Kourosh Hakhamaneshi
---
 rllib/utils/minibatch_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py
index 19b732ffdf63..5b21b57eacee 100644
--- a/rllib/utils/minibatch_utils.py
+++ b/rllib/utils/minibatch_utils.py
@@ -1,6 +1,8 @@
 from ray.rllib.policy.sample_batch import MultiAgentBatch, concat_samples
+from ray.rllib.utils.annotations import DeveloperAPI
 
 
+@DeveloperAPI
 class MiniBatchCyclicIterator:
     """This implements a simple multi-agent minibatch iterator.
 
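To make the cycling semantics above concrete, here is a minimal, dependency-free
sketch (not part of the patch series) that mirrors the iterator's slicing logic,
with plain Python lists standing in for the per-module SampleBatches:

    def cyclic_minibatches(batches, minibatch_size, num_iters):
        # Yield one fixed-size chunk per module each step, cycling through smaller
        # module batches until every module has been covered num_iters times.
        start = {mid: 0 for mid in batches}
        covered = {mid: 0 for mid in batches}
        while min(covered.values()) < num_iters:
            minibatch = {}
            for mid, data in batches.items():
                s, e = start[mid], start[mid] + minibatch_size
                chunk = []
                # wrap around whenever the requested slice runs past the end
                while e >= len(data):
                    chunk += data[s:]
                    e = minibatch_size - len(data[s:])
                    s = 0
                    covered[mid] += 1
                chunk += data[s:e]
                minibatch[mid] = chunk
                start[mid] = e
            yield minibatch

    # Two "policies" with different batch sizes still contribute exactly
    # minibatch_size samples each per yielded minibatch:
    for mb in cyclic_minibatches({"p0": list(range(10)), "p1": list(range(6))}, 4, 1):
        assert all(len(v) == 4 for v in mb.values())

Modules whose batches are smaller than the others simply wrap around more often,
which is why the stopping condition takes the minimum over the covered-epoch
counters.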
From 4c8ce18102c8b18d7e823e186439e07a8c44bbea Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 6 Feb 2023 22:46:20 -0800 Subject: [PATCH 109/112] empty From 9f29038f149e669d831a6c7306c8445b03f08edc Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 7 Feb 2023 09:01:46 -0800 Subject: [PATCH 110/112] empty Signed-off-by: Kourosh Hakhamaneshi From b1d3f63cc8f0a82d9473d9289ce7ffa1bcf81aa1 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 7 Feb 2023 12:43:51 -0800 Subject: [PATCH 111/112] empty Signed-off-by: Kourosh Hakhamaneshi From 653bf3d9ae82b9899c9b174bf0db95cff1423a9b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 7 Feb 2023 22:42:58 -0800 Subject: [PATCH 112/112] fwd fix for the failing test Signed-off-by: Kourosh Hakhamaneshi --- rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py index a7eb642a6697..52d52596058f 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_trainer.py @@ -20,6 +20,10 @@ [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]], dtype=np.float32, ), + SampleBatch.NEXT_OBS: np.array( + [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]], + dtype=np.float32, + ), SampleBatch.ACTIONS: np.array([0, 1, 1]), SampleBatch.PREV_ACTIONS: np.array([0, 1, 1]), SampleBatch.REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32), @@ -57,7 +61,7 @@ def test_loss(self): .training( gamma=0.99, model=dict( - fcnet_hiddens=[10], + fcnet_hiddens=[10, 10], fcnet_activation="linear", vf_share_layers=False, ),
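For context on the forward fix above: with NEXT_OBS now part of the fake batch, a
reduced, hand-built batch for such a test could look like the following sketch
(illustrative only; keys, shapes, and dtypes taken from the test data in this
patch):

    import numpy as np
    from ray.rllib.policy.sample_batch import SampleBatch

    fake_batch = SampleBatch(
        {
            SampleBatch.OBS: np.zeros((3, 4), dtype=np.float32),
            SampleBatch.NEXT_OBS: np.zeros((3, 4), dtype=np.float32),  # newly required
            SampleBatch.ACTIONS: np.array([0, 1, 1]),
            SampleBatch.REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32),
        }
    )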