[RLlib] - Fix APPO RLModule inference-only problems. (#45111)

simonsays1980 authored May 3, 2024
1 parent 6ab48be commit 45d5640
Showing 4 changed files with 64 additions and 22 deletions.
26 changes: 26 additions & 0 deletions rllib/algorithms/appo/appo_rl_module.py
@@ -0,0 +1,26 @@
"""
This file holds framework-agnostic components for APPO's RLModules.
"""

import abc

from ray.rllib.algorithms.ppo.ppo_rl_module import PPORLModule
from ray.rllib.utils.annotations import ExperimentalAPI

# TODO (simon): Write a light-weight version of this class for the `TFRLModule`


@ExperimentalAPI
class APPORLModule(PPORLModule, abc.ABC):
def setup(self):
super().setup()

# If the module is not for inference only, set up the target networks.
if not self.inference_only:
catalog = self.config.get_catalog()
# Old pi and old encoder are the "target networks" that are used for
# the stabilization of the updates of the current pi and encoder.
self.old_pi = catalog.build_pi_head(framework=self.framework)
self.old_encoder = catalog.build_actor_critic_encoder(
framework=self.framework
)
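For orientation, here is a minimal, framework-free sketch (an illustration, not part of the commit) of the pattern this new base class establishes: modules built with `inference_only=True` skip the frozen target networks entirely, so sampling workers carry fewer parameters than the learner.

class ModuleSketch:
    """Illustrative stand-in for an RLModule with an `inference_only` flag."""

    def __init__(self, inference_only: bool):
        self.inference_only = inference_only
        self.pi = "pi-head"       # stand-in for the current policy head
        self.encoder = "encoder"  # stand-in for the actor-critic encoder
        if not inference_only:
            # Learner-only copies that stabilize the updates of `pi`/`encoder`.
            self.old_pi = "old-pi-head"
            self.old_encoder = "old-encoder"

# Sampling workers get the light module; the learner gets the full one.
assert not hasattr(ModuleSketch(inference_only=True), "old_pi")
assert hasattr(ModuleSketch(inference_only=False), "old_pi")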
20 changes: 10 additions & 10 deletions rllib/algorithms/appo/tf/appo_tf_rl_module.py
@@ -1,6 +1,7 @@
 from typing import List
 
 from ray.rllib.algorithms.appo.appo import OLD_ACTION_DIST_LOGITS_KEY
+from ray.rllib.algorithms.appo.appo_rl_module import APPORLModule
 from ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import PPOTfRLModule
 from ray.rllib.core.columns import Columns
 from ray.rllib.core.models.base import ACTOR
@@ -15,18 +16,17 @@
 _, tf, _ = try_import_tf()
 
 
-class APPOTfRLModule(PPOTfRLModule, RLModuleWithTargetNetworksInterface):
+class APPOTfRLModule(PPOTfRLModule, RLModuleWithTargetNetworksInterface, APPORLModule):
     @override(PPOTfRLModule)
     def setup(self):
         super().setup()
-        catalog = self.config.get_catalog()
-        # old pi and old encoder are the "target networks" that are used for
-        # the stabilization of the updates of the current pi and encoder.
-        self.old_pi = catalog.build_pi_head(framework=self.framework)
-        self.old_encoder = catalog.build_actor_critic_encoder(framework=self.framework)
-        self.old_pi.set_weights(self.pi.get_weights())
-        self.old_encoder.set_weights(self.encoder.get_weights())
-        self.old_pi.trainable = False
-        self.old_encoder.trainable = False
+        # If the module is not for inference only, set up the target networks.
+        if not self.inference_only:
+            self.old_pi.set_weights(self.pi.get_weights())
+            self.old_encoder.set_weights(self.encoder.get_weights())
+            self.old_pi.trainable = False
+            self.old_encoder.trainable = False
 
     @override(RLModuleWithTargetNetworksInterface)
     def get_target_network_pairs(self):
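The `get_target_network_pairs()` method kept as context above is how the learner discovers these frozen networks. A hedged sketch (our illustration, not RLlib's actual learner code) of how such (target, current) pairs are typically consumed:

def sync_target_networks(module) -> None:
    # Periodically copy current weights into the frozen targets; the
    # (target, current) ordering is an assumption about the pairs returned
    # by `get_target_network_pairs()`.
    for target_net, current_net in module.get_target_network_pairs():
        # Keras-style weight copy, matching the `set_weights`/`get_weights`
        # calls in the TF module above.
        target_net.set_weights(current_net.get_weights())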
34 changes: 24 additions & 10 deletions rllib/algorithms/appo/torch/appo_torch_rl_module.py
@@ -3,6 +3,7 @@
 from ray.rllib.algorithms.appo.appo import (
     OLD_ACTION_DIST_LOGITS_KEY,
 )
+from ray.rllib.algorithms.appo.appo_rl_module import APPORLModule
 from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule
 from ray.rllib.core.columns import Columns
 from ray.rllib.core.models.base import ACTOR
@@ -14,19 +15,20 @@
 from ray.rllib.utils.nested_dict import NestedDict
 
 
-class APPOTorchRLModule(PPOTorchRLModule, RLModuleWithTargetNetworksInterface):
+class APPOTorchRLModule(
+    PPOTorchRLModule, RLModuleWithTargetNetworksInterface, APPORLModule
+):
     @override(PPOTorchRLModule)
     def setup(self):
         super().setup()
-        catalog = self.config.get_catalog()
-        # Old pi and old encoder are the "target networks" that are used for
-        # the stabilization of the updates of the current pi and encoder.
-        self.old_pi = catalog.build_pi_head(framework=self.framework)
-        self.old_encoder = catalog.build_actor_critic_encoder(framework=self.framework)
-        self.old_pi.load_state_dict(self.pi.state_dict())
-        self.old_encoder.load_state_dict(self.encoder.state_dict())
-        self.old_pi.trainable = False
-        self.old_encoder.trainable = False
+        # If the module is not for inference only, update the target networks.
+        if not self.inference_only:
+            self.old_pi.load_state_dict(self.pi.state_dict())
+            self.old_encoder.load_state_dict(self.encoder.state_dict())
+            # We do not train the targets.
+            self.old_pi.requires_grad_(False)
+            self.old_encoder.requires_grad_(False)
 
     @override(RLModuleWithTargetNetworksInterface)
     def get_target_network_pairs(self):
@@ -47,3 +49,15 @@ def _forward_train(self, batch: NestedDict):
         old_action_dist_logits = self.old_pi(old_pi_inputs_encoded)
         outs[OLD_ACTION_DIST_LOGITS_KEY] = old_action_dist_logits
         return outs
+
+    @override(PPOTorchRLModule)
+    def _set_inference_only_state_dict_keys(self) -> None:
+        # Get the model_parameters from the `PPOTorchRLModule`.
+        super()._set_inference_only_state_dict_keys()
+        # Get the model_parameters.
+        state_dict = self.state_dict()
+        # Note, these keys are only known to the learner module. Furthermore,
+        # we want this to be run once during setup and not for each worker.
+        self._inference_only_state_dict_keys["unexpected_keys"].extend(
+            [name for name in state_dict if "old" in name]
+        )
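The keys recorded here let RLlib drop learner-only parameters (value function, critic encoder, and now APPO's `old_*` target networks) before shipping weights to inference-only workers. A simplified sketch of that filtering; the helper name and the exact direction of the `expected_keys` remapping are our assumptions:

def filter_state_dict_for_inference(state_dict, keys):
    # Drop entries an inference-only module does not have.
    filtered = {
        name: tensor
        for name, tensor in state_dict.items()
        if name not in keys["unexpected_keys"]
    }
    # Remap learner-side names to the names the inference module expects.
    for learner_name, inference_name in keys.get("expected_keys", {}).items():
        if learner_name in filtered:
            filtered[inference_name] = filtered.pop(learner_name)
    return filtered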
6 changes: 4 additions & 2 deletions rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -141,7 +141,9 @@ def _set_inference_only_state_dict_keys(self) -> None:
         # Note, these keys are only known to the learner module. Furthermore,
         # we want this to be run once during setup and not for each worker.
         self._inference_only_state_dict_keys["unexpected_keys"] = [
-            name for name in state_dict if "vf" in name or "critic_encoder" in name
+            name
+            for name in state_dict
+            if "vf" in name or name.startswith("encoder.critic_encoder")
         ]
         # Do we use a separate encoder for the actor and critic?
         # if not self.config.model_config_dict.get("vf_share_layers", True):
@@ -153,7 +155,7 @@ def _set_inference_only_state_dict_keys(self) -> None:
         self._inference_only_state_dict_keys["expected_keys"] = {
             name: name.replace("actor_encoder", "encoder")
             for name in state_dict
-            if "actor_encoder" in name
+            if name.startswith("encoder.actor_encoder")
         }
 
     @override(TorchRLModule)
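Why the switch from substring tests to `startswith` matters: with APPO's frozen target encoder now in the learner's state dict, a plain `in` check would also catch `old_encoder.*` parameters, which the APPO subclass handles separately via its own "old" filter. A small demonstration with assumed key names:

keys = [
    "encoder.actor_encoder.net.0.weight",      # current actor encoder
    "encoder.critic_encoder.net.0.weight",     # critic encoder (learner-only)
    "old_encoder.actor_encoder.net.0.weight",  # APPO target net (learner-only)
]
substring_hits = [k for k in keys if "actor_encoder" in k]
prefix_hits = [k for k in keys if k.startswith("encoder.actor_encoder")]
assert "old_encoder.actor_encoder.net.0.weight" in substring_hits   # wrong match
assert "old_encoder.actor_encoder.net.0.weight" not in prefix_hits  # now excluded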
