
[RLlib] Add systematic APPO learning tests to CI. Combinations of [1CPU|2CPUs|1GPU|2GPUs] + [single-agent|multi-agent] (#46299)
sven1977 authored Jun 27, 2024
1 parent e40d489 commit 3138f73
Showing 5 changed files with 61 additions and 10 deletions.
.buildkite/rllib.rayci.yml (3 changes: 2 additions & 1 deletion)

@@ -107,7 +107,7 @@ steps:
     tags:
       - rllib
       - gpu
-    parallelism: 2
+    parallelism: 3
     instance_type: gpu
     commands:
       - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
@@ -199,6 +199,7 @@ steps:
     tags:
       - rllib
       - gpu
+    parallelism: 2
    instance_type: gpu-large
     commands:
       - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
rllib/BUILD (48 changes: 48 additions & 0 deletions)

@@ -160,6 +160,30 @@ py_test(
     srcs = ["tuned_examples/appo/cartpole_appo.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
 )
+py_test(
+    name = "learning_tests_cartpole_appo_gpu",
+    main = "tuned_examples/appo/cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_cartpole_appo_multi_cpu",
+    main = "tuned_examples/appo/cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
+py_test(
+    name = "learning_tests_cartpole_appo_multi_gpu",
+    main = "tuned_examples/appo/cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
 py_test(
     name = "learning_tests_multi_agent_cartpole_appo",
     main = "tuned_examples/appo/multi_agent_cartpole_appo.py",
@@ -168,6 +192,30 @@ py_test(
     srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1"]
 )
+py_test(
+    name = "learning_tests_multi_agent_cartpole_appo_gpu",
+    main = "tuned_examples/appo/multi_agent_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1", "--num-cpus=6"]
+)
+py_test(
+    name = "learning_tests_multi_agent_cartpole_appo_multi_cpu",
+    main = "tuned_examples/appo/multi_agent_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
+)
+py_test(
+    name = "learning_tests_multi_agent_cartpole_appo_multi_gpu",
+    main = "tuned_examples/appo/multi_agent_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_cartpole", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
+)
 
 #@OldAPIStack
 py_test(
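The three new suffixes per agent mode follow a single naming and argument scheme, completing the [1CPU|2CPUs|1GPU|2GPUs] x [single-agent|multi-agent] matrix from the commit title (the plain 1-CPU targets presumably already existed). The following sketch of that matrix is for illustration only; it is not code from rllib/BUILD, the multi-agent targets additionally pin --num-cpus (omitted here), and reading --num-gpus as "number of learners, placed on GPUs only when the target carries a gpu or multi_gpu tag" is an assumption about the tuned-example scripts:

# Illustration of the APPO learning-test matrix (not repository code).
ADDED_SUFFIXES = {
    "_gpu":       ["--num-gpus=1"],  # 1 learner on 1 GPU ("gpu"-tagged CI job)
    "_multi_cpu": ["--num-gpus=2"],  # 2 learners on a CPU machine (no gpu/multi_gpu tag)
    "_multi_gpu": ["--num-gpus=2"],  # 2 learners on 2 GPUs ("multi_gpu"-tagged job)
}

for base, agent_args in [
    ("learning_tests_cartpole_appo", []),
    ("learning_tests_multi_agent_cartpole_appo", ["--num-agents=2"]),
]:
    for suffix, hw_args in ADDED_SUFFIXES.items():
        print(base + suffix, ["--as-test", "--enable-new-api-stack", *agent_args, *hw_args])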
rllib/algorithms/appo/appo.py (2 changes: 0 additions & 2 deletions)

@@ -100,8 +100,6 @@ def __init__(self, algo_class=None):
 
         # Override some of ImpalaConfig's default values with APPO-specific values.
         self.num_env_runners = 2
-        self.rollout_fragment_length = 50
-        self.train_batch_size = 500
         self.min_time_s_per_iteration = 10
         self.num_gpus = 0
         self.num_multi_gpu_tower_stacks = 1
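Removing these two overrides means APPOConfig now inherits the corresponding defaults from ImpalaConfig (the impala.py hunks below show rollout_fragment_length = 50 there). Experiments that relied on APPO's former train_batch_size = 500 can still set the old values explicitly; a minimal sketch, assuming the standard AlgorithmConfig setters:

from ray.rllib.algorithms.appo import APPOConfig

# Explicitly restore the two former APPO-specific defaults (values taken from
# the deleted lines above), in case an experiment depended on them.
config = (
    APPOConfig()
    .training(train_batch_size=500)
    .env_runners(rollout_fragment_length=50)
)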
rllib/algorithms/impala/impala.py (14 changes: 7 additions & 7 deletions)

@@ -4,7 +4,7 @@
 import platform
 import queue
 import random
-from typing import Callable, List, Optional, Set, Tuple, Type, Union
+from typing import List, Optional, Set, Tuple, Type, Union
 
 import numpy as np
 import tree  # pip install dm_tree
@@ -28,6 +28,7 @@
 )
 from ray.rllib.utils.actors import create_colocated_actors
 from ray.rllib.utils.annotations import OldAPIStack, override
+from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning
 from ray.rllib.utils.metrics import (
     ALL_MODULES,
     ENV_RUNNER_RESULTS,
@@ -161,7 +162,6 @@ def __init__(self, algo_class=None):
         self.entropy_coeff_schedule = None
         self._separate_vf_optimizer = False  # @OldAPIstack
         self._lr_vf = 0.0005  # @OldAPIstack
-        self.after_train_step = None
 
         # Override some of AlgorithmConfig's default values with IMPALA-specific values.
         self.rollout_fragment_length = 50
@@ -218,7 +218,8 @@ def training(
         entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided,
         _separate_vf_optimizer: Optional[bool] = NotProvided,
         _lr_vf: Optional[float] = NotProvided,
-        after_train_step: Optional[Callable[[dict], None]] = NotProvided,
+        # Deprecated args.
+        after_train_step=DEPRECATED_VALUE,
         **kwargs,
     ) -> "ImpalaConfig":
         """Sets the training related configuration.
@@ -301,15 +302,16 @@
                 algorithms (APPO, IMPALA) on the old API stack.
             _lr_vf: If _separate_vf_optimizer is True, define separate learning rate
                 for the value network.
-            after_train_step: Callback for APPO to use to update KL, target network
-                periodically. The input to the callback is the learner fetches dict.
 
         Returns:
             This updated AlgorithmConfig object.
         """
         # Pass kwargs onto super's `training()` method.
         super().training(**kwargs)
 
+        if after_train_step != DEPRECATED_VALUE:
+            deprecation_warning(old="config.training(after_train_step=...)", error=True)
+
         if vtrace is not NotProvided:
             self.vtrace = vtrace
         if vtrace_clip_rho_threshold is not NotProvided:
@@ -368,8 +370,6 @@ def training(
             self._separate_vf_optimizer = _separate_vf_optimizer
         if _lr_vf is not NotProvided:
             self._lr_vf = _lr_vf
-        if after_train_step is not NotProvided:
-            self.after_train_step = after_train_step
         if minibatch_size is not NotProvided:
             self._minibatch_size = minibatch_size
 
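Net effect for callers: after_train_step is gone as a config attribute, and passing it to training() now errors out via deprecation_warning(..., error=True) rather than being stored. A short sketch of the expected behavior, assuming the import path shown in the diff:

from ray.rllib.algorithms.impala import ImpalaConfig

config = ImpalaConfig()
# Unrelated settings keep working as before.
config.training(train_batch_size=512)

# Passing the removed argument now raises (error=True) instead of silently
# storing the callback as it did before this commit.
try:
    config.training(after_train_step=lambda fetches: None)
except Exception as e:
    print(f"after_train_step is deprecated: {e}")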
rllib/core/rl_module/torch/torch_rl_module.py (4 changes: 4 additions & 0 deletions)

@@ -189,6 +189,10 @@ class TorchDDPRLModuleWithTargetNetworksInterface(
     TorchDDPRLModule,
     RLModuleWithTargetNetworksInterface,
 ):
+    @override(RLModuleWithTargetNetworksInterface)
+    def get_target_network_pairs(self, *args, **kwargs):
+        return self.module.get_target_network_pairs(*args, **kwargs)
+
     @override(RLModuleWithTargetNetworksInterface)
     def sync_target_networks(self, *args, **kwargs):
         return self.module.sync_target_networks(*args, **kwargs)
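The added method mirrors the existing sync_target_networks passthrough: DistributedDataParallel exposes the wrapped module as self.module, and the DDP wrapper owns no target networks itself, so it simply forwards the interface call. A generic sketch of this delegation pattern (toy code, not RLlib's classes):

class DelegatingWrapper:
    """Toy stand-in for a DDP-style wrapper that forwards target-network calls."""

    def __init__(self, module):
        self.module = module  # object that actually owns the (net, target_net) pairs

    def get_target_network_pairs(self, *args, **kwargs):
        # Pure passthrough; the wrapper adds no behavior of its own.
        return self.module.get_target_network_pairs(*args, **kwargs)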
