[RLlib] Add APPO/IMPALA multi-agent StatelessCartPole learning tests to CI (+ fix some bugs related to this). (#47245)
ray-project · Aug 22, 2024 · 746f6b6 · 746f6b6
1 parent 4e226be
commit 746f6b6
Show file tree

Hide file tree

Showing 18 changed files with 413 additions and 222 deletions.
diff --git a/.buildkite/rllib.rayci.yml b/.buildkite/rllib.rayci.yml
@@ -83,7 +83,7 @@ steps:
     tags: 
       - rllib_gpu
       - gpu
-    parallelism: 4
+    parallelism: 5
     instance_type: gpu
     commands:
       - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib 
@@ -165,7 +165,7 @@ steps:
     tags: 
       - rllib_gpu
       - gpu
-    parallelism: 4
+    parallelism: 5
     instance_type: gpu-large
     commands:
       - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib

diff --git a/rllib/BUILD b/rllib/BUILD
@@ -184,23 +184,6 @@ py_test(
     srcs = ["tuned_examples/appo/cartpole_appo.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
 )
-# StatelessCartPole
-py_test(
-    name = "learning_tests_stateless_cartpole_appo",
-    main = "tuned_examples/appo/stateless_cartpole_appo.py",
-    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
-    size = "large",
-    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
-    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
-)
-py_test(
-    name = "learning_tests_stateless_cartpole_appo_multi_gpu",
-    main = "tuned_examples/appo/stateless_cartpole_appo.py",
-    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
-    size = "large",
-    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
-    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
-)
 # MultiAgentCartPole
 py_test(
     name = "learning_tests_multi_agent_cartpole_appo",
@@ -234,6 +217,72 @@ py_test(
     srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
 )
+# StatelessCartPole
+py_test(
+    name = "learning_tests_stateless_cartpole_appo",
+    main = "tuned_examples/appo/stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_stateless_cartpole_appo_gpu",
+    main = "tuned_examples/appo/stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_stateless_cartpole_appo_multi_cpu",
+    main = "tuned_examples/appo/stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
+py_test(
+    name = "learning_tests_stateless_cartpole_appo_multi_gpu",
+    main = "tuned_examples/appo/stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
+# MultiAgentStatelessCartPole
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_appo",
+    main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_appo_gpu",
+    main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_appo_multi_cpu",
+    main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_appo_multi_gpu",
+    main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
 
 #@OldAPIStack
 py_test(
@@ -462,23 +511,6 @@ py_test(
     srcs = ["tuned_examples/impala/cartpole_impala.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
 )
-# StatelessCartPole
-py_test(
-    name = "learning_tests_stateless_cartpole_impala",
-    main = "tuned_examples/impala/stateless_cartpole_impala.py",
-    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
-    size = "large",
-    srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
-    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
-)
-py_test(
-    name = "learning_tests_stateless_cartpole_impala_multi_gpu",
-    main = "tuned_examples/impala/stateless_cartpole_impala.py",
-    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
-    size = "large",
-    srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
-    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
-)
 # MultiAgentCartPole
 py_test(
     name = "learning_tests_multi_agent_cartpole_impala",
@@ -512,6 +544,40 @@ py_test(
     srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
 )
+# StatelessCartPole
+py_test(
+    name = "learning_tests_stateless_cartpole_impala",
+    main = "tuned_examples/impala/stateless_cartpole_impala.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_stateless_cartpole_impala_multi_gpu",
+    main = "tuned_examples/impala/stateless_cartpole_impala.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
+# MultiAgentStatelessCartPole
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_impala",
+    main = "tuned_examples/impala/multi_agent_stateless_cartpole_impala.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/impala/multi_agent_stateless_cartpole_impala.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_impala_multi_gpu",
+    main = "tuned_examples/impala/multi_agent_stateless_cartpole_impala.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/impala/multi_agent_stateless_cartpole_impala.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
 
 #@OldAPIstack
 py_test(

diff --git a/rllib/connectors/common/add_states_from_episodes_to_batch.py b/rllib/connectors/common/add_states_from_episodes_to_batch.py
@@ -228,7 +228,7 @@ def __call__(
         # Also, let module-to-env pipeline know that we had added a single timestep
         # time rank to the data (to remove it again).
         if not self._as_learner_connector:
-            for column, column_data in data.copy().items():
+            for column in data.keys():
                 self.foreach_batch_item_change_in_place(
                     batch=data,
                     column=column,
@@ -250,11 +250,20 @@ def __call__(
             # Before adding STATE_IN to the `data`, zero-pad existing data and batch
             # into max_seq_len chunks.
             for column, column_data in data.copy().items():
+                # Do not zero-pad INFOS column.
+                if column == Columns.INFOS:
+                    continue
                 for key, item_list in column_data.items():
-                    if column != Columns.INFOS:
-                        column_data[key] = split_and_zero_pad_list(
-                            item_list, T=self.max_seq_len
-                        )
+                    # Multi-agent case AND RLModule is not stateful -> Do not zero-pad
+                    # for this model.
+                    assert isinstance(key, tuple)
+                    if len(key) == 3:
+                        eps_id, aid, mid = key
+                        if not rl_module[mid].is_stateful():
+                            continue
+                    column_data[key] = split_and_zero_pad_list(
+                        item_list, T=self.max_seq_len
+                    )
 
         for sa_episode in self.single_agent_episode_iterator(
             episodes,

diff --git a/rllib/connectors/env_to_module/mean_std_filter.py b/rllib/connectors/env_to_module/mean_std_filter.py
@@ -116,9 +116,16 @@ def __call__(
         # anymore to the original observations).
         for sa_episode in self.single_agent_episode_iterator(episodes):
             sa_obs = sa_episode.get_observations(indices=-1)
-            normalized_sa_obs = self._filters[sa_episode.agent_id](
-                sa_obs, update=self._update_stats
-            )
+            try:
+                normalized_sa_obs = self._filters[sa_episode.agent_id](
+                    sa_obs, update=self._update_stats
+                )
+            except KeyError:
+                raise KeyError(
+                    "KeyError trying to access a filter by agent ID "
+                    f"`{sa_episode.agent_id}`! You probably did NOT pass the "
+                    f"`multi_agent=True` flag into the `MeanStdFilter()` constructor. "
+                )
             sa_episode.set_observations(at_indices=-1, new_data=normalized_sa_obs)
             #  We set the Episode's observation space to ours so that we can safely
             #  set the last obs to the new value (without causing a space mismatch

diff --git a/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py b/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py
@@ -3,6 +3,7 @@
 from ray.rllib.connectors.connector_v2 import ConnectorV2
 from ray.rllib.core.columns import Columns
 from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.env.multi_agent_episode import MultiAgentEpisode
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.postprocessing.episodes import add_one_ts_to_episodes_and_truncate
 from ray.rllib.utils.typing import EpisodeType
@@ -101,10 +102,23 @@ def __call__(
         # batch: - - - - - - - T B0- - - - - R Bx- - - - R Bx
         # mask : t t t t t t t t f t t t t t t f t t t t t f
 
+        # TODO (sven): Same situation as in TODO below, but for multi-agent episode.
+        #  Maybe add a dedicated connector piece for this task?
+        # We extend the MultiAgentEpisode's ID by a running number here to make sure
+        # we treat each MAEpisode chunk as separate (for potentially upcoming v-trace
+        # and LSTM zero-padding) and don't mix data from different chunks.
+        if isinstance(episodes[0], MultiAgentEpisode):
+            for i, ma_episode in enumerate(episodes):
+                ma_episode.id_ += "_" + str(i)
+                # Also change the underlying single-agent episode's
+                # `multi_agent_episode_id` properties.
+                for sa_episode in ma_episode.agent_episodes.values():
+                    sa_episode.multi_agent_episode_id = ma_episode.id_
+
         for i, sa_episode in enumerate(
             self.single_agent_episode_iterator(episodes, agents_that_stepped_only=False)
         ):
-            # TODO (sven): This is a little bit of a hack: By expanding the Episode's
+            # TODO (sven): This is a little bit of a hack: By extending the Episode's
             #  ID, we make sure that each episode chunk in `episodes` is treated as a
             #  separate episode in the `self.add_n_batch_items` below. Some algos (e.g.
             #  APPO) may have >1 episode chunks from the same episode (same ID) in the

diff --git a/rllib/connectors/module_to_env/remove_single_ts_time_rank_from_batch.py b/rllib/connectors/module_to_env/remove_single_ts_time_rank_from_batch.py
@@ -50,9 +50,21 @@ def __call__(
         if shared_data is None or not shared_data.get("_added_single_ts_time_rank"):
             return data
 
-        data = tree.map_structure_with_path(
-            lambda p, s: s if Columns.STATE_OUT in p else np.squeeze(s, axis=0),
-            data,
-        )
+        def _remove_single_ts(item, eps_id, aid, mid):
+            # Only remove the time rank for modules that are stateful (only for those
+            # has a time rank been added).
+            if mid is None or rl_module[mid].is_stateful():
+                return tree.map_structure(lambda s: np.squeeze(s, axis=0), item)
+            return item
+
+        for column, column_data in data.copy().items():
+            # Skip state_out (doesn't have a time rank).
+            if column == Columns.STATE_OUT:
+                continue
+            self.foreach_batch_item_change_in_place(
+                data,
+                column=column,
+                func=_remove_single_ts,
+            )
 
         return data