
Commit 4267ac2

Jun Gong and kouroshHakha authored
[RLlib] Fix reward collection for OpenSpiel games (#31156)
Signed-off-by: Jun Gong <jungong@anyscale.com>
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Co-authored-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
1 parent 80d0bc7 commit 4267ac2

File tree

3 files changed: +172 -3 lines changed


rllib/evaluation/env_runner_v2.py

Lines changed: 9 additions & 2 deletions
@@ -543,13 +543,20 @@ def _process_observations(
                 # Create a fake observation by sampling the original env
                 # observation space.
                 obs_space = get_original_space(policy.observation_space)
+                # Although there is no obs for this agent, there may be
+                # good rewards and info dicts for it.
+                # This is the case for e.g. OpenSpiel games, where a reward
+                # is only earned with the last step, but the obs for that
+                # step is {}.
+                reward = rewards[env_id].get(agent_id, 0.0)
+                info = infos[env_id].get(agent_id, {})
                 values_dict = {
                     SampleBatch.T: episode.length,
                     SampleBatch.ENV_ID: env_id,
                     SampleBatch.AGENT_INDEX: episode.agent_index(agent_id),
-                    SampleBatch.REWARDS: 0.0,
+                    SampleBatch.REWARDS: reward,
                     SampleBatch.DONES: True,
-                    SampleBatch.INFOS: {},
+                    SampleBatch.INFOS: info,
                     SampleBatch.NEXT_OBS: obs_space.sample(),
                 }

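For context, a minimal sketch of the fallback these added lines implement, using made-up data rather than RLlib internals: in turn-based games such as those from OpenSpiel, the terminal step can report a reward and info dict for an agent that receives no final observation, so those values are now looked up with a default instead of being hard-coded to 0.0 and {}.

# Hypothetical per-env reward/info dicts as a turn-based game might report them
# on its terminal step: agent "low" earns a reward even though it gets no final
# observation.
rewards = {0: {"low": -1.0, "high": 1.0}}  # env_id -> agent_id -> reward
infos = {0: {"high": {"winner": True}}}    # no info dict emitted for "low" here

env_id, agent_id = 0, "low"

# The previous code hard-coded 0.0 and {}, dropping the terminal reward.
reward = rewards[env_id].get(agent_id, 0.0)  # -> -1.0
info = infos[env_id].get(agent_id, {})       # -> {}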

rllib/evaluation/tests/test_env_runner_v2.py

Lines changed: 83 additions & 1 deletion
@@ -1,17 +1,20 @@
 import unittest
+import numpy as np

 import ray
 from ray.rllib.algorithms.callbacks import DefaultCallbacks
 from ray.rllib.algorithms.ppo import PPO, PPOConfig
 from ray.rllib.connectors.connector import ActionConnector, ConnectorContext
 from ray.rllib.evaluation.metrics import RolloutMetrics
 from ray.rllib.examples.env.debug_counter_env import DebugCounterEnv
-from ray.rllib.examples.env.multi_agent import BasicMultiAgent
+from ray.rllib.examples.env.multi_agent import BasicMultiAgent, GuessTheNumberGame
 from ray.rllib.examples.policy.random_policy import RandomPolicy
 from ray.rllib.policy.policy import PolicySpec
 from ray.tune import register_env
 from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch

+from ray.rllib.utils.test_utils import check
+

 register_env("basic_multiagent", lambda _: BasicMultiAgent(2))

@@ -92,6 +95,85 @@ def test_sample_batch_rollout_multi_agent_env(self):
         self.assertEqual(sample_batch.env_steps(), 200)
         self.assertEqual(sample_batch.agent_steps(), 400)

+    def test_guess_the_number_multi_agent(self):
+        """Tests the env runner on the GuessTheNumberGame env.
+
+        The policies are chosen to be deterministic, so that we can test for an
+        expected reward. Agent 0 will always pick 1, and agent 1 will always
+        guess that the picked number is higher than 1, so it never guesses
+        correctly and agent 0 wins once the step limit is reached. The reward
+        is 100 for winning plus 1 for each step that the game is dragged on
+        for, so the expected reward for agent 0 is 100 + 19 = 119, where 19 is
+        the number of steps the game lasts before agent 0 wins.
+        """
+
+        register_env("env_under_test", lambda config: GuessTheNumberGame(config))
+
+        def mapping_fn(agent_id, *args, **kwargs):
+            return "pol1" if agent_id == 0 else "pol2"
+
+        class PickOne(RandomPolicy):
+            """This policy will always pick 1."""
+
+            def compute_actions(
+                self,
+                obs_batch,
+                state_batches=None,
+                prev_action_batch=None,
+                prev_reward_batch=None,
+                **kwargs
+            ):
+                return [np.array([2, 1])] * len(obs_batch), [], {}
+
+        class GuessHigherThanOne(RandomPolicy):
+            """This policy will guess that the picked number is higher than 1."""
+
+            def compute_actions(
+                self,
+                obs_batch,
+                state_batches=None,
+                prev_action_batch=None,
+                prev_reward_batch=None,
+                **kwargs
+            ):
+                return [np.array([1, 1])] * len(obs_batch), [], {}
+
+        config = (
+            PPOConfig()
+            .framework("torch")
+            .environment(disable_env_checking=True, env="env_under_test")
+            .rollouts(
+                num_envs_per_worker=1,
+                num_rollout_workers=0,
+                # Enable EnvRunnerV2.
+                enable_connectors=True,
+                rollout_fragment_length=100,
+            )
+            .multi_agent(
+                # This makes the test independent of neural networks.
+                policies={
+                    "pol1": PolicySpec(policy_class=PickOne),
+                    "pol2": PolicySpec(policy_class=GuessHigherThanOne),
+                },
+                policy_mapping_fn=mapping_fn,
+            )
+            .debugging(seed=42)
+        )
+
+        algo = PPO(config, env="env_under_test")
+
+        rollout_worker = algo.workers.local_worker()
+        sample_batch = rollout_worker.sample()
+        pol1_batch = sample_batch.policy_batches["pol1"]
+
+        # Reward should be 100 (for winning) + 19 (for dragging the game on for
+        # 19 steps).
+        check(pol1_batch["rewards"], 119 * np.ones_like(pol1_batch["rewards"]))
+        # Check that pol1 only has one timestep of transition information per
+        # episode.
+        check(len(set(pol1_batch["eps_id"])), len(pol1_batch["eps_id"]))
+        # Check that pol2 has 19 timesteps of transition information per episode.
+        pol2_batch = sample_batch.policy_batches["pol2"]
+        check(len(set(pol2_batch["eps_id"])) * 19, len(pol2_batch["eps_id"]))
+
     def test_inference_batches_are_grouped_by_policy(self):
         # Create 2 policies that have different inference batch shapes.
         class RandomPolicyOne(RandomPolicy):
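To make the test docstring's arithmetic concrete, here is a hedged sketch (not part of the commit) that replays the test's deterministic actions directly against the raw GuessTheNumberGame env: agent 0 picks 1, agent 1 keeps asking whether the picked number is higher than 1, the game runs to its 20-step limit, and agent 0 collects 1 per wasted step plus 100 for winning.

import numpy as np

from ray.rllib.examples.env.multi_agent import GuessTheNumberGame

env = GuessTheNumberGame(config={})  # defaults: MAX_NUMBER=3, MAX_STEPS=20
env.reset()
env.step({0: np.array([2, 1])})  # agent 0 picks the number 1

agent_0_return = 0.0
done = {"__all__": False}
while not done["__all__"]:
    # Agent 1 always asks: "is the picked number higher than 1?" -> answer is no.
    _, reward, done, _ = env.step({1: np.array([1, 1])})
    agent_0_return += reward[0]

print(agent_0_return)  # 19 * 1 + 100 = 119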

rllib/examples/env/multi_agent.py

Lines changed: 80 additions & 0 deletions
@@ -289,6 +289,86 @@ def step(self, action_dict):
         return obs, rew, done, info


+class GuessTheNumberGame(MultiAgentEnv):
+    """
+    We have two players, 0 and 1. Agent 0 has to pick a number between 0 and
+    MAX-1 at reset. Agent 1 has to guess the number by asking N questions of
+    the form "is <number> lower than | higher than | equal to the picked
+    number?". The action space is MultiDiscrete [3, MAX]. For the first index,
+    0 means lower, 1 means higher and 2 means equal. The environment answers
+    with yes (1) or no (0) via the reward function. For every time step that
+    agent 1 wastes, agent 0 gets a reward of 1. After N steps the game is
+    terminated. If agent 1 guesses the number correctly, it gets a reward of
+    100 points, otherwise it gets a reward of 0. On the other hand, if agent 0
+    wins, it gets 100 points. The optimal policy controlling agent 1 should
+    converge to a binary search strategy.
+    """
+
+    MAX_NUMBER = 3
+    MAX_STEPS = 20
+
+    def __init__(self, config):
+        super().__init__()
+        self._agent_ids = {0, 1}
+
+        self.max_number = config.get("max_number", self.MAX_NUMBER)
+        self.max_steps = config.get("max_steps", self.MAX_STEPS)
+
+        self._number = None
+        self.observation_space = gym.spaces.Discrete(2)
+        self.action_space = gym.spaces.MultiDiscrete([3, self.max_number])
+
+    def reset(self):
+        self._step = 0
+        self._number = None
+        # Agent 0 has to pick a number, so the returned obs does not matter.
+        return {0: 0}
+
+    def step(self, action_dict):
+        # Get agent 0's action.
+        agent_0_action = action_dict.get(0)
+
+        if agent_0_action is not None:
+            # Ignore the first part of the action and look at the number.
+            self._number = agent_0_action[1]
+            # The next obs should tell agent 1 to start guessing.
+            # The returned reward and dones should be on agent 0, who picked a
+            # number.
+            return {1: 0}, {0: 0}, {0: False, "__all__": False}, {}
+
+        if self._number is None:
+            raise ValueError(
+                "No number is selected by agent 0. Have you restarted "
+                "the environment?"
+            )
+
+        # Get agent 1's action.
+        direction, number = action_dict.get(1)
+        info = {}
+        # Always the same; we don't need agent 0 to act ever again, agent 1
+        # should keep guessing.
+        obs = {1: 0}
+        guessed_correctly = False
+        # Every time agent 1 does not guess correctly, agent 0 gets a reward of 1.
+        if direction == 0:  # lower
+            reward = {1: int(number > self._number), 0: 1}
+            done = {1: False, "__all__": False}
+        elif direction == 1:  # higher
+            reward = {1: int(number < self._number), 0: 1}
+            done = {1: False, "__all__": False}
+        else:  # equal
+            guessed_correctly = number == self._number
+            reward = {1: guessed_correctly * 100, 0: guessed_correctly * -100}
+            done = {1: guessed_correctly, "__all__": guessed_correctly}
+
+        self._step += 1
+        if self._step >= self.max_steps:  # Max number of steps: episode is over.
+            done["__all__"] = True
+            if not guessed_correctly:
+                reward[0] = 100  # Agent 0 wins.
+        return obs, reward, done, info
+
+
 MultiAgentCartPole = make_multi_agent("CartPole-v1")
 MultiAgentMountainCar = make_multi_agent("MountainCarContinuous-v0")
 MultiAgentPendulum = make_multi_agent("Pendulum-v1")
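The class docstring says the optimal policy for agent 1 should converge to a binary search strategy. Purely as an illustration (not part of the commit, and with an arbitrary config), a scripted binary-search guesser can interact with the env like this:

import numpy as np

from ray.rllib.examples.env.multi_agent import GuessTheNumberGame

env = GuessTheNumberGame(config={"max_number": 16})
env.reset()
# Phase 1: agent 0 picks a number (the second action component; the first
# component is ignored for agent 0).
obs, reward, done, info = env.step({0: np.array([2, 7])})

lo, hi = 0, env.max_number - 1
while not done["__all__"]:
    if lo == hi:
        # Only one candidate left: guess "equal" (direction 2).
        obs, reward, done, info = env.step({1: np.array([2, lo])})
    else:
        mid = (lo + hi) // 2
        # Ask: "is the picked number higher than mid?" (direction 1).
        obs, reward, done, info = env.step({1: np.array([1, mid])})
        if reward[1] == 1:  # yes -> the picked number is above mid
            lo = mid + 1
        else:               # no -> the picked number is mid or below
            hi = mid

print("guessed correctly:", reward[1] == 100)  # True after ~log2(16) questions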

0 commit comments