handle multiple dones in a single step (#3700)

* handle multiple dones in a single step
Unity-Technologies · Mar 27, 2020 · cd01450 · cd01450
1 parent 26efe5f
commit cd01450
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 7 deletions.
diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Fixed an issue where specifying `vis_encode_type` was required only for SAC. (#3677)
  - Fixed an issue where switching models using `SetModel()` during training would use an excessive amount of memory. (#3664)
  - Environment subprocesses now close immediately on timeout or wrong API version. (#3679)
+ - Fixed an issue in the gym wrapper that would raise an exception if an Agent called EndEpisode multiple times in the same step. (#3700)
 
 
 ## [0.15.0-preview] - 2020-03-18

diff --git a/gym-unity/gym_unity/envs/__init__.py b/gym-unity/gym_unity/envs/__init__.py
@@ -364,9 +364,8 @@ def _check_agents(self, n_agents: int) -> None:
 
     def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
         n_extra_agents = step_result.n_agents() - self._n_agents
-        if n_extra_agents < 0 or n_extra_agents > self._n_agents:
+        if n_extra_agents < 0:
             # In this case, some Agents did not request a decision when expected
-            # or too many requested a decision
             raise UnityGymException(
                 "The number of agents in the scene does not match the expected number."
             )
@@ -386,6 +385,10 @@ def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
         # only cares about the ordering.
         for index, agent_id in enumerate(step_result.agent_id):
             if not self._previous_step_result.contains_agent(agent_id):
+                if step_result.done[index]:
+                    # If the Agent is already done (e.g. it ended its epsiode twice in one step)
+                    # Don't try to register it here.
+                    continue
                 # Register this agent, and get the reward of the previous agent that
                 # was in its index, so that we can return it to the gym.
                 last_reward = self.agent_mapper.register_new_agent_id(agent_id)
@@ -528,8 +531,12 @@ def mark_agent_done(self, agent_id: int, reward: float) -> None:
         """
         Declare the agent done with the corresponding final reward.
         """
-        gym_index = self._agent_id_to_gym_index.pop(agent_id)
-        self._done_agents_index_to_last_reward[gym_index] = reward
+        if agent_id in self._agent_id_to_gym_index:
+            gym_index = self._agent_id_to_gym_index.pop(agent_id)
+            self._done_agents_index_to_last_reward[gym_index] = reward
+        else:
+            # Agent was never registered in the first place (e.g. EndEpisode called multiple times)
+            pass
 
     def register_new_agent_id(self, agent_id: int) -> float:
         """
@@ -581,9 +588,13 @@ def set_initial_agents(self, agent_ids: List[int]) -> None:
         self._gym_id_order = list(agent_ids)
 
     def mark_agent_done(self, agent_id: int, reward: float) -> None:
-        gym_index = self._gym_id_order.index(agent_id)
-        self._done_agents_index_to_last_reward[gym_index] = reward
-        self._gym_id_order[gym_index] = -1
+        try:
+            gym_index = self._gym_id_order.index(agent_id)
+            self._done_agents_index_to_last_reward[gym_index] = reward
+            self._gym_id_order[gym_index] = -1
+        except ValueError:
+            # Agent was never registered in the first place (e.g. EndEpisode called multiple times)
+            pass
 
     def register_new_agent_id(self, agent_id: int) -> float:
         original_index = self._gym_id_order.index(-1)

diff --git a/gym-unity/gym_unity/tests/test_gym.py b/gym-unity/gym_unity/tests/test_gym.py
@@ -129,6 +129,50 @@ def test_sanitize_action_one_agent_done(mock_env):
         assert expected_agent_id == agent_id
 
 
+@mock.patch("gym_unity.envs.UnityEnvironment")
+def test_sanitize_action_new_agent_done(mock_env):
+    mock_spec = create_mock_group_spec(
+        vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
+    )
+    mock_step = create_mock_vector_step_result(num_agents=3)
+    mock_step.agent_id = np.array(range(5))
+    setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
+    env = UnityEnv(" ", use_visual=False, multiagent=True)
+
+    received_step_result = create_mock_vector_step_result(num_agents=7)
+    received_step_result.agent_id = np.array(range(7))
+    # agent #3 (id = 2) is Done
+    # so is the "new" agent (id = 5)
+    done = [False] * 7
+    done[2] = True
+    done[5] = True
+    received_step_result.done = np.array(done)
+    sanitized_result = env._sanitize_info(received_step_result)
+    for expected_agent_id, agent_id in zip([0, 1, 6, 3, 4], sanitized_result.agent_id):
+        assert expected_agent_id == agent_id
+
+
+@mock.patch("gym_unity.envs.UnityEnvironment")
+def test_sanitize_action_single_agent_multiple_done(mock_env):
+    mock_spec = create_mock_group_spec(
+        vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
+    )
+    mock_step = create_mock_vector_step_result(num_agents=1)
+    mock_step.agent_id = np.array(range(1))
+    setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
+    env = UnityEnv(" ", use_visual=False, multiagent=False)
+
+    received_step_result = create_mock_vector_step_result(num_agents=3)
+    received_step_result.agent_id = np.array(range(3))
+    # original agent (id = 0) is Done
+    # so is the "new" agent (id = 1)
+    done = [True, True, False]
+    received_step_result.done = np.array(done)
+    sanitized_result = env._sanitize_info(received_step_result)
+    for expected_agent_id, agent_id in zip([2], sanitized_result.agent_id):
+        assert expected_agent_id == agent_id
+
+
 # Helper methods
 
 
@@ -200,6 +244,10 @@ def test_agent_id_index_mapper(mapper_cls):
     mapper.mark_agent_done(1001, 42.0)
     mapper.mark_agent_done(1004, 1337.0)
 
+    # Make sure we can handle an unknown agent id being marked done.
+    # This can happen when an agent ends an episode on the same step it starts.
+    mapper.mark_agent_done(9999, -1.0)
+
     # Now add new agents, and get the rewards of the agent they replaced.
     old_reward1 = mapper.register_new_agent_id(2001)
     old_reward2 = mapper.register_new_agent_id(2002)