diff --git a/gym-unity/gym_unity/tests/test_gym.py b/gym-unity/gym_unity/tests/test_gym.py
index c1bd624cb4..c86ce3dee7 100644
--- a/gym-unity/gym_unity/tests/test_gym.py
+++ b/gym-unity/gym_unity/tests/test_gym.py
@@ -246,7 +246,12 @@ def create_mock_vector_steps(specs, num_agents=1, number_visual_observations=0):
         ] * number_visual_observations
     rewards = np.array(num_agents * [1.0])
     agents = np.array(range(0, num_agents))
-    return DecisionSteps(obs, rewards, agents, None), TerminalSteps.empty(specs)
+    group_id = np.array(num_agents * [0])
+    group_rewards = np.array(num_agents * [0.0])
+    return (
+        DecisionSteps(obs, rewards, agents, None, group_id, group_rewards),
+        TerminalSteps.empty(specs),
+    )


 def setup_mock_unityenvironment(mock_env, mock_spec, mock_decision, mock_termination):
diff --git a/ml-agents-envs/mlagents_envs/base_env.py b/ml-agents-envs/mlagents_envs/base_env.py
index f2d73ecf24..2debe995e0 100644
--- a/ml-agents-envs/mlagents_envs/base_env.py
+++ b/ml-agents-envs/mlagents_envs/base_env.py
@@ -54,10 +54,10 @@ class DecisionStep(NamedTuple):

     obs: List[np.ndarray]
     reward: float
-    group_reward: float
     agent_id: AgentId
     action_mask: Optional[List[np.ndarray]]
     group_id: int
+    group_reward: float


 class DecisionSteps(Mapping):
@@ -83,13 +83,13 @@ class DecisionSteps(Mapping):
     this simulation step.
     """

-    def __init__(self, obs, reward, group_reward, agent_id, action_mask, group_id):
+    def __init__(self, obs, reward, agent_id, action_mask, group_id, group_reward):
         self.obs: List[np.ndarray] = obs
         self.reward: np.ndarray = reward
-        self.group_reward: np.ndarray = group_reward
         self.agent_id: np.ndarray = agent_id
         self.action_mask: Optional[List[np.ndarray]] = action_mask
         self.group_id: np.ndarray = group_id
+        self.group_reward: np.ndarray = group_reward
         self._agent_id_to_index: Optional[Dict[AgentId, int]] = None

     @property
@@ -128,10 +128,10 @@ def __getitem__(self, agent_id: AgentId) -> DecisionStep:
         return DecisionStep(
             obs=agent_obs,
             reward=self.reward[agent_index],
-            group_reward=self.group_reward[agent_index],
             agent_id=agent_id,
             action_mask=agent_mask,
             group_id=group_id,
+            group_reward=self.group_reward[agent_index],
         )

     def __iter__(self) -> Iterator[Any]:
@@ -149,10 +149,10 @@ def empty(spec: "BehaviorSpec") -> "DecisionSteps":
         return DecisionSteps(
             obs=obs,
             reward=np.zeros(0, dtype=np.float32),
-            group_reward=np.zeros(0, dtype=np.float32),
             agent_id=np.zeros(0, dtype=np.int32),
             action_mask=None,
             group_id=np.zeros(0, dtype=np.int32),
+            group_reward=np.zeros(0, dtype=np.float32),
         )


@@ -170,10 +170,10 @@ class TerminalStep(NamedTuple):

     obs: List[np.ndarray]
     reward: float
-    group_reward: float
     interrupted: bool
     agent_id: AgentId
     group_id: int
+    group_reward: float


 class TerminalSteps(Mapping):
@@ -194,13 +194,13 @@ class TerminalSteps(Mapping):
     across simulation steps.
     """

-    def __init__(self, obs, reward, group_reward, interrupted, agent_id, group_id):
+    def __init__(self, obs, reward, interrupted, agent_id, group_id, group_reward):
         self.obs: List[np.ndarray] = obs
         self.reward: np.ndarray = reward
-        self.group_reward: np.ndarray = group_reward
         self.interrupted: np.ndarray = interrupted
         self.agent_id: np.ndarray = agent_id
         self.group_id: np.ndarray = group_id
+        self.group_reward: np.ndarray = group_reward
         self._agent_id_to_index: Optional[Dict[AgentId, int]] = None

     @property
@@ -235,10 +235,10 @@ def __getitem__(self, agent_id: AgentId) -> TerminalStep:
         return TerminalStep(
             obs=agent_obs,
             reward=self.reward[agent_index],
-            group_reward=self.group_reward[agent_index],
             interrupted=self.interrupted[agent_index],
             agent_id=agent_id,
             group_id=group_id,
+            group_reward=self.group_reward[agent_index],
         )

     def __iter__(self) -> Iterator[Any]:
@@ -256,10 +256,10 @@ def empty(spec: "BehaviorSpec") -> "TerminalSteps":
         return TerminalSteps(
             obs=obs,
             reward=np.zeros(0, dtype=np.float32),
-            group_reward=np.zeros(0, dtype=np.float32),
             interrupted=np.zeros(0, dtype=np.bool),
             agent_id=np.zeros(0, dtype=np.int32),
             group_id=np.zeros(0, dtype=np.int32),
+            group_reward=np.zeros(0, dtype=np.float32),
         )


diff --git a/ml-agents-envs/mlagents_envs/rpc_utils.py b/ml-agents-envs/mlagents_envs/rpc_utils.py
index 4f04f7e8ca..c5415b2a86 100644
--- a/ml-agents-envs/mlagents_envs/rpc_utils.py
+++ b/ml-agents-envs/mlagents_envs/rpc_utils.py
@@ -366,18 +366,18 @@ def steps_from_proto(
         DecisionSteps(
             decision_obs_list,
             decision_rewards,
-            decision_group_rewards,
             decision_agent_id,
             action_mask,
             decision_group_id,
+            decision_group_rewards,
         ),
         TerminalSteps(
             terminal_obs_list,
             terminal_rewards,
-            terminal_group_rewards,
             max_step,
             terminal_agent_id,
             terminal_group_id,
+            terminal_group_rewards,
         ),
     )

diff --git a/ml-agents-envs/mlagents_envs/tests/test_steps.py b/ml-agents-envs/mlagents_envs/tests/test_steps.py
index f23401f232..0160380c8c 100644
--- a/ml-agents-envs/mlagents_envs/tests/test_steps.py
+++ b/ml-agents-envs/mlagents_envs/tests/test_steps.py
@@ -16,6 +16,8 @@ def test_decision_steps():
         reward=np.array(range(3), dtype=np.float32),
         agent_id=np.array(range(10, 13), dtype=np.int32),
         action_mask=[np.zeros((3, 4), dtype=np.bool)],
+        group_id=np.array(range(3), dtype=np.int32),
+        group_reward=np.array(range(3), dtype=np.float32),
     )

     assert ds.agent_id_to_index[10] == 0
@@ -51,6 +53,8 @@ def test_terminal_steps():
         reward=np.array(range(3), dtype=np.float32),
         agent_id=np.array(range(10, 13), dtype=np.int32),
         interrupted=np.array([1, 0, 1], dtype=np.bool),
+        group_id=np.array(range(3), dtype=np.int32),
+        group_reward=np.array(range(3), dtype=np.float32),
     )

     assert ts.agent_id_to_index[10] == 0
diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py
index b22f6fd89e..c13d299095 100644
--- a/ml-agents/mlagents/trainers/tests/mock_brain.py
+++ b/ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -43,15 +43,21 @@ def create_mock_steps(
     reward = np.array(num_agents * [1.0], dtype=np.float32)
     interrupted = np.array(num_agents * [False], dtype=np.bool)
     agent_id = np.arange(num_agents, dtype=np.int32)
+    group_id = np.array(num_agents * [0], dtype=np.int32)
+    group_reward = np.array(num_agents * [0.0], dtype=np.float32)
     behavior_spec = BehaviorSpec(observation_specs, action_spec)
     if done:
         return (
             DecisionSteps.empty(behavior_spec),
-            TerminalSteps(obs_list, reward, interrupted, agent_id),
+            TerminalSteps(
+                obs_list, reward, interrupted, agent_id, group_id, group_reward
+            ),
         )
     else:
         return (
-            DecisionSteps(obs_list, reward, agent_id, action_mask),
+            DecisionSteps(
+                obs_list, reward, agent_id, action_mask, group_id, group_reward
+            ),
             TerminalSteps.empty(behavior_spec),
         )

diff --git a/ml-agents/mlagents/trainers/tests/simple_test_envs.py b/ml-agents/mlagents/trainers/tests/simple_test_envs.py
index e7f44b0f56..4f834626d1 100644
--- a/ml-agents/mlagents/trainers/tests/simple_test_envs.py
+++ b/ml-agents/mlagents/trainers/tests/simple_test_envs.py
@@ -165,13 +165,17 @@ def _reset_agent(self, name):
         self.agent_id[name] = self.agent_id[name] + 1

     def _make_batched_step(
-        self, name: str, done: bool, reward: float
+        self, name: str, done: bool, reward: float, group_reward: float
     ) -> Tuple[DecisionSteps, TerminalSteps]:
         m_vector_obs = self._make_obs(self.goal[name])
         m_reward = np.array([reward], dtype=np.float32)
         m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
+        m_group_id = np.array([0], dtype=np.int32)
+        m_group_reward = np.array([group_reward], dtype=np.float32)
         action_mask = self._generate_mask()
-        decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
+        decision_step = DecisionSteps(
+            m_vector_obs, m_reward, m_agent_id, action_mask, m_group_id, m_group_reward
+        )
         terminal_step = TerminalSteps.empty(self.behavior_spec)
         if done:
             self.final_rewards[name].append(self.rewards[name])
@@ -182,24 +186,45 @@ def _make_batched_step(
                 new_done,
                 new_agent_id,
                 new_action_mask,
+                new_group_id,
+                new_group_reward,
             ) = self._construct_reset_step(name)
             decision_step = DecisionSteps(
-                new_vector_obs, new_reward, new_agent_id, new_action_mask
+                new_vector_obs,
+                new_reward,
+                new_agent_id,
+                new_action_mask,
+                new_group_id,
+                new_group_reward,
             )
             terminal_step = TerminalSteps(
-                m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
+                m_vector_obs,
+                m_reward,
+                np.array([False], dtype=np.bool),
+                m_agent_id,
+                m_group_id,
+                m_group_reward,
             )
         return (decision_step, terminal_step)

     def _construct_reset_step(
         self, name: str
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
         new_reward = np.array([0.0], dtype=np.float32)
         new_done = np.array([False], dtype=np.bool)
         new_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
         new_action_mask = self._generate_mask()
-        return new_reward, new_done, new_agent_id, new_action_mask
+        new_group_id = np.array([0], dtype=np.int32)
+        new_group_reward = np.array([0.0], dtype=np.float32)
+        return (
+            new_reward,
+            new_done,
+            new_agent_id,
+            new_action_mask,
+            new_group_id,
+            new_group_reward,
+        )

     def step(self) -> None:
         assert all(action is not None for action in self.action.values())
@@ -208,12 +233,12 @@ def step(self) -> None:
         assert all(action is not None for action in self.action.values())
         for name in self.names:
             done = self._take_action(name)
             reward = self._compute_reward(name, done)
             self.rewards[name] += reward
-            self.step_result[name] = self._make_batched_step(name, done, reward)
+            self.step_result[name] = self._make_batched_step(name, done, reward, 0.0)

     def reset(self) -> None:  # type: ignore
         for name in self.names:
             self._reset_agent(name)
-            self.step_result[name] = self._make_batched_step(name, False, 0.0)
+            self.step_result[name] = self._make_batched_step(name, False, 0.0, 0.0)

     @property
     def reset_parameters(self) -> Dict[str, str]:
@@ -231,7 +256,7 @@ def __init__(self, brain_names, action_sizes=(1, 0), step_size=0.2):
         self.num_show_steps = 2

     def _make_batched_step(
-        self, name: str, done: bool, reward: float
+        self, name: str, done: bool, reward: float, group_reward: float
     ) -> Tuple[DecisionSteps, TerminalSteps]:
         recurrent_obs_val = (
             self.goal[name] if self.step_count[name] <= self.num_show_steps else 0
@@ -239,6 +264,8 @@ def _make_batched_step(
         m_vector_obs = self._make_obs(recurrent_obs_val)
         m_reward = np.array([reward], dtype=np.float32)
         m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
+        m_group_id = np.array([0], dtype=np.int32)
+        m_group_reward = np.array([group_reward], dtype=np.float32)
         action_mask = self._generate_mask()
         decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
         terminal_step = TerminalSteps.empty(self.behavior_spec)
@@ -254,12 +281,24 @@ def _make_batched_step(
                 new_done,
                 new_agent_id,
                 new_action_mask,
+                new_group_id,
+                new_group_reward,
             ) = self._construct_reset_step(name)
             decision_step = DecisionSteps(
-                new_vector_obs, new_reward, new_agent_id, new_action_mask
+                new_vector_obs,
+                new_reward,
+                new_agent_id,
+                new_action_mask,
+                new_group_id,
+                new_group_reward,
             )
             terminal_step = TerminalSteps(
-                m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
+                m_vector_obs,
+                m_reward,
+                np.array([False], dtype=np.bool),
+                m_agent_id,
+                m_group_id,
+                m_group_reward,
             )
         return (decision_step, terminal_step)
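For reference, a minimal sketch (not part of the diff) of how callers construct steps after this change. Only the argument order, `DecisionSteps(obs, reward, agent_id, action_mask, group_id, group_reward)` and `TerminalSteps(obs, reward, interrupted, agent_id, group_id, group_reward)`, comes from the diff; the array shapes and the `num_agents` value below are illustrative assumptions.

```python
# Illustrative sketch only: exercises the new positional argument order,
# with group_id and group_reward moved to the end of both constructors.
import numpy as np

from mlagents_envs.base_env import DecisionSteps, TerminalSteps

num_agents = 2  # arbitrary batch size for this example
obs = [np.zeros((num_agents, 3), dtype=np.float32)]  # one vector observation per agent
reward = np.ones(num_agents, dtype=np.float32)
agent_id = np.arange(num_agents, dtype=np.int32)
group_id = np.zeros(num_agents, dtype=np.int32)  # 0 means the agent belongs to no group
group_reward = np.zeros(num_agents, dtype=np.float32)

# DecisionSteps(obs, reward, agent_id, action_mask, group_id, group_reward)
decision_steps = DecisionSteps(obs, reward, agent_id, None, group_id, group_reward)

# TerminalSteps(obs, reward, interrupted, agent_id, group_id, group_reward)
interrupted = np.zeros(num_agents, dtype=bool)
terminal_steps = TerminalSteps(
    obs, reward, interrupted, agent_id, group_id, group_reward
)

# Per-agent views expose the same fields, with group_reward as the last entry.
first = decision_steps[int(agent_id[0])]
print(first.group_id, first.group_reward)
```

Keeping `group_reward` last in both the batched containers and the per-agent `DecisionStep`/`TerminalStep` tuples keeps positional call sites (tests, mock environments, `steps_from_proto`) consistent with the field order declared in `base_env.py`.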