diff --git a/ml-agents/mlagents/trainers/poca/trainer.py b/ml-agents/mlagents/trainers/poca/trainer.py
index 04775330c3..9ded567499 100644
--- a/ml-agents/mlagents/trainers/poca/trainer.py
+++ b/ml-agents/mlagents/trainers/poca/trainer.py
@@ -245,6 +245,15 @@ def _update_policy(self):
         self._clear_update_buffer()
         return True
 
+    def end_episode(self) -> None:
+        """
+        A signal that the Episode has ended. The buffer must be reset.
+        Gets called only when the academy resets. For POCA, we should
+        also zero out the group rewards.
+        """
+        super().end_episode()
+        self.collected_group_rewards.clear()
+
     def create_torch_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
     ) -> TorchPolicy:
diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py
index 211a1a6aea..0de2b668b0 100644
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
@@ -68,6 +68,9 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
         agent_id = trajectory.agent_id  # All the agents should have the same ID
 
         agent_buffer_trajectory = trajectory.to_agentbuffer()
+        # Check if we used group rewards, warn if so.
+        self._warn_if_group_reward(agent_buffer_trajectory)
+
         # Update the normalization
         if self.is_training:
             self.policy.update_normalization(agent_buffer_trajectory)
diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py
index 869267f985..82213614d8 100644
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
@@ -131,6 +131,8 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
         agent_id = trajectory.agent_id  # All the agents should have the same ID
 
         agent_buffer_trajectory = trajectory.to_agentbuffer()
+        # Check if we used group rewards, warn if so.
+        self._warn_if_group_reward(agent_buffer_trajectory)
 
         # Update the normalization
         if self.is_training:
diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py
index f6d53c5209..cba79ecc4f 100644
--- a/ml-agents/mlagents/trainers/tests/mock_brain.py
+++ b/ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -81,6 +81,8 @@ def make_fake_trajectory(
     max_step_complete: bool = False,
     memory_size: int = 10,
     num_other_agents_in_group: int = 0,
+    group_reward: float = 0.0,
+    is_terminal: bool = True,
 ) -> Trajectory:
     """
     Makes a fake trajectory of length length. If max_step_complete,
@@ -134,24 +136,29 @@ def make_fake_trajectory(
             interrupted=max_step,
             memory=memory,
             group_status=group_status,
-            group_reward=0,
+            group_reward=group_reward,
         )
         steps_list.append(experience)
     obs = []
     for obs_spec in observation_specs:
         obs.append(np.ones(obs_spec.shape, dtype=np.float32))
+    last_group_status = []
+    for _ in range(num_other_agents_in_group):
+        last_group_status.append(
+            AgentStatus(obs, reward, action, not max_step_complete and is_terminal)
+        )
     last_experience = AgentExperience(
         obs=obs,
         reward=reward,
-        done=not max_step_complete,
+        done=not max_step_complete and is_terminal,
         action=action,
         action_probs=action_probs,
         action_mask=action_mask,
         prev_action=prev_action,
         interrupted=max_step_complete,
         memory=memory,
-        group_status=group_status,
-        group_reward=0,
+        group_status=last_group_status,
+        group_reward=group_reward,
     )
     steps_list.append(last_experience)
     return Trajectory(
diff --git a/ml-agents/mlagents/trainers/tests/torch/test_poca.py b/ml-agents/mlagents/trainers/tests/torch/test_poca.py
index 84d0ad9611..2552234bb0 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_poca.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_poca.py
@@ -1,9 +1,14 @@
+from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 import pytest
 
 import numpy as np
 import attr
 
+# Import to avoid circular import
+from mlagents.trainers.trainer.trainer_factory import TrainerFactory  # noqa F401
+
 from mlagents.trainers.poca.optimizer_torch import TorchPOCAOptimizer
+from mlagents.trainers.poca.trainer import POCATrainer
 
 from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
 from mlagents.trainers.policy.torch_policy import TorchPolicy
@@ -12,19 +17,21 @@
 from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
 from mlagents.trainers.settings import NetworkSettings
 from mlagents.trainers.tests.dummy_config import (  # noqa: F401
-    ppo_dummy_config,
+    create_observation_specs_with_shapes,
+    poca_dummy_config,
     curiosity_dummy_config,
     gail_dummy_config,
 )
+from mlagents.trainers.agent_processor import AgentManagerQueue
+from mlagents.trainers.settings import TrainerSettings
 
-from mlagents_envs.base_env import ActionSpec
+from mlagents_envs.base_env import ActionSpec, BehaviorSpec
 from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
 
 
 @pytest.fixture
 def dummy_config():
-    # poca has the same hyperparameters as ppo for now
-    return ppo_dummy_config()
+    return poca_dummy_config()
 
 
 VECTOR_ACTION_SPACE = 2
@@ -188,7 +195,7 @@ def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete):
 @pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
 @pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
 # We need to test this separately from test_reward_signals.py to ensure no interactions
-def test_ppo_optimizer_update_curiosity(
+def test_poca_optimizer_update_curiosity(
     dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
 ):
     # Test evaluate
@@ -230,10 +237,10 @@
 
 
 # We need to test this separately from test_reward_signals.py to ensure no interactions
-def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
+def test_poca_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
     # Test evaluate
     dummy_config.reward_signals = gail_dummy_config
-    config = ppo_dummy_config()
+    config = poca_dummy_config()
     optimizer = create_test_poca_optimizer(
         config, use_rnn=False, use_discrete=False, use_visual=False
     )
@@ -286,5 +293,46 @@ def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F8
     )
 
 
+def test_poca_end_episode():
+    name_behavior_id = "test_trainer"
+    trainer = POCATrainer(
+        name_behavior_id,
+        10,
+        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
+        True,
+        False,
+        0,
+        "mock_model_path",
+    )
+    behavior_spec = BehaviorSpec(
+        create_observation_specs_with_shapes([(1,)]), ActionSpec.create_discrete((2,))
+    )
+    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
+    mock_policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
+    trainer.add_policy(parsed_behavior_id, mock_policy)
+    trajectory_queue = AgentManagerQueue("testbrain")
+    policy_queue = AgentManagerQueue("testbrain")
+    trainer.subscribe_trajectory_queue(trajectory_queue)
+    trainer.publish_policy_queue(policy_queue)
+    time_horizon = 10
+    trajectory = mb.make_fake_trajectory(
+        length=time_horizon,
+        observation_specs=behavior_spec.observation_specs,
+        max_step_complete=False,
+        action_spec=behavior_spec.action_spec,
+        num_other_agents_in_group=2,
+        group_reward=1.0,
+        is_terminal=False,
+    )
+    trajectory_queue.put(trajectory)
+    trainer.advance()
+    # Test that some trajectories have been ingested
+    for reward in trainer.collected_group_rewards.values():
+        assert reward == 10
+    # Test end episode
+    trainer.end_episode()
+    assert len(trainer.collected_group_rewards.keys()) == 0
+
+
 if __name__ == "__main__":
     pytest.main()
diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
index 6584504d78..7f0bede587 100644
--- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py
+++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
@@ -4,6 +4,7 @@
 import abc
 import time
 import attr
+import numpy as np
 from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
 
 from mlagents.trainers.policy.checkpoint_manager import (
@@ -13,7 +14,7 @@
 from mlagents_envs.logging_util import get_logger
 from mlagents_envs.timers import timed
 from mlagents.trainers.optimizer import Optimizer
-from mlagents.trainers.buffer import AgentBuffer
+from mlagents.trainers.buffer import AgentBuffer, BufferKey
 from mlagents.trainers.trainer import Trainer
 from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
     BaseRewardProvider,
@@ -58,6 +59,7 @@ def __init__(self, *args, **kwargs):
         self.model_saver = self.create_model_saver(
             self.trainer_settings, self.artifact_path, self.load
         )
+        self._has_warned_group_rewards = False
 
     def end_episode(self) -> None:
         """
@@ -256,6 +258,18 @@ def _maybe_save_model(self, step_after_process: int) -> None:
         if step_after_process >= self._next_save_step and self.get_step != 0:
             self._checkpoint()
 
+    def _warn_if_group_reward(self, buffer: AgentBuffer) -> None:
+        """
+        Warn if the trainer receives a Group Reward but is not a multi-agent trainer (i.e. not POCA).
+        """
+        if not self._has_warned_group_rewards:
+            if np.any(buffer[BufferKey.GROUP_REWARD]):
+                logger.warning(
+                    "An agent received a Group Reward, but you are not using a multi-agent trainer. "
+                    "Please use the POCA trainer for best results."
+                )
+                self._has_warned_group_rewards = True
+
     def advance(self) -> None:
         """
         Steps the trainer, taking in trajectories and updates if ready.
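
Note (illustration only, not part of the patch): a minimal standalone sketch of the warn-once behaviour the new _warn_if_group_reward check is intended to have, using a plain dict in place of AgentBuffer and the key "group_reward" in place of BufferKey.GROUP_REWARD; both stand-ins are assumptions made for this sketch, not ML-Agents API.

# Sketch of the warn-once group-reward check (stand-in types, see note above).
import logging

import numpy as np

logger = logging.getLogger(__name__)


class SingleAgentTrainerSketch:
    def __init__(self) -> None:
        self._has_warned_group_rewards = False

    def _warn_if_group_reward(self, buffer: dict) -> None:
        # Warn at most once, and only if any step carried a nonzero group reward.
        if not self._has_warned_group_rewards:
            if np.any(buffer["group_reward"]):
                logger.warning(
                    "An agent received a Group Reward, but you are not using a "
                    "multi-agent trainer. Please use the POCA trainer for best results."
                )
                self._has_warned_group_rewards = True


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    trainer = SingleAgentTrainerSketch()
    trainer._warn_if_group_reward({"group_reward": [0.0, 0.0]})  # all zero: silent
    trainer._warn_if_group_reward({"group_reward": [0.0, 1.0]})  # nonzero: warns once
    trainer._warn_if_group_reward({"group_reward": [1.0, 1.0]})  # already warned: silent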