From b98d28205bb334f29aaea6226a24700513308ece Mon Sep 17 00:00:00 2001
From: andrewcoh <54679309+andrewcoh@users.noreply.github.com>
Date: Fri, 12 Mar 2021 09:52:31 -0500
Subject: [PATCH 1/3] [cherry-pick] Move PushBlockCollab config to poca directory (#5097)

---
 config/{ppo => poca}/PushBlockCollab.yaml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename config/{ppo => poca}/PushBlockCollab.yaml (100%)

diff --git a/config/ppo/PushBlockCollab.yaml b/config/poca/PushBlockCollab.yaml
similarity index 100%
rename from config/ppo/PushBlockCollab.yaml
rename to config/poca/PushBlockCollab.yaml

From e00e672d75a541e36baeb0f0bce5bd4adde61d74 Mon Sep 17 00:00:00 2001
From: andrewcoh <54679309+andrewcoh@users.noreply.github.com>
Date: Fri, 12 Mar 2021 11:54:19 -0500
Subject: [PATCH 2/3] [cherry-pick] Fix ghost curriculum and make steps private (#5098)

* use get step to determine curriculum

* add to CHANGELOG

* Make step in trainer private (#5099)

Co-authored-by: Ervin T
---
 com.unity.ml-agents/CHANGELOG.md                |  1 +
 ml-agents/mlagents/trainers/poca/trainer.py     |  2 +-
 ml-agents/mlagents/trainers/ppo/trainer.py      |  2 +-
 ml-agents/mlagents/trainers/sac/trainer.py      | 18 +++++++++---------
 .../mlagents/trainers/trainer/rl_trainer.py     |  8 ++++----
 ml-agents/mlagents/trainers/trainer/trainer.py  |  4 ++--
 .../mlagents/trainers/trainer_controller.py     |  2 +-
 7 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index e008c6bbad..8b7afc94c0 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -41,6 +41,7 @@ and this project adheres to
 #### ml-agents / ml-agents-envs / gym-unity (Python)
 - An issue that caused `GAIL` to fail for environments where agents can terminate episodes by self-sacrifice has been fixed. (#4971)
 - Made the error message when observations of different shapes are sent to the trainer clearer. (#5030)
+- An issue that prevented curriculums from incrementing with self-play has been fixed. (#5098)
 
 ## [1.8.1-preview] - 2021-03-08
 ### Minor Changes
diff --git a/ml-agents/mlagents/trainers/poca/trainer.py b/ml-agents/mlagents/trainers/poca/trainer.py
index 5112d5fcd0..04775330c3 100644
--- a/ml-agents/mlagents/trainers/poca/trainer.py
+++ b/ml-agents/mlagents/trainers/poca/trainer.py
@@ -287,7 +287,7 @@ def add_policy(
         self.model_saver.initialize_or_load()
 
         # Needed to resume loads properly
-        self.step = policy.get_current_step()
+        self._step = policy.get_current_step()
 
     def get_policy(self, name_behavior_id: str) -> Policy:
         """
diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py
index 807b7423da..211a1a6aea 100644
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
@@ -263,7 +263,7 @@ def add_policy(
         self.model_saver.initialize_or_load()
 
         # Needed to resume loads properly
-        self.step = policy.get_current_step()
+        self._step = policy.get_current_step()
 
     def get_policy(self, name_behavior_id: str) -> Policy:
         """
diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py
index 640d9155bf..869267f985 100644
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
@@ -67,7 +67,7 @@ def __init__(
         self.hyperparameters: SACSettings = cast(
             SACSettings, trainer_settings.hyperparameters
         )
-        self.step = 0
+        self._step = 0
 
         # Don't divide by zero
         self.update_steps = 1
@@ -188,7 +188,7 @@ def _is_ready_update(self) -> bool:
         """
         return (
             self.update_buffer.num_experiences >= self.hyperparameters.batch_size
-            and self.step >= self.hyperparameters.buffer_init_steps
+            and self._step >= self.hyperparameters.buffer_init_steps
         )
 
     @timed
@@ -251,9 +251,9 @@ def _update_sac_policy(self) -> bool:
 
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
-            self.step - self.hyperparameters.buffer_init_steps
+            self._step - self.hyperparameters.buffer_init_steps
         ) / self.update_steps > self.steps_per_update:
-            logger.debug(f"Updating SAC policy at step {self.step}")
+            logger.debug(f"Updating SAC policy at step {self._step}")
             buffer = self.update_buffer
             if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:
                 sampled_minibatch = buffer.sample_mini_batch(
@@ -305,12 +305,12 @@ def _update_reward_signals(self) -> None:
         )
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
-            self.step - self.hyperparameters.buffer_init_steps
+            self._step - self.hyperparameters.buffer_init_steps
         ) / self.reward_signal_update_steps > self.reward_signal_steps_per_update:
             # Get minibatches for reward signal update if needed
             reward_signal_minibatches = {}
             for name in self.optimizer.reward_signals.keys():
-                logger.debug(f"Updating {name} at step {self.step}")
+                logger.debug(f"Updating {name} at step {self._step}")
                 if name != "extrinsic":
                     reward_signal_minibatches[name] = buffer.sample_mini_batch(
                         self.hyperparameters.batch_size,
@@ -355,11 +355,11 @@ def add_policy(
         self.model_saver.initialize_or_load()
 
         # Needed to resume loads properly
-        self.step = policy.get_current_step()
+        self._step = policy.get_current_step()
         # Assume steps were updated at the correct ratio before
-        self.update_steps = int(max(1, self.step / self.steps_per_update))
+        self.update_steps = int(max(1, self._step / self.steps_per_update))
         self.reward_signal_update_steps = int(
-            max(1, self.step / self.reward_signal_steps_per_update)
+            max(1, self._step / self.reward_signal_steps_per_update)
         )
 
     def get_policy(self, name_behavior_id: str) -> Policy:
diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
index 3ada3ef7b8..6584504d78 100644
--- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py
+++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
@@ -152,10 +152,10 @@ def _checkpoint(self) -> ModelCheckpoint:
             logger.warning(
                 "Trainer has multiple policies, but default behavior only saves the first."
             )
-        checkpoint_path = self.model_saver.save_checkpoint(self.brain_name, self.step)
+        checkpoint_path = self.model_saver.save_checkpoint(self.brain_name, self._step)
         export_ext = "onnx"
         new_checkpoint = ModelCheckpoint(
-            int(self.step),
+            int(self._step),
             f"{checkpoint_path}.{export_ext}",
             self._policy_mean_reward(),
             time.time(),
@@ -199,7 +199,7 @@ def _increment_step(self, n_steps: int, name_behavior_id: str) -> None:
         Increment the step count of the trainer
         :param n_steps: number of steps to increment the step count by
         """
-        self.step += n_steps
+        self._step += n_steps
         self._next_summary_step = self._get_next_interval_step(self.summary_freq)
         self._next_save_step = self._get_next_interval_step(
             self.trainer_settings.checkpoint_interval
@@ -213,7 +213,7 @@ def _get_next_interval_step(self, interval: int) -> int:
         Get the next step count that should result in an action.
         :param interval: The interval between actions.
         """
-        return self.step + (interval - self.step % interval)
+        return self._step + (interval - self._step % interval)
 
     def _write_summary(self, step: int) -> None:
         """
diff --git a/ml-agents/mlagents/trainers/trainer/trainer.py b/ml-agents/mlagents/trainers/trainer/trainer.py
index 55ac5a9ef1..f51be84169 100644
--- a/ml-agents/mlagents/trainers/trainer/trainer.py
+++ b/ml-agents/mlagents/trainers/trainer/trainer.py
@@ -45,7 +45,7 @@ def __init__(
         self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
         self.policy_queues: List[AgentManagerQueue[Policy]] = []
         self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
-        self.step: int = 0
+        self._step: int = 0
         self.artifact_path = artifact_path
         self.summary_freq = self.trainer_settings.summary_freq
         self.policies: Dict[str, Policy] = {}
@@ -78,7 +78,7 @@ def get_step(self) -> int:
         Returns the number of steps the trainer has performed
         :return: the step count of the trainer
         """
-        return self.step
+        return self._step
 
     @property
     def threaded(self) -> bool:
diff --git a/ml-agents/mlagents/trainers/trainer_controller.py b/ml-agents/mlagents/trainers/trainer_controller.py
index c4a60f8b3a..de9c2271ae 100644
--- a/ml-agents/mlagents/trainers/trainer_controller.py
+++ b/ml-agents/mlagents/trainers/trainer_controller.py
@@ -208,7 +208,7 @@ def end_trainer_episodes(self) -> None:
     def reset_env_if_ready(self, env: EnvManager) -> None:
         # Get the sizes of the reward buffers.
         reward_buff = {k: list(t.reward_buffer) for (k, t) in self.trainers.items()}
-        curr_step = {k: int(t.step) for (k, t) in self.trainers.items()}
+        curr_step = {k: int(t.get_step) for (k, t) in self.trainers.items()}
         max_step = {k: int(t.get_max_steps) for (k, t) in self.trainers.items()}
         # Attempt to increment the lessons of the brains who
         # were ready.

From 41455736e24435b77dd9703b54ac717d6f17611d Mon Sep 17 00:00:00 2001
From: Chris Goy
Date: Fri, 12 Mar 2021 10:52:45 -0800
Subject: [PATCH 3/3] Update changelog for samples. (#5103)

---
 com.unity.ml-agents/CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index 8b7afc94c0..cf151d111b 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -32,6 +32,7 @@ and this project adheres to
 ### Minor Changes
 #### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
 - Updated com.unity.barracuda to 1.3.2-preview. (#5084)
+- Added 3D Ball to the `com.unity.ml-agents` samples. (#5077)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
 - The `encoding_size` setting for RewardSignals has been deprecated. Please use `network_settings` instead. (#4982)
 - Sensor names are now passed through to `ObservationSpec.name`. (#5036)
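
Note (illustration, not part of the patch series): every hunk in PATCH 2/3 applies the same refactor — the trainers' step counter becomes a private `_step` attribute, and external readers such as trainer_controller.py go through the `get_step` property instead of the raw attribute, which is what lets the self-play curriculum see the trainer's real step count. The Python sketch below restates that pattern with a hypothetical SketchTrainer class; it assumes `get_step` is exposed as a property (consistent with `int(t.get_step)` above) and is not code from the ML-Agents repository.

    # Minimal, self-contained sketch of the step-privatization pattern above.
    class SketchTrainer:
        """Hypothetical stand-in for the Trainer classes touched by PATCH 2/3."""

        def __init__(self) -> None:
            self._step: int = 0  # private counter, as in trainer/trainer.py after the patch

        @property
        def get_step(self) -> int:
            """Returns the number of steps the trainer has performed."""
            return self._step

        def _increment_step(self, n_steps: int) -> None:
            # Advance the private counter, as rl_trainer.py does after the patch.
            self._step += n_steps


    trainers = {"PushBlockCollab": SketchTrainer()}
    trainers["PushBlockCollab"]._increment_step(500)
    # Curriculum logic reads the property, not the (now private) attribute:
    curr_step = {k: int(t.get_step) for (k, t) in trainers.items()}
    assert curr_step["PushBlockCollab"] == 500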