From ffba52f64a029f2d765441110b9287418aab0dee Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 14:50:51 -0700 Subject: [PATCH 1/6] more pylint cleanup --- .pylintrc | 2 +- .../mlagents/envs/base_unity_environment.py | 2 +- ml-agents-envs/mlagents/envs/env_manager.py | 7 ++++-- ml-agents/mlagents/trainers/bc/trainer.py | 12 +++++----- .../components/reward_signals/gail/signal.py | 22 +++++++++---------- ml-agents/mlagents/trainers/ppo/trainer.py | 8 +++---- ml-agents/mlagents/trainers/rl_trainer.py | 4 ++-- ml-agents/mlagents/trainers/sac/trainer.py | 10 ++++----- ml-agents/mlagents/trainers/trainer.py | 4 +--- 9 files changed, 35 insertions(+), 36 deletions(-) diff --git a/.pylintrc b/.pylintrc index 3c82fafc31..4e4646f722 100644 --- a/.pylintrc +++ b/.pylintrc @@ -41,7 +41,7 @@ disable = # We should fix these up ASAP # W0221: Parameters differ from overridden - W0221, + #W0221, # E0401: Unable to import... # E0611: No name '...' in module '...' diff --git a/ml-agents-envs/mlagents/envs/base_unity_environment.py b/ml-agents-envs/mlagents/envs/base_unity_environment.py index b588f31cb9..72e90cd042 100644 --- a/ml-agents-envs/mlagents/envs/base_unity_environment.py +++ b/ml-agents-envs/mlagents/envs/base_unity_environment.py @@ -9,9 +9,9 @@ class BaseUnityEnvironment(ABC): def step( self, vector_action: Optional[Dict] = None, - memory: Optional[Dict] = None, text_action: Optional[Dict] = None, value: Optional[Dict] = None, + custom_action: Dict[str, Any] = None, ) -> AllBrainInfo: pass diff --git a/ml-agents-envs/mlagents/envs/env_manager.py b/ml-agents-envs/mlagents/envs/env_manager.py index 8f3499104e..48d4aab92b 100644 --- a/ml-agents-envs/mlagents/envs/env_manager.py +++ b/ml-agents-envs/mlagents/envs/env_manager.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import List, Dict, NamedTuple, Optional +from typing import Any, List, Dict, NamedTuple, Optional from mlagents.envs.brain import AllBrainInfo, BrainParameters from mlagents.envs.policy import Policy from mlagents.envs.action_info import ActionInfo @@ -24,7 +24,10 @@ def step(self) -> List[EnvironmentStep]: @abstractmethod def reset( - self, config: Dict = None, train_mode: bool = True + self, + config: Dict = None, + train_mode: bool = True, + custom_reset_parameters: Any = None, ) -> List[EnvironmentStep]: pass diff --git a/ml-agents/mlagents/trainers/bc/trainer.py b/ml-agents/mlagents/trainers/bc/trainer.py index 4bce079748..cf602fdbde 100644 --- a/ml-agents/mlagents/trainers/bc/trainer.py +++ b/ml-agents/mlagents/trainers/bc/trainer.py @@ -45,20 +45,20 @@ def __init__(self, brain, trainer_parameters, training, load, seed, run_id): def add_experiences( self, - curr_info: AllBrainInfo, - next_info: AllBrainInfo, + curr_all_info: AllBrainInfo, + next_all_info: AllBrainInfo, take_action_outputs: ActionInfoOutputs, ) -> None: """ Adds experiences to each agent's experience history. - :param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). - :param next_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). + :param curr_all_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). + :param next_all_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). :param take_action_outputs: The outputs of the take action method. """ # Used to collect information about student performance. 
- info_student = curr_info[self.brain_name] - next_info_student = next_info[self.brain_name] + info_student = curr_all_info[self.brain_name] + next_info_student = next_all_info[self.brain_name] for agent_id in info_student.agents: self.evaluation_buffer[agent_id].last_brain_info = info_student diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py index 5539805be5..14299a3d0f 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py @@ -126,7 +126,7 @@ def check_config( def prepare_update( self, policy_model: LearningModel, - mini_batch_policy: Dict[str, np.ndarray], + mini_batch: Dict[str, np.ndarray], num_sequences: int, ) -> Dict[tf.Tensor, Any]: """ @@ -136,21 +136,21 @@ def prepare_update( :return: Feed_dict for update process. """ max_num_experiences = min( - len(mini_batch_policy["actions"]), + len(mini_batch["actions"]), len(self.demonstration_buffer.update_buffer["actions"]), ) # If num_sequences is less, we need to shorten the input batch. - for key, element in mini_batch_policy.items(): - mini_batch_policy[key] = element[:max_num_experiences] + for key, element in mini_batch.items(): + mini_batch[key] = element[:max_num_experiences] # Get batch from demo buffer mini_batch_demo = self.demonstration_buffer.update_buffer.sample_mini_batch( - len(mini_batch_policy["actions"]), 1 + len(mini_batch["actions"]), 1 ) feed_dict: Dict[tf.Tensor, Any] = { self.model.done_expert_holder: mini_batch_demo["done"], - self.model.done_policy_holder: mini_batch_policy["done"], + self.model.done_policy_holder: mini_batch["done"], } if self.model.use_vail: @@ -158,20 +158,18 @@ def prepare_update( feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"]) if self.policy.use_continuous_act: - feed_dict[policy_model.selected_actions] = mini_batch_policy["actions"] + feed_dict[policy_model.selected_actions] = mini_batch["actions"] else: - feed_dict[policy_model.action_holder] = mini_batch_policy["actions"] + feed_dict[policy_model.action_holder] = mini_batch["actions"] if self.policy.use_vis_obs > 0: for i in range(len(policy_model.visual_in)): - feed_dict[policy_model.visual_in[i]] = mini_batch_policy[ - "visual_obs%d" % i - ] + feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i] feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[ "visual_obs%d" % i ] if self.policy.use_vec_obs: - feed_dict[policy_model.vector_in] = mini_batch_policy["vector_obs"] + feed_dict[policy_model.vector_in] = mini_batch["vector_obs"] feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"] self.has_updated = True return feed_dict diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 19d6a17eff..017072413a 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -79,15 +79,15 @@ def __init__( self.collected_rewards[_reward_signal] = {} def process_experiences( - self, current_info: AllBrainInfo, new_info: AllBrainInfo + self, current_info: AllBrainInfo, next_info: AllBrainInfo ) -> None: """ Checks agent histories for processing condition, and processes them as necessary. Processing involves calculating value and advantage targets for model updating step. :param current_info: Dictionary of all current brains and corresponding BrainInfo. 
- :param new_info: Dictionary of all next brains and corresponding BrainInfo. + :param next_info: Dictionary of all next brains and corresponding BrainInfo. """ - info = new_info[self.brain_name] + info = next_info[self.brain_name] if self.is_training: self.policy.update_normalization(info.vector_observations) for l in range(len(info.agents)): @@ -228,7 +228,7 @@ def update_policy(self): number_experiences=buffer_length, mean_return=float(np.mean(self.cumulative_returns_since_policy_update)), ) - self.cumulative_returns_since_policy_update = [] + self.cumulative_returns_since_policy_update.clear() # Make sure batch_size is a multiple of sequence length. During training, we # will need to reshape the data into a batch_size x sequence_length tensor. diff --git a/ml-agents/mlagents/trainers/rl_trainer.py b/ml-agents/mlagents/trainers/rl_trainer.py index 9cef6fb877..870cf3f9ff 100644 --- a/ml-agents/mlagents/trainers/rl_trainer.py +++ b/ml-agents/mlagents/trainers/rl_trainer.py @@ -248,7 +248,7 @@ def add_policy_outputs( :param agent_idx: the index of the Agent agent_id """ raise UnityTrainerException( - "The process_experiences method was not implemented." + "The add_policy_outputs method was not implemented." ) def add_rewards_outputs( @@ -270,5 +270,5 @@ def add_rewards_outputs( :param agent_next_idx: the index of the Agent agent_id in the next brain info """ raise UnityTrainerException( - "The process_experiences method was not implemented." + "The add_rewards_outputs method was not implemented." ) diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index c82a47e521..df3b8b2c9b 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -5,7 +5,7 @@ import logging from collections import defaultdict -from typing import List, Dict +from typing import Dict import os import numpy as np @@ -159,14 +159,14 @@ def add_rewards_outputs( ) def process_experiences( - self, current_info: AllBrainInfo, new_info: AllBrainInfo + self, current_info: AllBrainInfo, next_info: AllBrainInfo ) -> None: """ Checks agent histories for processing condition, and processes them as necessary. :param current_info: Dictionary of all current brains and corresponding BrainInfo. - :param new_info: Dictionary of all next brains and corresponding BrainInfo. + :param next_info: Dictionary of all next brains and corresponding BrainInfo. """ - info = new_info[self.brain_name] + info = next_info[self.brain_name] if self.is_training: self.policy.update_normalization(info.vector_observations) for l in range(len(info.agents)): @@ -254,7 +254,7 @@ def update_sac_policy(self) -> None: is greater than 1 and the reward signals are not updated in parallel. """ - self.cumulative_returns_since_policy_update: List[float] = [] + self.cumulative_returns_since_policy_update.clear() n_sequences = max( int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1 ) diff --git a/ml-agents/mlagents/trainers/trainer.py b/ml-agents/mlagents/trainers/trainer.py index cc239c8668..796085368e 100644 --- a/ml-agents/mlagents/trainers/trainer.py +++ b/ml-agents/mlagents/trainers/trainer.py @@ -246,9 +246,7 @@ def add_experiences( :param next_all_info: Dictionary of all current brains and corresponding BrainInfo. :param take_action_outputs: The outputs of the Policy's get_action method. """ - raise UnityTrainerException( - "The process_experiences method was not implemented." 
- ) + raise UnityTrainerException("The add_experiences method was not implemented.") def process_experiences( self, current_info: AllBrainInfo, next_info: AllBrainInfo From 6b4bc1fc2134167a331ebc954e8ea3b9bb8cef6f Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 15:32:20 -0700 Subject: [PATCH 2/6] define some tf objects in inits --- .pylintrc | 6 +-- .../components/reward_signals/gail/model.py | 6 ++- ml-agents/mlagents/trainers/models.py | 9 +++- ml-agents/mlagents/trainers/sac/models.py | 41 ++++++++++++++++++- 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/.pylintrc b/.pylintrc index 4e4646f722..8274f6e261 100644 --- a/.pylintrc +++ b/.pylintrc @@ -37,11 +37,7 @@ disable = W0703, # W0201: Attribute '...' defined outside __init__ - W0201, - - # We should fix these up ASAP - # W0221: Parameters differ from overridden - #W0221, + #W0201, # E0401: Unable to import... # E0611: No name '...' in module '...' diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py index 7d106ddd97..0a8e3b711b 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py @@ -1,4 +1,4 @@ -from typing import Tuple, List +from typing import List, Optional, Tuple import tensorflow as tf from mlagents.trainers.models import LearningModel @@ -37,6 +37,10 @@ def __init__( self.gradient_penalty_weight = gradient_penalty_weight self.use_vail = use_vail self.use_actions = use_actions # True # Not using actions + + self.noise: Optional[tf.Tensor] = None + self.z: Optional[tf.Tensor] = None + self.make_inputs() self.create_network() self.create_loss(learning_rate) diff --git a/ml-agents/mlagents/trainers/models.py b/ml-agents/mlagents/trainers/models.py index 3896fbb567..70d779bb10 100644 --- a/ml-agents/mlagents/trainers/models.py +++ b/ml-agents/mlagents/trainers/models.py @@ -1,6 +1,6 @@ import logging from enum import Enum -from typing import Callable, List +from typing import Callable, Dict, List, Optional import numpy as np import tensorflow as tf @@ -85,6 +85,12 @@ def __init__( trainable=False, dtype=tf.int32, ) + self.value_heads: Dict[str, tf.Tensor] = {} + self.normalization_steps: Optional[tf.Variable] = None + self.running_mean: Optional[tf.Variable] = None + self.running_variance: Optional[tf.Variable] = None + self.update_normalization: Optional[tf.Operation] = None + self.value: Optional[tf.Tensor] = None @staticmethod def create_global_steps(): @@ -573,7 +579,6 @@ def create_value_heads(self, stream_names, hidden_input): :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top of the hidden input. 
""" - self.value_heads = {} for name in stream_names: value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name)) self.value_heads[name] = value diff --git a/ml-agents/mlagents/trainers/sac/models.py b/ml-agents/mlagents/trainers/sac/models.py index 390eb04ed9..1dd54e25de 100644 --- a/ml-agents/mlagents/trainers/sac/models.py +++ b/ml-agents/mlagents/trainers/sac/models.py @@ -1,6 +1,6 @@ import logging import numpy as np -from typing import Optional +from typing import Dict, List, Optional import tensorflow as tf from mlagents.trainers.models import LearningModel, LearningRateSchedule, EncoderType @@ -46,9 +46,41 @@ def __init__( self.activ_fn = self.swish self.policy_memory_in: Optional[tf.Tensor] = None + self.policy_memory_out: Optional[tf.Tensor] = None self.value_memory_in: Optional[tf.Tensor] = None + self.value_memory_out: Optional[tf.Tensor] = None + self.q1: Optional[tf.Tensor] = None + self.q2: Optional[tf.Tensor] = None + self.q1_p: Optional[tf.Tensor] = None + self.q2_p: Optional[tf.Tensor] = None self.q1_memory_in: Optional[tf.Tensor] = None self.q2_memory_in: Optional[tf.Tensor] = None + self.q1_memory_out: Optional[tf.Tensor] = None + self.q2_memory_out: Optional[tf.Tensor] = None + self.action_holder: Optional[tf.Tensor] = None + self.prev_action: Optional[tf.Tensor] = None + self.action_masks: Optional[tf.Tensor] = None + self.external_action_in: Optional[tf.Tensor] = None + self.log_sigma_sq: Optional[tf.Tensor] = None + self.entropy: Optional[tf.Tensor] = None + self.deterministic_output: Optional[tf.Tensor] = None + self.all_log_probs: Optional[tf.Tensor] = None + self.normalized_logprobs: Optional[tf.Tensor] = None + self.action_probs: Optional[tf.Tensor] = None + self.selected_actions: Optional[tf.Tensor] = None + self.output: Optional[tf.Tensor] = None + self.output_oh: Optional[tf.Tensor] = None + self.output_pre: Optional[tf.Tensor] = None + + self.value_vars = None + self.q_vars = None + self.critic_vars = None + self.policy_vars = None + + self.q1_heads: Optional[Dict[str, tf.Tensor]] = None + self.q2_heads: Optional[Dict[str, tf.Tensor]] = None + self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None + self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None def get_vars(self, scope): return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) @@ -350,7 +382,6 @@ def create_sac_value_head( :param h_size: size of hidden layers for value network :param scope: TF scope for value network. 
""" - self.value_heads = {} with tf.variable_scope(scope): value_hidden = self.create_vector_observation_encoder( hidden_input, h_size, self.activ_fn, num_layers, "encoder", False @@ -676,6 +707,12 @@ def __init__( if num_layers < 1: num_layers = 1 + self.target_init_op: List[tf.Tensor] = [] + self.target_update_op: List[tf.Tensor] = [] + self.update_batch_policy: Optional[tf.Operation] = None + self.update_batch_value: Optional[tf.Operation] = None + self.update_batch_entropy: Optional[tf.Operation] = None + self.policy_network = SACPolicyNetwork( brain=brain, m_size=m_size, From 40a0917661b3ec3c1dc2255cd8ca8158f1b302a4 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 15:42:38 -0700 Subject: [PATCH 3/6] more fixup --- ml-agents/mlagents/trainers/ppo/models.py | 9 ++++++++- ml-agents/mlagents/trainers/sac/policy.py | 7 ++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/models.py b/ml-agents/mlagents/trainers/ppo/models.py index deac591129..06cf0dadc5 100644 --- a/ml-agents/mlagents/trainers/ppo/models.py +++ b/ml-agents/mlagents/trainers/ppo/models.py @@ -1,7 +1,9 @@ import logging -import numpy as np +from typing import Optional +import numpy as np import tensorflow as tf + from mlagents.trainers.models import LearningModel, EncoderType, LearningRateSchedule logger = logging.getLogger("mlagents.trainers") @@ -46,6 +48,11 @@ def __init__( LearningModel.__init__( self, m_size, normalize, use_recurrent, brain, seed, stream_names ) + + self.optimizer: Optional[tf.train.AdamOptimizer] = None + self.grads = None + self.update_batch: Optional[tf.Operation] = None + if num_layers < 1: num_layers = 1 if brain.vector_action_space_type == "continuous": diff --git a/ml-agents/mlagents/trainers/sac/policy.py b/ml-agents/mlagents/trainers/sac/policy.py index ef13184f7e..02ba111947 100644 --- a/ml-agents/mlagents/trainers/sac/policy.py +++ b/ml-agents/mlagents/trainers/sac/policy.py @@ -188,7 +188,12 @@ def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]: @timed def update( - self, mini_batch: Dict[str, Any], num_sequences: int, update_target: bool = True + # pylint: disable=arguments-differ + # TODO ervteng FIX ME + self, + mini_batch: Dict[str, Any], + num_sequences: int, + update_target: bool = True, ) -> Dict[str, float]: """ Updates model using buffer. 
From b692198464421226c6548257e09cc1cda410daa6 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 16:02:24 -0700 Subject: [PATCH 4/6] pylint settings (fails without require_serial) --- .pre-commit-config.yaml | 1 - .pylintrc | 10 ++++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a1861765c..f083483106 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -66,7 +66,6 @@ repos: .*_pb2_grpc.py| .*/tests/.* )$ - require_serial: true # "Local" hooks, see https://pre-commit.com/#repository-local-hooks - repo: local diff --git a/.pylintrc b/.pylintrc index 8274f6e261..11d1f0b75c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -12,9 +12,10 @@ disable = # disabled because black handles this C0301,C0330, - # C0115: Missing class docstring # C0114: Missing module docstring - C0115,C0114, + # C0115: Missing class docstring + # C0116: Missing function or method docstring + C0114,C0115,C0116, # All convention and refactor for now C,R, @@ -30,15 +31,12 @@ disable = # W0107: Unnecessary pass statement W0107, - # W0511 TODO + # W0511 "TODO" W0511, # W0703: Catching too general exception Exception W0703, - # W0201: Attribute '...' defined outside __init__ - #W0201, - # E0401: Unable to import... # E0611: No name '...' in module '...' # need to look into these, probably namespace packages From f95f92a30e0bb88bf5dd726191a69a562567dc26 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 16:12:19 -0700 Subject: [PATCH 5/6] re-enable require_serial --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f083483106..2a1861765c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -66,6 +66,7 @@ repos: .*_pb2_grpc.py| .*/tests/.* )$ + require_serial: true # "Local" hooks, see https://pre-commit.com/#repository-local-hooks - repo: local From dffe80ec5b28faf156bf00720c8942df5fc2b04b Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Mon, 4 Nov 2019 09:20:13 -0800 Subject: [PATCH 6/6] make update() match parent --- ml-agents/mlagents/trainers/sac/policy.py | 11 +++-------- ml-agents/mlagents/trainers/sac/trainer.py | 4 +--- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/policy.py b/ml-agents/mlagents/trainers/sac/policy.py index 02ba111947..fa6eb10607 100644 --- a/ml-agents/mlagents/trainers/sac/policy.py +++ b/ml-agents/mlagents/trainers/sac/policy.py @@ -188,12 +188,7 @@ def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]: @timed def update( - # pylint: disable=arguments-differ - # TODO ervteng FIX ME - self, - mini_batch: Dict[str, Any], - num_sequences: int, - update_target: bool = True, + self, mini_batch: Dict[str, Any], num_sequences: int ) -> Dict[str, float]: """ Updates model using buffer. @@ -210,8 +205,8 @@ def update( update_vals = self._execute_model(feed_dict, self.update_dict) for stat_name, update_name in stats_needed.items(): update_stats[stat_name] = update_vals[update_name] - if update_target: - self.sess.run(self.model.target_update_op) + # Update target network. By default, target update happens at every policy update. 
+        self.sess.run(self.model.target_update_op)
         return update_stats
 
     def update_reward_signals(
diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py
index df3b8b2c9b..6810e5dd0d 100644
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
@@ -278,9 +278,7 @@ def update_sac_policy(self) -> None:
                 "{}_rewards".format(name)
             ] = signal.evaluate_batch(sampled_minibatch).scaled_reward
 
-            update_stats = self.policy.update(
-                sampled_minibatch, n_sequences, update_target=True
-            )
+            update_stats = self.policy.update(sampled_minibatch, n_sequences)
             for stat_name, value in update_stats.items():
                 batch_update_stats[stat_name].append(value)
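
Note for readers skimming the series: the recurring pattern in these patches is (1) keeping an override's signature identical to the parent's so pylint's W0221 (arguments-differ) check can stay enabled, and (2) declaring attributes in __init__ so W0201 (attribute-defined-outside-init) can be checked. Below is a minimal, hypothetical sketch of that pattern; the class and attribute names are illustrative only and are not taken from the ml-agents codebase.

from abc import ABC, abstractmethod
from typing import Any, Dict, Optional


class BasePolicy(ABC):
    # Hypothetical stand-in for the policy/trainer base classes touched above.
    @abstractmethod
    def update(self, mini_batch: Dict[str, Any], num_sequences: int) -> Dict[str, float]:
        raise NotImplementedError


class ExamplePolicy(BasePolicy):
    def __init__(self) -> None:
        # Declare attributes up front so W0201 (attribute-defined-outside-init)
        # does not fire when they are assigned later.
        self.last_loss: Optional[float] = None

    # The override keeps the parent's exact parameter list, so W0221
    # (arguments-differ) passes; extra behaviour such as the old
    # update_target flag becomes internal instead of a parameter.
    def update(self, mini_batch: Dict[str, Any], num_sequences: int) -> Dict[str, float]:
        self.last_loss = float(len(mini_batch)) / max(num_sequences, 1)
        return {"Losses/Example Loss": self.last_loss}

Keeping the override's signature identical to the parent's is what allows the call site in sac/trainer.py above to drop the update_target=True argument while still working against the generic policy interface.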