more pylint fixes #2842
Changes from 5 commits
@@ -9,9 +9,9 @@ class BaseUnityEnvironment(ABC):
     def step(
         self,
         vector_action: Optional[Dict] = None,
         memory: Optional[Dict] = None,
         text_action: Optional[Dict] = None,
         value: Optional[Dict] = None,
+        custom_action: Dict[str, Any] = None,
     ) -> AllBrainInfo:
         pass

Review comment (on custom_action): These are being passed (for now) by the implementations, but will likely go away soon.
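Since this PR is about pylint fixes, it may help to see the check these signature changes address. The sketch below is illustrative only (the class names are made up and are not the actual ML-Agents classes): pylint's arguments-differ check (W0221) can flag an override whose parameters do not match the abstract base, and widening the base signature, as in the diff above, keeps the two consistent.

from abc import ABC, abstractmethod
from typing import Any, Dict, Optional


class BaseEnvSketch(ABC):
    @abstractmethod
    def step(self, vector_action: Optional[Dict] = None) -> None:
        ...


class EnvSketch(BaseEnvSketch):
    # pylint can report arguments-differ (W0221) here, because this override
    # takes a parameter that the abstract signature does not declare.
    def step(
        self,
        vector_action: Optional[Dict] = None,
        custom_action: Dict[str, Any] = None,
    ) -> None:
        pass

Running pylint with only that check enabled (for example: pylint --disable=all --enable=arguments-differ followed by the package path) is one way to reproduce this class of warning locally.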
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import List, Dict, NamedTuple, Optional
+from typing import Any, List, Dict, NamedTuple, Optional
 from mlagents.envs.brain import AllBrainInfo, BrainParameters
 from mlagents.envs.policy import Policy
 from mlagents.envs.action_info import ActionInfo

@@ -24,7 +24,10 @@ def step(self) -> List[EnvironmentStep]:
     @abstractmethod
     def reset(
-        self, config: Dict = None, train_mode: bool = True
+        self,
+        config: Dict = None,
+        train_mode: bool = True,
+        custom_reset_parameters: Any = None,
     ) -> List[EnvironmentStep]:
         pass

Review comment (on custom_reset_parameters): Being passed by implementations.
@@ -45,20 +45,20 @@ def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
     def add_experiences(
         self,
-        curr_info: AllBrainInfo,
-        next_info: AllBrainInfo,
+        curr_all_info: AllBrainInfo,
+        next_all_info: AllBrainInfo,
         take_action_outputs: ActionInfoOutputs,
     ) -> None:
         """
         Adds experiences to each agent's experience history.
-        :param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
-        :param next_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
+        :param curr_all_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
+        :param next_all_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
         :param take_action_outputs: The outputs of the take action method.
         """

         # Used to collect information about student performance.
-        info_student = curr_info[self.brain_name]
-        next_info_student = next_info[self.brain_name]
+        info_student = curr_all_info[self.brain_name]
+        next_info_student = next_all_info[self.brain_name]
         for agent_id in info_student.agents:
             self.evaluation_buffer[agent_id].last_brain_info = info_student

Review comment (on curr_all_info): Renamed these to match parent.
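As a side note on why renaming to match the parent matters beyond silencing pylint, here is a generic illustration (not ML-Agents code, names are made up): callers that pass arguments by keyword through the base-class interface break when an override uses different parameter names.

class ParentSketch:
    def add_experiences(self, curr_all_info, next_all_info):
        raise NotImplementedError


class MismatchedChild(ParentSketch):
    def add_experiences(self, curr_info, next_info):  # names differ from the parent
        pass


class MatchingChild(ParentSketch):
    def add_experiences(self, curr_all_info, next_all_info):  # names match the parent
        pass


MatchingChild().add_experiences(curr_all_info={}, next_all_info={})  # works
# MismatchedChild().add_experiences(curr_all_info={}, next_all_info={})  # TypeError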
@@ -126,7 +126,7 @@ def check_config(
     def prepare_update(
         self,
         policy_model: LearningModel,
-        mini_batch_policy: Dict[str, np.ndarray],
+        mini_batch: Dict[str, np.ndarray],
         num_sequences: int,
     ) -> Dict[tf.Tensor, Any]:
         """

@@ -136,42 +136,40 @@ def prepare_update(
         :return: Feed_dict for update process.
         """
         max_num_experiences = min(
-            len(mini_batch_policy["actions"]),
+            len(mini_batch["actions"]),
             len(self.demonstration_buffer.update_buffer["actions"]),
         )
         # If num_sequences is less, we need to shorten the input batch.
-        for key, element in mini_batch_policy.items():
-            mini_batch_policy[key] = element[:max_num_experiences]
+        for key, element in mini_batch.items():
+            mini_batch[key] = element[:max_num_experiences]

         # Get batch from demo buffer
         mini_batch_demo = self.demonstration_buffer.update_buffer.sample_mini_batch(
-            len(mini_batch_policy["actions"]), 1
+            len(mini_batch["actions"]), 1
         )

         feed_dict: Dict[tf.Tensor, Any] = {
             self.model.done_expert_holder: mini_batch_demo["done"],
-            self.model.done_policy_holder: mini_batch_policy["done"],
+            self.model.done_policy_holder: mini_batch["done"],
         }

         if self.model.use_vail:
             feed_dict[self.model.use_noise] = [1]

         feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
         if self.policy.use_continuous_act:
-            feed_dict[policy_model.selected_actions] = mini_batch_policy["actions"]
+            feed_dict[policy_model.selected_actions] = mini_batch["actions"]
         else:
-            feed_dict[policy_model.action_holder] = mini_batch_policy["actions"]
+            feed_dict[policy_model.action_holder] = mini_batch["actions"]

         if self.policy.use_vis_obs > 0:
             for i in range(len(policy_model.visual_in)):
-                feed_dict[policy_model.visual_in[i]] = mini_batch_policy[
-                    "visual_obs%d" % i
-                ]
+                feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
                 feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[
                     "visual_obs%d" % i
                 ]
         if self.policy.use_vec_obs:
-            feed_dict[policy_model.vector_in] = mini_batch_policy["vector_obs"]
+            feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
             feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]
         self.has_updated = True
         return feed_dict

Review comment (on mini_batch): Renamed to match parent.
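To make the truncation logic above a bit more concrete, here is a stripped-down sketch of the size matching it performs (plain NumPy with illustrative values, not the actual GAIL code): the policy mini-batch is shortened to the smaller of the two sizes so the policy and demonstration batches end up the same length.

import numpy as np

mini_batch = {"actions": np.arange(10), "done": np.zeros(10)}
demo_actions = np.arange(6)  # pretend the demo buffer holds fewer experiences

# Shorten every array in the policy batch to the smaller of the two sizes.
max_num_experiences = min(len(mini_batch["actions"]), len(demo_actions))
for key, element in mini_batch.items():
    mini_batch[key] = element[:max_num_experiences]

assert all(len(v) == max_num_experiences for v in mini_batch.values())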
@@ -79,15 +79,15 @@ def __init__(
         self.collected_rewards[_reward_signal] = {}

     def process_experiences(
-        self, current_info: AllBrainInfo, new_info: AllBrainInfo
+        self, current_info: AllBrainInfo, next_info: AllBrainInfo
     ) -> None:
         """
         Checks agent histories for processing condition, and processes them as necessary.
         Processing involves calculating value and advantage targets for model updating step.
         :param current_info: Dictionary of all current brains and corresponding BrainInfo.
-        :param new_info: Dictionary of all next brains and corresponding BrainInfo.
+        :param next_info: Dictionary of all next brains and corresponding BrainInfo.
         """
-        info = new_info[self.brain_name]
+        info = next_info[self.brain_name]
         if self.is_training:
             self.policy.update_normalization(info.vector_observations)
         for l in range(len(info.agents)):

Review comment (on next_info): We seemed to be 50/50 on new vs. next for the names here. I went with next_info because it seems more consistent with current_info.

@@ -228,7 +228,7 @@ def update_policy(self):
             number_experiences=buffer_length,
             mean_return=float(np.mean(self.cumulative_returns_since_policy_update)),
         )
-        self.cumulative_returns_since_policy_update = []
+        self.cumulative_returns_since_policy_update.clear()

         # Make sure batch_size is a multiple of sequence length. During training, we
         # will need to reshape the data into a batch_size x sequence_length tensor.
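One detail worth calling out in the hunk above: calling clear() empties the existing list in place, whereas the old "= []" rebound the attribute to a brand-new list. The difference only matters if something else holds a reference to the same list; a small stand-alone illustration with hypothetical variable names:

returns = [1.0, 2.0]
alias = returns        # e.g. another object keeping a reference to the same list

returns.clear()        # in-place: the alias sees the emptied list
assert alias == []

returns = [1.0, 2.0]
alias = returns
returns = []           # rebinding: the alias still holds the old, untouched list
assert alias == [1.0, 2.0]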
@@ -1,6 +1,6 @@
 import logging
 import numpy as np
-from typing import Optional
+from typing import Dict, List, Optional

 import tensorflow as tf
 from mlagents.trainers.models import LearningModel, LearningRateSchedule, EncoderType

@@ -46,9 +46,41 @@ def __init__(
         self.activ_fn = self.swish

+        self.policy_memory_in: Optional[tf.Tensor] = None
+        self.policy_memory_out: Optional[tf.Tensor] = None
+        self.value_memory_in: Optional[tf.Tensor] = None
+        self.value_memory_out: Optional[tf.Tensor] = None
+        self.q1: Optional[tf.Tensor] = None
+        self.q2: Optional[tf.Tensor] = None
+        self.q1_p: Optional[tf.Tensor] = None
+        self.q2_p: Optional[tf.Tensor] = None
+        self.q1_memory_in: Optional[tf.Tensor] = None
+        self.q2_memory_in: Optional[tf.Tensor] = None
+        self.q1_memory_out: Optional[tf.Tensor] = None
+        self.q2_memory_out: Optional[tf.Tensor] = None
+        self.action_holder: Optional[tf.Tensor] = None
+        self.prev_action: Optional[tf.Tensor] = None
+        self.action_masks: Optional[tf.Tensor] = None
+        self.external_action_in: Optional[tf.Tensor] = None
+        self.log_sigma_sq: Optional[tf.Tensor] = None
+        self.entropy: Optional[tf.Tensor] = None
+        self.deterministic_output: Optional[tf.Tensor] = None
+        self.all_log_probs: Optional[tf.Tensor] = None
+        self.normalized_logprobs: Optional[tf.Tensor] = None
+        self.action_probs: Optional[tf.Tensor] = None
+        self.selected_actions: Optional[tf.Tensor] = None
+        self.output: Optional[tf.Tensor] = None
+        self.output_oh: Optional[tf.Tensor] = None
+        self.output_pre: Optional[tf.Tensor] = None

         self.value_vars = None
         self.q_vars = None
         self.critic_vars = None
         self.policy_vars = None

+        self.q1_heads: Optional[Dict[str, tf.Tensor]] = None
+        self.q2_heads: Optional[Dict[str, tf.Tensor]] = None
+        self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None
+        self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None

     def get_vars(self, scope):
         return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

Review thread (on the block of attribute declarations above):
- Not sure if there's a better way to group all of these.
- Ahh, this is ugly (one more reason to move to TF 2.0). I can take a stab at grouping by comment (e.g. divvy up policy vs. critic vs. output tensors), but they are all needed. I'm assuming we'll have a similar block in the PPO model?
- It's OK as it is. One way to DRY it up a bit would be something like the nested-class grouping sketched below, but that might not be a good abstraction (and probably not a good name). Surprisingly, it's not complaining about PPO right now.
- Hmm, the class nesting doesn't seem very obvious. I'd almost want a group that is model inputs and one that is model outputs, but that seems to require more code changes, so let's leave it for now.
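As an illustration of the kind of nested-class grouping discussed in the thread above (a hedged sketch only; the class and attribute names here are hypothetical and not from the PR):

from typing import Optional

import tensorflow as tf


class SACNetworkSketch:
    """Illustrative stand-in for the SAC model class in the diff above."""

    class MemoryTensors:
        """Hypothetical nested grouping for one sub-network's recurrent tensors."""

        def __init__(self) -> None:
            self.memory_in: Optional[tf.Tensor] = None
            self.memory_out: Optional[tf.Tensor] = None

    def __init__(self) -> None:
        # One small group per sub-network instead of a flat list of attributes.
        self.policy_memory = self.MemoryTensors()  # replaces policy_memory_in/out
        self.value_memory = self.MemoryTensors()   # replaces value_memory_in/out
        self.q1_memory = self.MemoryTensors()      # replaces q1_memory_in/out
        self.q2_memory = self.MemoryTensors()      # replaces q2_memory_in/out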
@@ -350,7 +382,6 @@ def create_sac_value_head(
         :param h_size: size of hidden layers for value network
         :param scope: TF scope for value network.
         """
-        self.value_heads = {}
         with tf.variable_scope(scope):
             value_hidden = self.create_vector_observation_encoder(
                 hidden_input, h_size, self.activ_fn, num_layers, "encoder", False

@@ -676,6 +707,12 @@ def __init__(
         if num_layers < 1:
             num_layers = 1

+        self.target_init_op: List[tf.Tensor] = []
+        self.target_update_op: List[tf.Tensor] = []
+        self.update_batch_policy: Optional[tf.Operation] = None
+        self.update_batch_value: Optional[tf.Operation] = None
+        self.update_batch_entropy: Optional[tf.Operation] = None

         self.policy_network = SACPolicyNetwork(
             brain=brain,
             m_size=m_size,
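For context on why these blocks of "= None" declarations were added (an assumption on my part, consistent with the pylint theme of this PR): pylint's attribute-defined-outside-init check (W0201) warns when an instance attribute is first assigned outside __init__, and pre-declaring placeholders in __init__ is a common way to satisfy it. A minimal illustration with made-up names:

class ModelSketch:
    def __init__(self) -> None:
        # Declared here so pylint's attribute-defined-outside-init (W0201)
        # check does not fire when the real value is assigned later.
        self.output = None

    def build(self) -> None:
        self.output = 42  # the real assignment happens during graph construction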
@@ -188,7 +188,12 @@ def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]:
     @timed
     def update(
-        self, mini_batch: Dict[str, Any], num_sequences: int, update_target: bool = True
+        # pylint: disable=arguments-differ
+        # TODO ervteng FIX ME
+        self,
+        mini_batch: Dict[str, Any],
+        num_sequences: int,
+        update_target: bool = True,
     ) -> Dict[str, float]:
         """
         Updates model using buffer.

Review thread (on the pylint disable and TODO above):
- @ervteng Can you take a look at this?
- Hmm, this is a subtlety of the SAC algorithm. There are two value networks, a target and a current one. The target is used to bootstrap the current one, and is updated periodically towards the weights of the current. Most implementations of SAC let you vary when this update happens. Initially, I had an additional hyperparameter that did just that, but removed it because realistically you can get away with just adjusting tau (and fewer hyperparameters are better for most users). I think it's safe to remove it for now. A researcher might want to vary this, but then again a researcher can edit the Python code. Perhaps we can add a comment; I'll add it below.
- Line 213:
- Sounds good, I'll apply that. If you want to keep the flexibility, you could do something like move all the current
- 👍 I'll probably do something like this when we rehaul the Policy and Trainer APIs, but will hold off for this particular version of the code.
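For readers new to the target-network detail discussed in the thread above: in many SAC implementations the target network's weights are moved a fraction tau toward the current network's weights each time the update runs, which is what makes tau alone a reasonable knob. The snippet below is a generic, framework-free sketch with illustrative names, not the ML-Agents implementation:

import numpy as np

def soft_update(target_weights, current_weights, tau):
    # new_target = tau * current + (1 - tau) * old_target, element-wise
    return [
        tau * current + (1.0 - tau) * target
        for current, target in zip(current_weights, target_weights)
    ]

# With a small tau (e.g. 0.005) the target network tracks the current one slowly.
target = [np.zeros(3)]
current = [np.ones(3)]
target = soft_update(target, current, tau=0.005)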
Review comment: This was missed in the memory cleanup.