more pylint fixes #2842

Merged: 6 commits merged on Nov 4, 2019

Changes from 5 commits
14 changes: 4 additions & 10 deletions .pylintrc
@@ -12,9 +12,10 @@ disable =
# disabled because black handles this
C0301,C0330,

# C0115: Missing class docstring
# C0114: Missing module docstring
C0115,C0114,
# C0115: Missing class docstring
# C0116: Missing function or method docstring
C0114,C0115,C0116,

# All convention and refactor for now
C,R,
@@ -30,19 +31,12 @@ disable =
# W0107: Unnecessary pass statement
W0107,

# W0511 TODO
# W0511 "TODO"
W0511,

# W0703: Catching too general exception Exception
W0703,

# W0201: Attribute '...' defined outside __init__
W0201,

# We should fix these up ASAP
# W0221: Parameters differ from overridden
W0221,

# E0401: Unable to import...
# E0611: No name '...' in module '...'
# need to look into these, probably namespace packages
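
For context on the checks touched above: removing W0221 ("Parameters differ from overridden") and W0201 ("Attribute defined outside __init__") from the disable list means the rest of this diff either fixes the offending code or silences a single call site with pylint's inline comment syntax. A minimal sketch of the W0221 case handled later in sac/policy.py (the class and parameter names here are hypothetical, not ml-agents code):

class Policy:
    def update(self, mini_batch, num_sequences):
        raise NotImplementedError

class SACLikePolicy(Policy):
    def update(  # pylint: disable=arguments-differ
        self, mini_batch, num_sequences, update_target=True
    ):
        # The extra update_target parameter is what triggers W0221; the inline
        # disable suppresses it only for this definition.
        return {"num_sequences": num_sequences, "update_target": update_target}
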
2 changes: 1 addition & 1 deletion ml-agents-envs/mlagents/envs/base_unity_environment.py
@@ -9,9 +9,9 @@ class BaseUnityEnvironment(ABC):
def step(
self,
vector_action: Optional[Dict] = None,
memory: Optional[Dict] = None,

Contributor Author: This was missed in the memory cleanup.

text_action: Optional[Dict] = None,
value: Optional[Dict] = None,
custom_action: Dict[str, Any] = None,

Contributor Author: These are being passed (for now) by the implementations, but will likely go away soon.

) -> AllBrainInfo:
pass

7 changes: 5 additions & 2 deletions ml-agents-envs/mlagents/envs/env_manager.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import List, Dict, NamedTuple, Optional
from typing import Any, List, Dict, NamedTuple, Optional
from mlagents.envs.brain import AllBrainInfo, BrainParameters
from mlagents.envs.policy import Policy
from mlagents.envs.action_info import ActionInfo
@@ -24,7 +24,10 @@ def step(self) -> List[EnvironmentStep]:

@abstractmethod
def reset(
self, config: Dict = None, train_mode: bool = True
self,
config: Dict = None,
train_mode: bool = True,
custom_reset_parameters: Any = None,

Contributor Author: Being passed by implementations.

) -> List[EnvironmentStep]:
pass

12 changes: 6 additions & 6 deletions ml-agents/mlagents/trainers/bc/trainer.py
@@ -45,20 +45,20 @@ def __init__(self, brain, trainer_parameters, training, load, seed, run_id):

def add_experiences(
self,
curr_info: AllBrainInfo,
next_info: AllBrainInfo,
curr_all_info: AllBrainInfo,

Contributor Author: Renamed these to match parent.

next_all_info: AllBrainInfo,
take_action_outputs: ActionInfoOutputs,
) -> None:
"""
Adds experiences to each agent's experience history.
:param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
:param next_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
:param curr_all_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
:param next_all_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
:param take_action_outputs: The outputs of the take action method.
"""

# Used to collect information about student performance.
info_student = curr_info[self.brain_name]
next_info_student = next_info[self.brain_name]
info_student = curr_all_info[self.brain_name]
next_info_student = next_all_info[self.brain_name]
for agent_id in info_student.agents:
self.evaluation_buffer[agent_id].last_brain_info = info_student

@@ -1,4 +1,4 @@
from typing import Tuple, List
from typing import List, Optional, Tuple

import tensorflow as tf
from mlagents.trainers.models import LearningModel
@@ -37,6 +37,10 @@ def __init__(
self.gradient_penalty_weight = gradient_penalty_weight
self.use_vail = use_vail
self.use_actions = use_actions # True # Not using actions

self.noise: Optional[tf.Tensor] = None
self.z: Optional[tf.Tensor] = None

self.make_inputs()
self.create_network()
self.create_loss(learning_rate)
@@ -126,7 +126,7 @@ def check_config(
def prepare_update(
self,
policy_model: LearningModel,
mini_batch_policy: Dict[str, np.ndarray],
mini_batch: Dict[str, np.ndarray],

Contributor Author: Renamed to match parent.

num_sequences: int,
) -> Dict[tf.Tensor, Any]:
"""
@@ -136,42 +136,40 @@ def prepare_update(
:return: Feed_dict for update process.
"""
max_num_experiences = min(
len(mini_batch_policy["actions"]),
len(mini_batch["actions"]),
len(self.demonstration_buffer.update_buffer["actions"]),
)
# If num_sequences is less, we need to shorten the input batch.
for key, element in mini_batch_policy.items():
mini_batch_policy[key] = element[:max_num_experiences]
for key, element in mini_batch.items():
mini_batch[key] = element[:max_num_experiences]

# Get batch from demo buffer
mini_batch_demo = self.demonstration_buffer.update_buffer.sample_mini_batch(
len(mini_batch_policy["actions"]), 1
len(mini_batch["actions"]), 1
)

feed_dict: Dict[tf.Tensor, Any] = {
self.model.done_expert_holder: mini_batch_demo["done"],
self.model.done_policy_holder: mini_batch_policy["done"],
self.model.done_policy_holder: mini_batch["done"],
}

if self.model.use_vail:
feed_dict[self.model.use_noise] = [1]

feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
if self.policy.use_continuous_act:
feed_dict[policy_model.selected_actions] = mini_batch_policy["actions"]
feed_dict[policy_model.selected_actions] = mini_batch["actions"]
else:
feed_dict[policy_model.action_holder] = mini_batch_policy["actions"]
feed_dict[policy_model.action_holder] = mini_batch["actions"]

if self.policy.use_vis_obs > 0:
for i in range(len(policy_model.visual_in)):
feed_dict[policy_model.visual_in[i]] = mini_batch_policy[
"visual_obs%d" % i
]
feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[
"visual_obs%d" % i
]
if self.policy.use_vec_obs:
feed_dict[policy_model.vector_in] = mini_batch_policy["vector_obs"]
feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]
self.has_updated = True
return feed_dict
9 changes: 7 additions & 2 deletions ml-agents/mlagents/trainers/models.py
@@ -1,6 +1,6 @@
import logging
from enum import Enum
from typing import Callable, List
from typing import Callable, Dict, List, Optional

import numpy as np
import tensorflow as tf
@@ -85,6 +85,12 @@ def __init__(
trainable=False,
dtype=tf.int32,
)
self.value_heads: Dict[str, tf.Tensor] = {}
self.normalization_steps: Optional[tf.Variable] = None
self.running_mean: Optional[tf.Variable] = None
self.running_variance: Optional[tf.Variable] = None
self.update_normalization: Optional[tf.Operation] = None
self.value: Optional[tf.Tensor] = None

@staticmethod
def create_global_steps():
@@ -573,7 +579,6 @@ def create_value_heads(self, stream_names, hidden_input):
:param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
of the hidden input.
"""
self.value_heads = {}
for name in stream_names:
value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
self.value_heads[name] = value
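
The blocks of Optional[...] = None declarations added to LearningModel above (and to the PPO and SAC models below) are the kind of fix that re-enabling W0201 calls for: every attribute that a later method assigns should already exist after __init__ runs, typed as Optional when it is only built later. A minimal sketch of the pattern with an illustrative class (not the actual ml-agents code):

from typing import Dict, Optional

import tensorflow as tf

class ExampleModel:
    def __init__(self) -> None:
        # Declare everything up front; tensors built later start out as None.
        self.value_heads: Dict[str, tf.Tensor] = {}
        self.value: Optional[tf.Tensor] = None

    def create_value_heads(self, stream_names, hidden_input) -> None:
        # Populate (rather than re-create) the dict so the only definition of
        # self.value_heads stays in __init__.
        for name in stream_names:
            self.value_heads[name] = tf.layers.dense(
                hidden_input, 1, name="{}_value".format(name)
            )
        self.value = tf.reduce_mean(list(self.value_heads.values()), axis=0)
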
9 changes: 8 additions & 1 deletion ml-agents/mlagents/trainers/ppo/models.py
@@ -1,7 +1,9 @@
import logging
import numpy as np
from typing import Optional

import numpy as np
import tensorflow as tf

from mlagents.trainers.models import LearningModel, EncoderType, LearningRateSchedule

logger = logging.getLogger("mlagents.trainers")
@@ -46,6 +48,11 @@ def __init__(
LearningModel.__init__(
self, m_size, normalize, use_recurrent, brain, seed, stream_names
)

self.optimizer: Optional[tf.train.AdamOptimizer] = None
self.grads = None
self.update_batch: Optional[tf.Operation] = None

if num_layers < 1:
num_layers = 1
if brain.vector_action_space_type == "continuous":
8 changes: 4 additions & 4 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -79,15 +79,15 @@ def __init__(
self.collected_rewards[_reward_signal] = {}

def process_experiences(
self, current_info: AllBrainInfo, new_info: AllBrainInfo
self, current_info: AllBrainInfo, next_info: AllBrainInfo
) -> None:
"""
Checks agent histories for processing condition, and processes them as necessary.
Processing involves calculating value and advantage targets for model updating step.
:param current_info: Dictionary of all current brains and corresponding BrainInfo.
:param new_info: Dictionary of all next brains and corresponding BrainInfo.
:param next_info: Dictionary of all next brains and corresponding BrainInfo.

Contributor Author: We seemed to be 50/50 on "new" vs. "next" for the names here. I went with next_info because it seems more consistent with current_info.

"""
info = new_info[self.brain_name]
info = next_info[self.brain_name]
if self.is_training:
self.policy.update_normalization(info.vector_observations)
for l in range(len(info.agents)):
@@ -228,7 +228,7 @@ def update_policy(self):
number_experiences=buffer_length,
mean_return=float(np.mean(self.cumulative_returns_since_policy_update)),
)
self.cumulative_returns_since_policy_update = []
self.cumulative_returns_since_policy_update.clear()

# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/rl_trainer.py
@@ -248,7 +248,7 @@ def add_policy_outputs(
:param agent_idx: the index of the Agent agent_id
"""
raise UnityTrainerException(
"The process_experiences method was not implemented."
"The add_policy_outputs method was not implemented."
)

def add_rewards_outputs(
@@ -270,5 +270,5 @@ def add_rewards_outputs(
:param agent_next_idx: the index of the Agent agent_id in the next brain info
"""
raise UnityTrainerException(
"The process_experiences method was not implemented."
"The add_rewards_outputs method was not implemented."
)
41 changes: 39 additions & 2 deletions ml-agents/mlagents/trainers/sac/models.py
@@ -1,6 +1,6 @@
import logging
import numpy as np
from typing import Optional
from typing import Dict, List, Optional

import tensorflow as tf
from mlagents.trainers.models import LearningModel, LearningRateSchedule, EncoderType
@@ -46,9 +46,41 @@ def __init__(
self.activ_fn = self.swish

self.policy_memory_in: Optional[tf.Tensor] = None
self.policy_memory_out: Optional[tf.Tensor] = None

Contributor Author: Not sure if there's a better way to group all of these.

Contributor: Ahh, this is ugly (one more reason to move to TF 2.0). I can take a stab at grouping by comment (e.g. divvy up policy vs. critic vs. output tensors), but they are all needed.

I'm assuming we'll have a similar block in the PPO model?

Contributor Author: It's OK as it is. One way to DRY it up a bit would be something like

class QInfo:
    def __init__(self):
        self.q: Optional[tf.Tensor] = None
        self.q_p: Optional[tf.Tensor] = None
        self.q_memory_in: Optional[tf.Tensor] = None
        self.q_memory_out: Optional[tf.Tensor] = None
        self.q_heads: Optional[Dict[str, tf.Tensor]] = None
        self.q_pheads: Optional[Dict[str, tf.Tensor]] = None
        ...

class SACNetwork(LearningModel):
    def __init__(self):
        self.q1 = QInfo()
        self.q2 = QInfo()

but that might not be a good abstraction (and probably not a good name).

Surprisingly, it's not complaining about PPO right now.

Contributor: Hmm, the class nesting doesn't seem very obvious. I'd almost want a group that is model inputs and one that is model outputs, but that seems to require more code changes, so let's leave it for now.

self.value_memory_in: Optional[tf.Tensor] = None
self.value_memory_out: Optional[tf.Tensor] = None
self.q1: Optional[tf.Tensor] = None
self.q2: Optional[tf.Tensor] = None
self.q1_p: Optional[tf.Tensor] = None
self.q2_p: Optional[tf.Tensor] = None
self.q1_memory_in: Optional[tf.Tensor] = None
self.q2_memory_in: Optional[tf.Tensor] = None
self.q1_memory_out: Optional[tf.Tensor] = None
self.q2_memory_out: Optional[tf.Tensor] = None
self.action_holder: Optional[tf.Tensor] = None
self.prev_action: Optional[tf.Tensor] = None
self.action_masks: Optional[tf.Tensor] = None
self.external_action_in: Optional[tf.Tensor] = None
self.log_sigma_sq: Optional[tf.Tensor] = None
self.entropy: Optional[tf.Tensor] = None
self.deterministic_output: Optional[tf.Tensor] = None
self.all_log_probs: Optional[tf.Tensor] = None
self.normalized_logprobs: Optional[tf.Tensor] = None
self.action_probs: Optional[tf.Tensor] = None
self.selected_actions: Optional[tf.Tensor] = None
self.output: Optional[tf.Tensor] = None
self.output_oh: Optional[tf.Tensor] = None
self.output_pre: Optional[tf.Tensor] = None

self.value_vars = None
self.q_vars = None
self.critic_vars = None
self.policy_vars = None

self.q1_heads: Optional[Dict[str, tf.Tensor]] = None
self.q2_heads: Optional[Dict[str, tf.Tensor]] = None
self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None
self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None

def get_vars(self, scope):
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
@@ -350,7 +382,6 @@ def create_sac_value_head(
:param h_size: size of hidden layers for value network
:param scope: TF scope for value network.
"""
self.value_heads = {}
with tf.variable_scope(scope):
value_hidden = self.create_vector_observation_encoder(
hidden_input, h_size, self.activ_fn, num_layers, "encoder", False
@@ -676,6 +707,12 @@ def __init__(
if num_layers < 1:
num_layers = 1

self.target_init_op: List[tf.Tensor] = []
self.target_update_op: List[tf.Tensor] = []
self.update_batch_policy: Optional[tf.Operation] = None
self.update_batch_value: Optional[tf.Operation] = None
self.update_batch_entropy: Optional[tf.Operation] = None

self.policy_network = SACPolicyNetwork(
brain=brain,
m_size=m_size,
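
One way to realize the "model inputs vs. model outputs" grouping floated in the review thread above would be small container classes, roughly as below. This is purely illustrative: the split and the class names are assumptions, not part of this PR.

from typing import Optional

import tensorflow as tf

class SACModelInputs:
    def __init__(self) -> None:
        self.action_holder: Optional[tf.Tensor] = None
        self.prev_action: Optional[tf.Tensor] = None
        self.action_masks: Optional[tf.Tensor] = None
        self.external_action_in: Optional[tf.Tensor] = None

class SACModelOutputs:
    def __init__(self) -> None:
        self.output: Optional[tf.Tensor] = None
        self.output_pre: Optional[tf.Tensor] = None
        self.entropy: Optional[tf.Tensor] = None
        self.all_log_probs: Optional[tf.Tensor] = None

class SACNetworkLike:
    def __init__(self) -> None:
        # The network keeps two grouped attributes instead of a flat list of tensors.
        self.inputs = SACModelInputs()
        self.outputs = SACModelOutputs()
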
7 changes: 6 additions & 1 deletion ml-agents/mlagents/trainers/sac/policy.py
@@ -188,7 +188,12 @@ def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]:

@timed
def update(
self, mini_batch: Dict[str, Any], num_sequences: int, update_target: bool = True
# pylint: disable=arguments-differ
# TODO ervteng FIX ME
self,

Contributor Author: @ervteng Can you take a look at this? update_target is not in the base class signature. I only did a quick search, but it looked like we only ever call this with update_target=True; I wasn't 100% sure it was safe to remove.

Contributor: Hmm, this is a subtlety of the SAC algorithm. There are two value networks, a target and a current one. The target is used to bootstrap the current one, and is updated periodically towards the weights of the current.

Most implementations of SAC let you vary when this update happens. Initially, I had an additional hyperparameter that did just that, but removed it because realistically you can get away with just adjusting tau (and fewer hyperparameters are better for most users).

I think it's safe to remove it for now. A researcher might want to vary this, but then again a researcher can edit the Python code. Perhaps we can add a comment; I'll add it below.

Contributor: Line 213: # Update target network. By default, target update happens at every policy update.

Contributor Author: Sounds good, I'll apply that. If you want to keep the flexibility, you could do something like move all of the current update() into _update(), and make update() call _update() with update_target=True (or some other logic).

Contributor: 👍 I'll probably do something like this when we overhaul the Policy and Trainer APIs, but will hold off for this particular version of the code.

mini_batch: Dict[str, Any],
num_sequences: int,
update_target: bool = True,
) -> Dict[str, float]:
"""
Updates model using buffer.
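
For readers unfamiliar with the target-network detail discussed in the review thread above, here is a minimal sketch of the soft update the reviewer describes (illustrative only; the function and variable names are assumptions, not ml-agents code). Each target variable is nudged a fraction tau toward its counterpart in the current value network, and by default this happens at every policy update:

import tensorflow as tf

def make_soft_update_ops(source_vars, target_vars, tau=0.005):
    # target <- (1 - tau) * target + tau * source, one assign op per variable pair.
    return [
        target.assign((1.0 - tau) * target + tau * source)
        for source, target in zip(source_vars, target_vars)
    ]

# Usage sketch: build the ops once, then run them after each policy update.
# update_target_ops = make_soft_update_ops(value_vars, target_value_vars)
# sess.run(update_target_ops)
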
10 changes: 5 additions & 5 deletions ml-agents/mlagents/trainers/sac/trainer.py
@@ -5,7 +5,7 @@

import logging
from collections import defaultdict
from typing import List, Dict
from typing import Dict
import os

import numpy as np
@@ -159,14 +159,14 @@ def add_rewards_outputs(
)

def process_experiences(
self, current_info: AllBrainInfo, new_info: AllBrainInfo
self, current_info: AllBrainInfo, next_info: AllBrainInfo
) -> None:
"""
Checks agent histories for processing condition, and processes them as necessary.
:param current_info: Dictionary of all current brains and corresponding BrainInfo.
:param new_info: Dictionary of all next brains and corresponding BrainInfo.
:param next_info: Dictionary of all next brains and corresponding BrainInfo.
"""
info = new_info[self.brain_name]
info = next_info[self.brain_name]
if self.is_training:
self.policy.update_normalization(info.vector_observations)
for l in range(len(info.agents)):
@@ -254,7 +254,7 @@ def update_sac_policy(self) -> None:
is greater than 1 and the reward signals are not updated in parallel.
"""

self.cumulative_returns_since_policy_update: List[float] = []
self.cumulative_returns_since_policy_update.clear()
n_sequences = max(
int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
)
4 changes: 1 addition & 3 deletions ml-agents/mlagents/trainers/trainer.py
@@ -246,9 +246,7 @@ def add_experiences(
:param next_all_info: Dictionary of all current brains and corresponding BrainInfo.
:param take_action_outputs: The outputs of the Policy's get_action method.
"""
raise UnityTrainerException(
"The process_experiences method was not implemented."
)
raise UnityTrainerException("The add_experiences method was not implemented.")

def process_experiences(
self, current_info: AllBrainInfo, next_info: AllBrainInfo