more pylint fixes #2842
Changes from 5 commits
@@ -9,9 +9,9 @@ class BaseUnityEnvironment(ABC):
     def step(
         self,
         vector_action: Optional[Dict] = None,
         memory: Optional[Dict] = None,
         text_action: Optional[Dict] = None,
         value: Optional[Dict] = None,
+        custom_action: Dict[str, Any] = None,
     ) -> AllBrainInfo:
         pass

Review comment (on custom_action): These are being passed (for now) by the implementations, but will likely go away soon.
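Since this PR is about pylint fixes, it may help to see the check these signature changes address. The sketch below is illustrative only (the class names are made up and are not the actual ML-Agents classes): pylint's arguments-differ check (W0221) can flag an override whose parameters do not match the abstract base, and widening the base signature, as in the diff above, keeps the two consistent.

from abc import ABC, abstractmethod
from typing import Any, Dict, Optional


class BaseEnvSketch(ABC):
    @abstractmethod
    def step(self, vector_action: Optional[Dict] = None) -> None:
        ...


class EnvSketch(BaseEnvSketch):
    # pylint can report arguments-differ (W0221) here, because this override
    # takes a parameter that the abstract signature does not declare.
    def step(
        self,
        vector_action: Optional[Dict] = None,
        custom_action: Dict[str, Any] = None,
    ) -> None:
        pass

Running pylint with only that check enabled (for example: pylint --disable=all --enable=arguments-differ followed by the package path) is one way to reproduce this class of warning locally.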
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import List, Dict, NamedTuple, Optional
+from typing import Any, List, Dict, NamedTuple, Optional
 from mlagents.envs.brain import AllBrainInfo, BrainParameters
 from mlagents.envs.policy import Policy
 from mlagents.envs.action_info import ActionInfo

@@ -24,7 +24,10 @@ def step(self) -> List[EnvironmentStep]:
     @abstractmethod
     def reset(
-        self, config: Dict = None, train_mode: bool = True
+        self,
+        config: Dict = None,
+        train_mode: bool = True,
+        custom_reset_parameters: Any = None,
     ) -> List[EnvironmentStep]:
         pass

Review comment (on custom_reset_parameters): Being passed by implementations.
@@ -45,20 +45,20 @@ def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
     def add_experiences(
         self,
-        curr_info: AllBrainInfo,
-        next_info: AllBrainInfo,
+        curr_all_info: AllBrainInfo,
+        next_all_info: AllBrainInfo,
         take_action_outputs: ActionInfoOutputs,
     ) -> None:
         """
         Adds experiences to each agent's experience history.
-        :param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
-        :param next_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
+        :param curr_all_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
+        :param next_all_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
         :param take_action_outputs: The outputs of the take action method.
         """

         # Used to collect information about student performance.
-        info_student = curr_info[self.brain_name]
-        next_info_student = next_info[self.brain_name]
+        info_student = curr_all_info[self.brain_name]
+        next_info_student = next_all_info[self.brain_name]
         for agent_id in info_student.agents:
             self.evaluation_buffer[agent_id].last_brain_info = info_student

Review comment (on curr_all_info): Renamed these to match parent.
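As a side note on why renaming to match the parent matters beyond silencing pylint, here is a generic illustration (not ML-Agents code, names are made up): callers that pass arguments by keyword through the base-class interface break when an override uses different parameter names.

class ParentSketch:
    def add_experiences(self, curr_all_info, next_all_info):
        raise NotImplementedError


class MismatchedChild(ParentSketch):
    def add_experiences(self, curr_info, next_info):  # names differ from the parent
        pass


class MatchingChild(ParentSketch):
    def add_experiences(self, curr_all_info, next_all_info):  # names match the parent
        pass


MatchingChild().add_experiences(curr_all_info={}, next_all_info={})  # works
# MismatchedChild().add_experiences(curr_all_info={}, next_all_info={})  # TypeError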
@@ -126,7 +126,7 @@ def check_config(
     def prepare_update(
         self,
         policy_model: LearningModel,
-        mini_batch_policy: Dict[str, np.ndarray],
+        mini_batch: Dict[str, np.ndarray],
         num_sequences: int,
     ) -> Dict[tf.Tensor, Any]:
         """

@@ -136,42 +136,40 @@ def prepare_update(
         :return: Feed_dict for update process.
         """
         max_num_experiences = min(
-            len(mini_batch_policy["actions"]),
+            len(mini_batch["actions"]),
             len(self.demonstration_buffer.update_buffer["actions"]),
         )
         # If num_sequences is less, we need to shorten the input batch.
-        for key, element in mini_batch_policy.items():
-            mini_batch_policy[key] = element[:max_num_experiences]
+        for key, element in mini_batch.items():
+            mini_batch[key] = element[:max_num_experiences]

         # Get batch from demo buffer
         mini_batch_demo = self.demonstration_buffer.update_buffer.sample_mini_batch(
-            len(mini_batch_policy["actions"]), 1
+            len(mini_batch["actions"]), 1
         )

         feed_dict: Dict[tf.Tensor, Any] = {
             self.model.done_expert_holder: mini_batch_demo["done"],
-            self.model.done_policy_holder: mini_batch_policy["done"],
+            self.model.done_policy_holder: mini_batch["done"],
         }

         if self.model.use_vail:
             feed_dict[self.model.use_noise] = [1]

         feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
         if self.policy.use_continuous_act:
-            feed_dict[policy_model.selected_actions] = mini_batch_policy["actions"]
+            feed_dict[policy_model.selected_actions] = mini_batch["actions"]
         else:
-            feed_dict[policy_model.action_holder] = mini_batch_policy["actions"]
+            feed_dict[policy_model.action_holder] = mini_batch["actions"]

         if self.policy.use_vis_obs > 0:
             for i in range(len(policy_model.visual_in)):
-                feed_dict[policy_model.visual_in[i]] = mini_batch_policy[
-                    "visual_obs%d" % i
-                ]
+                feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
                 feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[
                     "visual_obs%d" % i
                 ]
         if self.policy.use_vec_obs:
-            feed_dict[policy_model.vector_in] = mini_batch_policy["vector_obs"]
+            feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
             feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]
         self.has_updated = True
         return feed_dict

Review comment (on mini_batch): Renamed to match parent.
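To make the truncation logic above a bit more concrete, here is a stripped-down sketch of the size matching it performs (plain NumPy with illustrative values, not the actual GAIL code): the policy mini-batch is shortened to the smaller of the two sizes so the policy and demonstration batches end up the same length.

import numpy as np

mini_batch = {"actions": np.arange(10), "done": np.zeros(10)}
demo_actions = np.arange(6)  # pretend the demo buffer holds fewer experiences

# Shorten every array in the policy batch to the smaller of the two sizes.
max_num_experiences = min(len(mini_batch["actions"]), len(demo_actions))
for key, element in mini_batch.items():
    mini_batch[key] = element[:max_num_experiences]

assert all(len(v) == max_num_experiences for v in mini_batch.values())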
@@ -79,15 +79,15 @@ def __init__(
         self.collected_rewards[_reward_signal] = {}

     def process_experiences(
-        self, current_info: AllBrainInfo, new_info: AllBrainInfo
+        self, current_info: AllBrainInfo, next_info: AllBrainInfo
     ) -> None:
         """
         Checks agent histories for processing condition, and processes them as necessary.
         Processing involves calculating value and advantage targets for model updating step.
         :param current_info: Dictionary of all current brains and corresponding BrainInfo.
-        :param new_info: Dictionary of all next brains and corresponding BrainInfo.
+        :param next_info: Dictionary of all next brains and corresponding BrainInfo.
         """
-        info = new_info[self.brain_name]
+        info = next_info[self.brain_name]
         if self.is_training:
             self.policy.update_normalization(info.vector_observations)
         for l in range(len(info.agents)):

Review comment (on next_info): We seemed to be 50/50 on new vs. next for the names here. I went with next_info because it seems more consistent with current_info.

@@ -228,7 +228,7 @@ def update_policy(self):
             number_experiences=buffer_length,
             mean_return=float(np.mean(self.cumulative_returns_since_policy_update)),
         )
-        self.cumulative_returns_since_policy_update = []
+        self.cumulative_returns_since_policy_update.clear()

         # Make sure batch_size is a multiple of sequence length. During training, we
         # will need to reshape the data into a batch_size x sequence_length tensor.
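One detail worth calling out in the hunk above: calling clear() empties the existing list in place, whereas the old "= []" rebound the attribute to a brand-new list. The difference only matters if something else holds a reference to the same list; a small stand-alone illustration with hypothetical variable names:

returns = [1.0, 2.0]
alias = returns        # e.g. another object keeping a reference to the same list

returns.clear()        # in-place: the alias sees the emptied list
assert alias == []

returns = [1.0, 2.0]
alias = returns
returns = []           # rebinding: the alias still holds the old, untouched list
assert alias == [1.0, 2.0]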
@@ -1,6 +1,6 @@
 import logging
 import numpy as np
-from typing import Optional
+from typing import Dict, List, Optional

 import tensorflow as tf
 from mlagents.trainers.models import LearningModel, LearningRateSchedule, EncoderType

@@ -46,9 +46,41 @@ def __init__(
         self.activ_fn = self.swish

+        self.policy_memory_in: Optional[tf.Tensor] = None
+        self.policy_memory_out: Optional[tf.Tensor] = None
+        self.value_memory_in: Optional[tf.Tensor] = None
+        self.value_memory_out: Optional[tf.Tensor] = None
+        self.q1: Optional[tf.Tensor] = None
+        self.q2: Optional[tf.Tensor] = None
+        self.q1_p: Optional[tf.Tensor] = None
+        self.q2_p: Optional[tf.Tensor] = None
+        self.q1_memory_in: Optional[tf.Tensor] = None
+        self.q2_memory_in: Optional[tf.Tensor] = None
+        self.q1_memory_out: Optional[tf.Tensor] = None
+        self.q2_memory_out: Optional[tf.Tensor] = None
+        self.action_holder: Optional[tf.Tensor] = None
+        self.prev_action: Optional[tf.Tensor] = None
+        self.action_masks: Optional[tf.Tensor] = None
+        self.external_action_in: Optional[tf.Tensor] = None
+        self.log_sigma_sq: Optional[tf.Tensor] = None
+        self.entropy: Optional[tf.Tensor] = None
+        self.deterministic_output: Optional[tf.Tensor] = None
+        self.all_log_probs: Optional[tf.Tensor] = None
+        self.normalized_logprobs: Optional[tf.Tensor] = None
+        self.action_probs: Optional[tf.Tensor] = None
+        self.selected_actions: Optional[tf.Tensor] = None
+        self.output: Optional[tf.Tensor] = None
+        self.output_oh: Optional[tf.Tensor] = None
+        self.output_pre: Optional[tf.Tensor] = None

         self.value_vars = None
         self.q_vars = None
         self.critic_vars = None
         self.policy_vars = None

+        self.q1_heads: Optional[Dict[str, tf.Tensor]] = None
+        self.q2_heads: Optional[Dict[str, tf.Tensor]] = None
+        self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None
+        self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None

     def get_vars(self, scope):
         return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

Review thread (on the block of attribute declarations above):
- Not sure if there's a better way to group all of these.
- Ahh, this is ugly (one more reason to move to TF 2.0). I can take a stab at grouping by comment (e.g. divvy up policy vs. critic vs. output tensors), but they are all needed. I'm assuming we'll have a similar block in the PPO model?
- It's OK as it is. One way to DRY it up a bit would be something like the nested-class grouping sketched below, but that might not be a good abstraction (and probably not a good name). Surprisingly, it's not complaining about PPO right now.
- Hmm, the class nesting doesn't seem very obvious. I'd almost want a group that is model inputs and one that is model outputs, but that seems to require more code changes, so let's leave it for now.
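As an illustration of the kind of nested-class grouping discussed in the thread above (a hedged sketch only; the class and attribute names here are hypothetical and not from the PR):

from typing import Optional

import tensorflow as tf


class SACNetworkSketch:
    """Illustrative stand-in for the SAC model class in the diff above."""

    class MemoryTensors:
        """Hypothetical nested grouping for one sub-network's recurrent tensors."""

        def __init__(self) -> None:
            self.memory_in: Optional[tf.Tensor] = None
            self.memory_out: Optional[tf.Tensor] = None

    def __init__(self) -> None:
        # One small group per sub-network instead of a flat list of attributes.
        self.policy_memory = self.MemoryTensors()  # replaces policy_memory_in/out
        self.value_memory = self.MemoryTensors()   # replaces value_memory_in/out
        self.q1_memory = self.MemoryTensors()      # replaces q1_memory_in/out
        self.q2_memory = self.MemoryTensors()      # replaces q2_memory_in/out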
@@ -350,7 +382,6 @@ def create_sac_value_head(
         :param h_size: size of hidden layers for value network
         :param scope: TF scope for value network.
         """
-        self.value_heads = {}
         with tf.variable_scope(scope):
             value_hidden = self.create_vector_observation_encoder(
                 hidden_input, h_size, self.activ_fn, num_layers, "encoder", False

@@ -676,6 +707,12 @@ def __init__(
         if num_layers < 1:
             num_layers = 1

+        self.target_init_op: List[tf.Tensor] = []
+        self.target_update_op: List[tf.Tensor] = []
+        self.update_batch_policy: Optional[tf.Operation] = None
+        self.update_batch_value: Optional[tf.Operation] = None
+        self.update_batch_entropy: Optional[tf.Operation] = None

         self.policy_network = SACPolicyNetwork(
             brain=brain,
             m_size=m_size,
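For context on why these blocks of "= None" declarations were added (an assumption on my part, consistent with the pylint theme of this PR): pylint's attribute-defined-outside-init check (W0201) warns when an instance attribute is first assigned outside __init__, and pre-declaring placeholders in __init__ is a common way to satisfy it. A minimal illustration with made-up names:

class ModelSketch:
    def __init__(self) -> None:
        # Declared here so pylint's attribute-defined-outside-init (W0201)
        # check does not fire when the real value is assigned later.
        self.output = None

    def build(self) -> None:
        self.output = 42  # the real assignment happens during graph construction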
@@ -188,7 +188,12 @@ def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]:
     @timed
     def update(
-        self, mini_batch: Dict[str, Any], num_sequences: int, update_target: bool = True
+        # pylint: disable=arguments-differ
+        # TODO ervteng FIX ME
+        self,
+        mini_batch: Dict[str, Any],
+        num_sequences: int,
+        update_target: bool = True,
     ) -> Dict[str, float]:
         """
         Updates model using buffer.

Review thread (on the pylint disable and TODO above):
- @ervteng Can you take a look at this?
- Hmm, this is a subtlety of the SAC algorithm. There are two value networks, a target and a current one. The target is used to bootstrap the current one, and is updated periodically towards the weights of the current. Most implementations of SAC let you vary when this update happens. Initially, I had an additional hyperparameter that did just that, but removed it because realistically you can get away with just adjusting tau (and fewer hyperparameters are better for most users). I think it's safe to remove it for now. A researcher might want to vary this, but then again a researcher can edit the Python code. Perhaps we can add a comment; I'll add it below.
- Line 213:
- Sounds good, I'll apply that. If you want to keep the flexibility, you could do something like move all the current
- 👍 I'll probably do something like this when we rehaul the Policy and Trainer APIs, but will hold off for this particular version of the code.
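For readers new to the target-network detail discussed in the thread above: in many SAC implementations the target network's weights are moved a fraction tau toward the current network's weights each time the update runs, which is what makes tau alone a reasonable knob. The snippet below is a generic, framework-free sketch with illustrative names, not the ML-Agents implementation:

import numpy as np

def soft_update(target_weights, current_weights, tau):
    # new_target = tau * current + (1 - tau) * old_target, element-wise
    return [
        tau * current + (1.0 - tau) * target
        for current, target in zip(current_weights, target_weights)
    ]

# With a small tau (e.g. 0.005) the target network tracks the current one slowly.
target = [np.zeros(3)]
current = [np.ones(3)]
target = soft_update(target, current, tau=0.005)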
Review comment: This was missed in the memory cleanup.