From ffba52f64a029f2d765441110b9287418aab0dee Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 14:50:51 -0700 Subject: [PATCH 1/6] more pylint cleanup --- .pylintrc | 2 +- .../mlagents/envs/base_unity_environment.py | 2 +- ml-agents-envs/mlagents/envs/env_manager.py | 7 ++++-- ml-agents/mlagents/trainers/bc/trainer.py | 12 +++++----- .../components/reward_signals/gail/signal.py | 22 +++++++++---------- ml-agents/mlagents/trainers/ppo/trainer.py | 8 +++---- ml-agents/mlagents/trainers/rl_trainer.py | 4 ++-- ml-agents/mlagents/trainers/sac/trainer.py | 10 ++++----- ml-agents/mlagents/trainers/trainer.py | 4 +--- 9 files changed, 35 insertions(+), 36 deletions(-) diff --git a/.pylintrc b/.pylintrc index 3c82fafc31..4e4646f722 100644 --- a/.pylintrc +++ b/.pylintrc @@ -41,7 +41,7 @@ disable = # We should fix these up ASAP # W0221: Parameters differ from overridden - W0221, + #W0221, # E0401: Unable to import... # E0611: No name '...' in module '...' diff --git a/ml-agents-envs/mlagents/envs/base_unity_environment.py b/ml-agents-envs/mlagents/envs/base_unity_environment.py index b588f31cb9..72e90cd042 100644 --- a/ml-agents-envs/mlagents/envs/base_unity_environment.py +++ b/ml-agents-envs/mlagents/envs/base_unity_environment.py @@ -9,9 +9,9 @@ class BaseUnityEnvironment(ABC): def step( self, vector_action: Optional[Dict] = None, - memory: Optional[Dict] = None, text_action: Optional[Dict] = None, value: Optional[Dict] = None, + custom_action: Dict[str, Any] = None, ) -> AllBrainInfo: pass diff --git a/ml-agents-envs/mlagents/envs/env_manager.py b/ml-agents-envs/mlagents/envs/env_manager.py index 8f3499104e..48d4aab92b 100644 --- a/ml-agents-envs/mlagents/envs/env_manager.py +++ b/ml-agents-envs/mlagents/envs/env_manager.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import List, Dict, NamedTuple, Optional +from typing import Any, List, Dict, NamedTuple, Optional from mlagents.envs.brain import AllBrainInfo, BrainParameters from mlagents.envs.policy import Policy from mlagents.envs.action_info import ActionInfo @@ -24,7 +24,10 @@ def step(self) -> List[EnvironmentStep]: @abstractmethod def reset( - self, config: Dict = None, train_mode: bool = True + self, + config: Dict = None, + train_mode: bool = True, + custom_reset_parameters: Any = None, ) -> List[EnvironmentStep]: pass diff --git a/ml-agents/mlagents/trainers/bc/trainer.py b/ml-agents/mlagents/trainers/bc/trainer.py index 4bce079748..cf602fdbde 100644 --- a/ml-agents/mlagents/trainers/bc/trainer.py +++ b/ml-agents/mlagents/trainers/bc/trainer.py @@ -45,20 +45,20 @@ def __init__(self, brain, trainer_parameters, training, load, seed, run_id): def add_experiences( self, - curr_info: AllBrainInfo, - next_info: AllBrainInfo, + curr_all_info: AllBrainInfo, + next_all_info: AllBrainInfo, take_action_outputs: ActionInfoOutputs, ) -> None: """ Adds experiences to each agent's experience history. - :param curr_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). - :param next_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). + :param curr_all_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). + :param next_all_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo). :param take_action_outputs: The outputs of the take action method. """ # Used to collect information about student performance. 
- info_student = curr_info[self.brain_name] - next_info_student = next_info[self.brain_name] + info_student = curr_all_info[self.brain_name] + next_info_student = next_all_info[self.brain_name] for agent_id in info_student.agents: self.evaluation_buffer[agent_id].last_brain_info = info_student diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py index 5539805be5..14299a3d0f 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py @@ -126,7 +126,7 @@ def check_config( def prepare_update( self, policy_model: LearningModel, - mini_batch_policy: Dict[str, np.ndarray], + mini_batch: Dict[str, np.ndarray], num_sequences: int, ) -> Dict[tf.Tensor, Any]: """ @@ -136,21 +136,21 @@ def prepare_update( :return: Feed_dict for update process. """ max_num_experiences = min( - len(mini_batch_policy["actions"]), + len(mini_batch["actions"]), len(self.demonstration_buffer.update_buffer["actions"]), ) # If num_sequences is less, we need to shorten the input batch. - for key, element in mini_batch_policy.items(): - mini_batch_policy[key] = element[:max_num_experiences] + for key, element in mini_batch.items(): + mini_batch[key] = element[:max_num_experiences] # Get batch from demo buffer mini_batch_demo = self.demonstration_buffer.update_buffer.sample_mini_batch( - len(mini_batch_policy["actions"]), 1 + len(mini_batch["actions"]), 1 ) feed_dict: Dict[tf.Tensor, Any] = { self.model.done_expert_holder: mini_batch_demo["done"], - self.model.done_policy_holder: mini_batch_policy["done"], + self.model.done_policy_holder: mini_batch["done"], } if self.model.use_vail: @@ -158,20 +158,18 @@ def prepare_update( feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"]) if self.policy.use_continuous_act: - feed_dict[policy_model.selected_actions] = mini_batch_policy["actions"] + feed_dict[policy_model.selected_actions] = mini_batch["actions"] else: - feed_dict[policy_model.action_holder] = mini_batch_policy["actions"] + feed_dict[policy_model.action_holder] = mini_batch["actions"] if self.policy.use_vis_obs > 0: for i in range(len(policy_model.visual_in)): - feed_dict[policy_model.visual_in[i]] = mini_batch_policy[ - "visual_obs%d" % i - ] + feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i] feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[ "visual_obs%d" % i ] if self.policy.use_vec_obs: - feed_dict[policy_model.vector_in] = mini_batch_policy["vector_obs"] + feed_dict[policy_model.vector_in] = mini_batch["vector_obs"] feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"] self.has_updated = True return feed_dict diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 19d6a17eff..017072413a 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -79,15 +79,15 @@ def __init__( self.collected_rewards[_reward_signal] = {} def process_experiences( - self, current_info: AllBrainInfo, new_info: AllBrainInfo + self, current_info: AllBrainInfo, next_info: AllBrainInfo ) -> None: """ Checks agent histories for processing condition, and processes them as necessary. Processing involves calculating value and advantage targets for model updating step. :param current_info: Dictionary of all current brains and corresponding BrainInfo. 
- :param new_info: Dictionary of all next brains and corresponding BrainInfo. + :param next_info: Dictionary of all next brains and corresponding BrainInfo. """ - info = new_info[self.brain_name] + info = next_info[self.brain_name] if self.is_training: self.policy.update_normalization(info.vector_observations) for l in range(len(info.agents)): @@ -228,7 +228,7 @@ def update_policy(self): number_experiences=buffer_length, mean_return=float(np.mean(self.cumulative_returns_since_policy_update)), ) - self.cumulative_returns_since_policy_update = [] + self.cumulative_returns_since_policy_update.clear() # Make sure batch_size is a multiple of sequence length. During training, we # will need to reshape the data into a batch_size x sequence_length tensor. diff --git a/ml-agents/mlagents/trainers/rl_trainer.py b/ml-agents/mlagents/trainers/rl_trainer.py index 9cef6fb877..870cf3f9ff 100644 --- a/ml-agents/mlagents/trainers/rl_trainer.py +++ b/ml-agents/mlagents/trainers/rl_trainer.py @@ -248,7 +248,7 @@ def add_policy_outputs( :param agent_idx: the index of the Agent agent_id """ raise UnityTrainerException( - "The process_experiences method was not implemented." + "The add_policy_outputs method was not implemented." ) def add_rewards_outputs( @@ -270,5 +270,5 @@ def add_rewards_outputs( :param agent_next_idx: the index of the Agent agent_id in the next brain info """ raise UnityTrainerException( - "The process_experiences method was not implemented." + "The add_rewards_outputs method was not implemented." ) diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index c82a47e521..df3b8b2c9b 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -5,7 +5,7 @@ import logging from collections import defaultdict -from typing import List, Dict +from typing import Dict import os import numpy as np @@ -159,14 +159,14 @@ def add_rewards_outputs( ) def process_experiences( - self, current_info: AllBrainInfo, new_info: AllBrainInfo + self, current_info: AllBrainInfo, next_info: AllBrainInfo ) -> None: """ Checks agent histories for processing condition, and processes them as necessary. :param current_info: Dictionary of all current brains and corresponding BrainInfo. - :param new_info: Dictionary of all next brains and corresponding BrainInfo. + :param next_info: Dictionary of all next brains and corresponding BrainInfo. """ - info = new_info[self.brain_name] + info = next_info[self.brain_name] if self.is_training: self.policy.update_normalization(info.vector_observations) for l in range(len(info.agents)): @@ -254,7 +254,7 @@ def update_sac_policy(self) -> None: is greater than 1 and the reward signals are not updated in parallel. """ - self.cumulative_returns_since_policy_update: List[float] = [] + self.cumulative_returns_since_policy_update.clear() n_sequences = max( int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1 ) diff --git a/ml-agents/mlagents/trainers/trainer.py b/ml-agents/mlagents/trainers/trainer.py index cc239c8668..796085368e 100644 --- a/ml-agents/mlagents/trainers/trainer.py +++ b/ml-agents/mlagents/trainers/trainer.py @@ -246,9 +246,7 @@ def add_experiences( :param next_all_info: Dictionary of all current brains and corresponding BrainInfo. :param take_action_outputs: The outputs of the Policy's get_action method. """ - raise UnityTrainerException( - "The process_experiences method was not implemented." 
- ) + raise UnityTrainerException("The add_experiences method was not implemented.") def process_experiences( self, current_info: AllBrainInfo, next_info: AllBrainInfo From 6b4bc1fc2134167a331ebc954e8ea3b9bb8cef6f Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 15:32:20 -0700 Subject: [PATCH 2/6] define some tf objects in inits --- .pylintrc | 6 +-- .../components/reward_signals/gail/model.py | 6 ++- ml-agents/mlagents/trainers/models.py | 9 +++- ml-agents/mlagents/trainers/sac/models.py | 41 ++++++++++++++++++- 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/.pylintrc b/.pylintrc index 4e4646f722..8274f6e261 100644 --- a/.pylintrc +++ b/.pylintrc @@ -37,11 +37,7 @@ disable = W0703, # W0201: Attribute '...' defined outside __init__ - W0201, - - # We should fix these up ASAP - # W0221: Parameters differ from overridden - #W0221, + #W0201, # E0401: Unable to import... # E0611: No name '...' in module '...' diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py index 7d106ddd97..0a8e3b711b 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py @@ -1,4 +1,4 @@ -from typing import Tuple, List +from typing import List, Optional, Tuple import tensorflow as tf from mlagents.trainers.models import LearningModel @@ -37,6 +37,10 @@ def __init__( self.gradient_penalty_weight = gradient_penalty_weight self.use_vail = use_vail self.use_actions = use_actions # True # Not using actions + + self.noise: Optional[tf.Tensor] = None + self.z: Optional[tf.Tensor] = None + self.make_inputs() self.create_network() self.create_loss(learning_rate) diff --git a/ml-agents/mlagents/trainers/models.py b/ml-agents/mlagents/trainers/models.py index 3896fbb567..70d779bb10 100644 --- a/ml-agents/mlagents/trainers/models.py +++ b/ml-agents/mlagents/trainers/models.py @@ -1,6 +1,6 @@ import logging from enum import Enum -from typing import Callable, List +from typing import Callable, Dict, List, Optional import numpy as np import tensorflow as tf @@ -85,6 +85,12 @@ def __init__( trainable=False, dtype=tf.int32, ) + self.value_heads: Dict[str, tf.Tensor] = {} + self.normalization_steps: Optional[tf.Variable] = None + self.running_mean: Optional[tf.Variable] = None + self.running_variance: Optional[tf.Variable] = None + self.update_normalization: Optional[tf.Operation] = None + self.value: Optional[tf.Tensor] = None @staticmethod def create_global_steps(): @@ -573,7 +579,6 @@ def create_value_heads(self, stream_names, hidden_input): :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top of the hidden input. 
""" - self.value_heads = {} for name in stream_names: value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name)) self.value_heads[name] = value diff --git a/ml-agents/mlagents/trainers/sac/models.py b/ml-agents/mlagents/trainers/sac/models.py index 390eb04ed9..1dd54e25de 100644 --- a/ml-agents/mlagents/trainers/sac/models.py +++ b/ml-agents/mlagents/trainers/sac/models.py @@ -1,6 +1,6 @@ import logging import numpy as np -from typing import Optional +from typing import Dict, List, Optional import tensorflow as tf from mlagents.trainers.models import LearningModel, LearningRateSchedule, EncoderType @@ -46,9 +46,41 @@ def __init__( self.activ_fn = self.swish self.policy_memory_in: Optional[tf.Tensor] = None + self.policy_memory_out: Optional[tf.Tensor] = None self.value_memory_in: Optional[tf.Tensor] = None + self.value_memory_out: Optional[tf.Tensor] = None + self.q1: Optional[tf.Tensor] = None + self.q2: Optional[tf.Tensor] = None + self.q1_p: Optional[tf.Tensor] = None + self.q2_p: Optional[tf.Tensor] = None self.q1_memory_in: Optional[tf.Tensor] = None self.q2_memory_in: Optional[tf.Tensor] = None + self.q1_memory_out: Optional[tf.Tensor] = None + self.q2_memory_out: Optional[tf.Tensor] = None + self.action_holder: Optional[tf.Tensor] = None + self.prev_action: Optional[tf.Tensor] = None + self.action_masks: Optional[tf.Tensor] = None + self.external_action_in: Optional[tf.Tensor] = None + self.log_sigma_sq: Optional[tf.Tensor] = None + self.entropy: Optional[tf.Tensor] = None + self.deterministic_output: Optional[tf.Tensor] = None + self.all_log_probs: Optional[tf.Tensor] = None + self.normalized_logprobs: Optional[tf.Tensor] = None + self.action_probs: Optional[tf.Tensor] = None + self.selected_actions: Optional[tf.Tensor] = None + self.output: Optional[tf.Tensor] = None + self.output_oh: Optional[tf.Tensor] = None + self.output_pre: Optional[tf.Tensor] = None + + self.value_vars = None + self.q_vars = None + self.critic_vars = None + self.policy_vars = None + + self.q1_heads: Optional[Dict[str, tf.Tensor]] = None + self.q2_heads: Optional[Dict[str, tf.Tensor]] = None + self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None + self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None def get_vars(self, scope): return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) @@ -350,7 +382,6 @@ def create_sac_value_head( :param h_size: size of hidden layers for value network :param scope: TF scope for value network. 
""" - self.value_heads = {} with tf.variable_scope(scope): value_hidden = self.create_vector_observation_encoder( hidden_input, h_size, self.activ_fn, num_layers, "encoder", False @@ -676,6 +707,12 @@ def __init__( if num_layers < 1: num_layers = 1 + self.target_init_op: List[tf.Tensor] = [] + self.target_update_op: List[tf.Tensor] = [] + self.update_batch_policy: Optional[tf.Operation] = None + self.update_batch_value: Optional[tf.Operation] = None + self.update_batch_entropy: Optional[tf.Operation] = None + self.policy_network = SACPolicyNetwork( brain=brain, m_size=m_size, From 40a0917661b3ec3c1dc2255cd8ca8158f1b302a4 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 15:42:38 -0700 Subject: [PATCH 3/6] more fixup --- ml-agents/mlagents/trainers/ppo/models.py | 9 ++++++++- ml-agents/mlagents/trainers/sac/policy.py | 7 ++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/models.py b/ml-agents/mlagents/trainers/ppo/models.py index deac591129..06cf0dadc5 100644 --- a/ml-agents/mlagents/trainers/ppo/models.py +++ b/ml-agents/mlagents/trainers/ppo/models.py @@ -1,7 +1,9 @@ import logging -import numpy as np +from typing import Optional +import numpy as np import tensorflow as tf + from mlagents.trainers.models import LearningModel, EncoderType, LearningRateSchedule logger = logging.getLogger("mlagents.trainers") @@ -46,6 +48,11 @@ def __init__( LearningModel.__init__( self, m_size, normalize, use_recurrent, brain, seed, stream_names ) + + self.optimizer: Optional[tf.train.AdamOptimizer] = None + self.grads = None + self.update_batch: Optional[tf.Operation] = None + if num_layers < 1: num_layers = 1 if brain.vector_action_space_type == "continuous": diff --git a/ml-agents/mlagents/trainers/sac/policy.py b/ml-agents/mlagents/trainers/sac/policy.py index ef13184f7e..02ba111947 100644 --- a/ml-agents/mlagents/trainers/sac/policy.py +++ b/ml-agents/mlagents/trainers/sac/policy.py @@ -188,7 +188,12 @@ def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]: @timed def update( - self, mini_batch: Dict[str, Any], num_sequences: int, update_target: bool = True + # pylint: disable=arguments-differ + # TODO ervteng FIX ME + self, + mini_batch: Dict[str, Any], + num_sequences: int, + update_target: bool = True, ) -> Dict[str, float]: """ Updates model using buffer. 
From b692198464421226c6548257e09cc1cda410daa6 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 16:02:24 -0700 Subject: [PATCH 4/6] pylint settings (fails without require_serial) --- .pre-commit-config.yaml | 1 - .pylintrc | 10 ++++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a1861765c..f083483106 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -66,7 +66,6 @@ repos: .*_pb2_grpc.py| .*/tests/.* )$ - require_serial: true # "Local" hooks, see https://pre-commit.com/#repository-local-hooks - repo: local diff --git a/.pylintrc b/.pylintrc index 8274f6e261..11d1f0b75c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -12,9 +12,10 @@ disable = # disabled because black handles this C0301,C0330, - # C0115: Missing class docstring # C0114: Missing module docstring - C0115,C0114, + # C0115: Missing class docstring + # C0116: Missing function or method docstring + C0114,C0115,C0116, # All convention and refactor for now C,R, @@ -30,15 +31,12 @@ disable = # W0107: Unnecessary pass statement W0107, - # W0511 TODO + # W0511 "TODO" W0511, # W0703: Catching too general exception Exception W0703, - # W0201: Attribute '...' defined outside __init__ - #W0201, - # E0401: Unable to import... # E0611: No name '...' in module '...' # need to look into these, probably namespace packages From f95f92a30e0bb88bf5dd726191a69a562567dc26 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Sat, 2 Nov 2019 16:12:19 -0700 Subject: [PATCH 5/6] re-enable require_serial --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f083483106..2a1861765c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -66,6 +66,7 @@ repos: .*_pb2_grpc.py| .*/tests/.* )$ + require_serial: true # "Local" hooks, see https://pre-commit.com/#repository-local-hooks - repo: local From dffe80ec5b28faf156bf00720c8942df5fc2b04b Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Mon, 4 Nov 2019 09:20:13 -0800 Subject: [PATCH 6/6] make update() match parent --- ml-agents/mlagents/trainers/sac/policy.py | 11 +++-------- ml-agents/mlagents/trainers/sac/trainer.py | 4 +--- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/policy.py b/ml-agents/mlagents/trainers/sac/policy.py index 02ba111947..fa6eb10607 100644 --- a/ml-agents/mlagents/trainers/sac/policy.py +++ b/ml-agents/mlagents/trainers/sac/policy.py @@ -188,12 +188,7 @@ def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]: @timed def update( - # pylint: disable=arguments-differ - # TODO ervteng FIX ME - self, - mini_batch: Dict[str, Any], - num_sequences: int, - update_target: bool = True, + self, mini_batch: Dict[str, Any], num_sequences: int ) -> Dict[str, float]: """ Updates model using buffer. @@ -210,8 +205,8 @@ def update( update_vals = self._execute_model(feed_dict, self.update_dict) for stat_name, update_name in stats_needed.items(): update_stats[stat_name] = update_vals[update_name] - if update_target: - self.sess.run(self.model.target_update_op) + # Update target network. By default, target update happens at every policy update. 
+        self.sess.run(self.model.target_update_op)
         return update_stats
 
     def update_reward_signals(
diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py
index df3b8b2c9b..6810e5dd0d 100644
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
@@ -278,9 +278,7 @@ def update_sac_policy(self) -> None:
                 "{}_rewards".format(name)
             ] = signal.evaluate_batch(sampled_minibatch).scaled_reward
 
-            update_stats = self.policy.update(
-                sampled_minibatch, n_sequences, update_target=True
-            )
+            update_stats = self.policy.update(sampled_minibatch, n_sequences)
             for stat_name, value in update_stats.items():
                 batch_update_stats[stat_name].append(value)
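
Note for readers skimming the series: the recurring pattern in these patches is (1) keeping an override's signature identical to the parent's so pylint's W0221 (arguments-differ) check can stay enabled, and (2) declaring attributes in __init__ so W0201 (attribute-defined-outside-init) can be checked. Below is a minimal, hypothetical sketch of that pattern; the class and attribute names are illustrative only and are not taken from the ml-agents codebase.

from abc import ABC, abstractmethod
from typing import Any, Dict, Optional


class BasePolicy(ABC):
    # Hypothetical stand-in for the policy/trainer base classes touched above.
    @abstractmethod
    def update(self, mini_batch: Dict[str, Any], num_sequences: int) -> Dict[str, float]:
        raise NotImplementedError


class ExamplePolicy(BasePolicy):
    def __init__(self) -> None:
        # Declare attributes up front so W0201 (attribute-defined-outside-init)
        # does not fire when they are assigned later.
        self.last_loss: Optional[float] = None

    # The override keeps the parent's exact parameter list, so W0221
    # (arguments-differ) passes; extra behaviour such as the old
    # update_target flag becomes internal instead of a parameter.
    def update(self, mini_batch: Dict[str, Any], num_sequences: int) -> Dict[str, float]:
        self.last_loss = float(len(mini_batch)) / max(num_sequences, 1)
        return {"Losses/Example Loss": self.last_loss}

Keeping the override's signature identical to the parent's is what allows the call site in sac/trainer.py above to drop the update_target=True argument while still working against the generic policy interface.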