From 377a25c455acbb9a0e5af31cccf9df3916d06c9f Mon Sep 17 00:00:00 2001
From: Vincent-Pierre BERGES
Date: Mon, 30 Mar 2020 14:18:31 -0700
Subject: [PATCH] Hotfixes for Release 0.15.1 (#3698)

* [bug-fix] Increase height of wall in CrawlerStatic (#3650)
* [bug-fix] Improve performance for PPO with continuous actions (#3662)
* Corrected a typo in a name of a function (#3670)
OnEpsiodeBegin was corrected to OnEpisodeBegin in Migrating.md document
* Add Academy.AutomaticSteppingEnabled to migration (#3666)
* Fix editor port in Dockerfile (#3674)
* Hotfix memory leak on Python (#3664)
* Hotfix memory leak on Python
* Fixing
* Fixing a bug in the heuristic policy. A decision should not be requested when the agent is done
* [bug-fix] Make Python able to deal with 0-step episodes (#3671)
* adding some comments
Co-authored-by: Ervin T
* Remove vis_encode_type from list of required (#3677)
* Update changelog (#3678)
* Shorten timeout duration for environment close (#3679)
The timeout duration for closing an environment was set to the same duration as the timeout when waiting for a response from the still-running environment. This led to long waits for the error response when the communication version wasn't matching. This change forces a timeout duration of 0 when handling errors.
* Bumping the versions
* handle multiple dones in a single step (#3700)
* handle multiple dones in a single step
* [tests] Make end-to-end tests more stable (#3697)
* [bug-fix] Fix entropy computation for GaussianDistribution (#3684)
* Fix how we set logging levels (#3703)
* cleanup logging
* comments and cleanup
* pylint, gym
* [skip-ci] Update changelog for logging fix. (#3707)
* [skip ci] Update README
* [skip ci] Fixed a typo

Co-authored-by: Ervin T
Co-authored-by: Adam Streck
Co-authored-by: Chris Elion
Co-authored-by: Jonathan Harper
---
 .pylintrc                                     |  2 +
 Dockerfile                                    |  6 ++-
 .../Crawler/Prefabs/FixedPlatform.prefab      | 13 +++--
 README.md                                     |  3 +-
 com.unity.ml-agents/CHANGELOG.md              | 11 +++++
 com.unity.ml-agents/Runtime/Academy.cs        |  2 +-
 com.unity.ml-agents/Runtime/Agent.cs          |  3 +-
 .../Runtime/Communicator/RpcCommunicator.cs   | 17 +++++--
 .../Runtime/Policies/HeuristicPolicy.cs       |  5 +-
 com.unity.ml-agents/package.json              |  2 +-
 docs/Migrating.md                             |  4 +-
 gym-unity/gym_unity/__init__.py               |  2 +-
 gym-unity/gym_unity/envs/__init__.py          | 32 ++++++++-----
 gym-unity/gym_unity/tests/test_gym.py         | 48 +++++++++++++++++++
 ml-agents-envs/mlagents_envs/__init__.py      |  2 +-
 ml-agents-envs/mlagents_envs/environment.py   | 23 ++++++---
 ml-agents-envs/mlagents_envs/logging_util.py  | 46 ++++++++++++++++++
 .../side_channel/outgoing_message.py          |  4 +-
 .../side_channel/side_channel.py              |  4 +-
 ml-agents/mlagents/logging_util.py            | 10 ----
 ml-agents/mlagents/model_serialization.py     |  5 +-
 ml-agents/mlagents/trainers/__init__.py       |  2 +-
 .../mlagents/trainers/agent_processor.py      | 31 +++++++-----
 .../components/reward_signals/__init__.py     |  5 +-
 ml-agents/mlagents/trainers/curriculum.py     |  4 +-
 ml-agents/mlagents/trainers/distributions.py  | 38 ++++++++++-----
 ml-agents/mlagents/trainers/env_manager.py    |  5 +-
 ml-agents/mlagents/trainers/ghost/trainer.py  |  5 +-
 ml-agents/mlagents/trainers/learn.py          | 19 ++++----
 .../mlagents/trainers/meta_curriculum.py      |  4 +-
 .../mlagents/trainers/policy/nn_policy.py     |  1 +
 .../mlagents/trainers/policy/tf_policy.py     | 23 +++------
 ml-agents/mlagents/trainers/ppo/trainer.py    |  4 +-
 ml-agents/mlagents/trainers/sac/optimizer.py  |  5 +-
 ml-agents/mlagents/trainers/sac/trainer.py    |  6 +--
 ml-agents/mlagents/trainers/stats.py          |  7 ++-
 .../trainers/subprocess_env_manager.py        |  5 +-
 .../trainers/tests/simple_test_envs.py        |  2 +-
 .../trainers/tests/test_agent_processor.py    |  9 ++++
 .../trainers/tests/test_distributions.py      | 10 +++-
 .../mlagents/trainers/tests/test_simple_rl.py | 23 ++++-----
 .../mlagents/trainers/trainer/trainer.py      |  5 +-
 .../mlagents/trainers/trainer_controller.py   |  4 +-
 ml-agents/mlagents/trainers/trainer_util.py   |  5 +-
 setup.cfg                                     |  1 +
 45 files changed, 320 insertions(+), 147 deletions(-)
 create mode 100644 ml-agents-envs/mlagents_envs/logging_util.py
 delete mode 100644 ml-agents/mlagents/logging_util.py

diff --git a/.pylintrc b/.pylintrc
index 40f89e8708..1da1dad0d4 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -44,3 +44,5 @@ disable =
 
         # Appears to be https://github.com/PyCQA/pylint/issues/2981
         W0201,
+        # Using the global statement
+        W0603,
diff --git a/Dockerfile b/Dockerfile
index 9262f6db3a..ee3fba449a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -132,7 +132,9 @@ COPY ml-agents /ml-agents
 WORKDIR /ml-agents
 RUN pip install -e .
 
-# port 5005 is the port used in in Editor training.
-EXPOSE 5005
+# Port 5004 is the port used in Editor training.
+# Environments will start from port 5005,
+# so allow enough ports for several environments.
+EXPOSE 5004-5050
 
 ENTRYPOINT ["mlagents-learn"]
diff --git a/Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab b/Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab
index 8d7c55b6e1..8e6da3ace8 100644
--- a/Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab
+++ b/Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab
@@ -1690,8 +1690,8 @@ MonoBehaviour:
   m_InferenceDevice: 0
   m_BehaviorType: 0
   m_BehaviorName: CrawlerStatic
-  m_TeamID: 0
-  m_useChildSensors: 1
+  TeamId: 0
+  m_UseChildSensors: 1
 --- !u!114 &114230237520033992
 MonoBehaviour:
   m_ObjectHideFlags: 0
@@ -1704,6 +1704,9 @@ MonoBehaviour:
   m_Script: {fileID: 11500000, guid: 2f37c30a5e8d04117947188818902ef3, type: 3}
   m_Name:
   m_EditorClassIdentifier:
+  agentParameters:
+    maxStep: 0
+  hasUpgradedFromAgentParameters: 1
   maxStep: 5000
   target: {fileID: 4749909135913778}
   ground: {fileID: 4856650706546504}
@@ -1759,7 +1762,7 @@ MonoBehaviour:
   m_Name:
   m_EditorClassIdentifier:
   DecisionPeriod: 5
-  RepeatAction: 0
+  TakeActionsBetweenDecisions: 0
   offsetStep: 0
 --- !u!1 &1492926997393242
 GameObject:
@@ -2959,8 +2962,8 @@ Transform:
   m_PrefabAsset: {fileID: 0}
   m_GameObject: {fileID: 1995322274649904}
   m_LocalRotation: {x: 0, y: -0, z: -0, w: 1}
-  m_LocalPosition: {x: -0, y: 0.5, z: 0}
-  m_LocalScale: {x: 0.01, y: 0.01, z: 0.01}
+  m_LocalPosition: {x: -0, y: 1.5, z: 0}
+  m_LocalScale: {x: 0.01, y: 0.03, z: 0.01}
   m_Children: []
   m_Father: {fileID: 4924174722017668}
   m_RootOrder: 1
diff --git a/README.md b/README.md
index 85c355d9b7..11aa0511d4 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ developer communities.
 * Train using concurrent Unity environment instances
 
 ## Releases & Documentation
-**Our latest, stable release is 0.15.0. Click
+**Our latest, stable release is 0.15.1. Click
 [here](docs/Readme.md)
 to get started with the latest release of ML-Agents.**
 
@@ -61,6 +61,7 @@ details of the changes between versions.
 
 | **Version** | **Release Date** | **Source** | **Documentation** | **Download** |
 |:-------:|:------:|:-------------:|:-------:|:------------:|
+| **0.15.0** | March 18, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.15.0.zip) |
 | **0.14.1** | February 26, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.14.1) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.14.1/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.14.1.zip) |
 | **0.14.0** | February 13, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.14.0) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.14.0/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.14.0.zip) |
 | **0.13.1** | January 21, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.13.1) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.13.1/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.13.1.zip) |
diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index 61c530b825..b25d43a182 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -5,6 +5,17 @@
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [0.15.1-preview] - 2020-03-30
+### Bug Fixes
+ - Raise the wall in CrawlerStatic scene to prevent Agent from falling off. (#3650)
+ - Fixed an issue where specifying `vis_encode_type` was required only for SAC. (#3677)
+ - Fixed the reported entropy values for continuous actions (#3684)
+ - Fixed an issue where switching models using `SetModel()` during training would use an excessive amount of memory. (#3664)
+ - Environment subprocesses now close immediately on timeout or wrong API version. (#3679)
+ - Fixed an issue in the gym wrapper that would raise an exception if an Agent called EndEpisode multiple times in the same step. (#3700)
+ - Fixed an issue where logging output was not visible; logging levels are now set consistently (#3703).
+
+
 ## [0.15.0-preview] - 2020-03-18
 ### Major Changes
 - `Agent.CollectObservations` now takes a VectorSensor argument. (#3352, #3389)
diff --git a/com.unity.ml-agents/Runtime/Academy.cs b/com.unity.ml-agents/Runtime/Academy.cs
index 82d851e72b..7375e45eb9 100644
--- a/com.unity.ml-agents/Runtime/Academy.cs
+++ b/com.unity.ml-agents/Runtime/Academy.cs
@@ -64,7 +64,7 @@ public class Academy : IDisposable
         /// Unity package version of com.unity.ml-agents.
         /// This must match the version string in package.json and is checked in a unit test.
/// - internal const string k_PackageVersion = "0.15.0-preview"; + internal const string k_PackageVersion = "0.15.1-preview"; const int k_EditorTrainingPort = 5004; diff --git a/com.unity.ml-agents/Runtime/Agent.cs b/com.unity.ml-agents/Runtime/Agent.cs index 838cd2cbf2..4e703f65ec 100644 --- a/com.unity.ml-agents/Runtime/Agent.cs +++ b/com.unity.ml-agents/Runtime/Agent.cs @@ -315,6 +315,7 @@ protected virtual void OnDisable() void NotifyAgentDone(DoneReason doneReason) { + m_Info.episodeId = m_EpisodeId; m_Info.reward = m_Reward; m_Info.done = true; m_Info.maxStepReached = doneReason == DoneReason.MaxStepReached; @@ -376,7 +377,7 @@ public void SetModel( // If everything is the same, don't make any changes. return; } - + NotifyAgentDone(DoneReason.Disabled); m_PolicyFactory.model = model; m_PolicyFactory.inferenceDevice = inferenceDevice; m_PolicyFactory.behaviorName = behaviorName; diff --git a/com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs b/com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs index edbf1d9e64..15637b2582 100644 --- a/com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs +++ b/com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs @@ -458,13 +458,20 @@ UnityRLInitializationOutputProto GetTempUnityRlInitializationOutput() { if (m_CurrentUnityRlOutput.AgentInfos.ContainsKey(behaviorName)) { - if (output == null) + if (m_CurrentUnityRlOutput.AgentInfos[behaviorName].CalculateSize() > 0) { - output = new UnityRLInitializationOutputProto(); - } + // Only send the BrainParameters if there is a non empty list of + // AgentInfos ready to be sent. + // This is to ensure that The Python side will always have a first + // observation when receiving the BrainParameters + if (output == null) + { + output = new UnityRLInitializationOutputProto(); + } - var brainParameters = m_UnsentBrainKeys[behaviorName]; - output.BrainParameters.Add(brainParameters.ToProto(behaviorName, true)); + var brainParameters = m_UnsentBrainKeys[behaviorName]; + output.BrainParameters.Add(brainParameters.ToProto(behaviorName, true)); + } } } diff --git a/com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs b/com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs index 7806732a32..84c57ad73e 100644 --- a/com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs +++ b/com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs @@ -29,7 +29,10 @@ public HeuristicPolicy(Func heuristic) public void RequestDecision(AgentInfo info, List sensors) { StepSensors(sensors); - m_LastDecision = m_Heuristic.Invoke(); + if (!info.done) + { + m_LastDecision = m_Heuristic.Invoke(); + } } /// diff --git a/com.unity.ml-agents/package.json b/com.unity.ml-agents/package.json index 7e4b1d4e74..faf047b163 100755 --- a/com.unity.ml-agents/package.json +++ b/com.unity.ml-agents/package.json @@ -1,7 +1,7 @@ { "name": "com.unity.ml-agents", "displayName": "ML Agents", - "version": "0.15.0-preview", + "version": "0.15.1-preview", "unity": "2018.4", "description": "Add interactivity to your game with Machine Learning Agents trained using Deep Reinforcement Learning.", "dependencies": { diff --git a/docs/Migrating.md b/docs/Migrating.md index c9a99efbba..654fd3ba7c 100644 --- a/docs/Migrating.md +++ b/docs/Migrating.md @@ -34,6 +34,7 @@ The versions can be found in * The interface for SideChannels was changed: * In C#, `OnMessageReceived` now takes a `IncomingMessage` argument, and `QueueMessageToSend` takes an `OutgoingMessage` argument. 
   * In python, `on_message_received` now takes a `IncomingMessage` argument, and `queue_message_to_send` takes an `OutgoingMessage` argument.
+  * Automatic stepping for Academy is now controlled from the AutomaticSteppingEnabled property.
 
 ### Steps to Migrate
 * Add the `using MLAgents.Sensors;` in addition to `using MLAgents;` on top of your Agent's script.
@@ -45,11 +46,12 @@ The versions can be found in
 * We strongly recommend replacing the following methods with their new equivalent as they will be removed in a later release:
   * `InitializeAgent()` to `Initialize()`
   * `AgentAction()` to `OnActionReceived()`
-  * `AgentReset()` to `OnEpsiodeBegin()`
+  * `AgentReset()` to `OnEpisodeBegin()`
   * `Done()` to `EndEpisode()`
   * `GiveModel()` to `SetModel()`
 * Replace `IFloatProperties` variables with `FloatPropertiesChannel` variables.
 * If you implemented custom `SideChannels`, update the signatures of your methods, and add your data to the `OutgoingMessage` or read it from the `IncomingMessage`.
+* Replace calls to Academy.EnableAutomaticStepping()/DisableAutomaticStepping() with Academy.AutomaticSteppingEnabled = true/false.
 
 ## Migrating from 0.13 to 0.14
diff --git a/gym-unity/gym_unity/__init__.py b/gym-unity/gym_unity/__init__.py
index 9da2f8fcca..903e77ce1b 100644
--- a/gym-unity/gym_unity/__init__.py
+++ b/gym-unity/gym_unity/__init__.py
@@ -1 +1 @@
-__version__ = "0.15.0"
+__version__ = "0.15.1"
diff --git a/gym-unity/gym_unity/envs/__init__.py b/gym-unity/gym_unity/envs/__init__.py
index 0066c71664..5077758d96 100644
--- a/gym-unity/gym_unity/envs/__init__.py
+++ b/gym-unity/gym_unity/envs/__init__.py
@@ -1,4 +1,3 @@
-import logging
 import itertools
 import numpy as np
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -8,6 +7,7 @@
 from mlagents_envs.environment import UnityEnvironment
 from mlagents_envs.base_env import BatchedStepResult
+from mlagents_envs import logging_util
 
 
 class UnityGymException(error.Error):
@@ -18,9 +18,8 @@ class UnityGymException(error.Error):
     pass
 
 
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger("gym_unity")
-
+logger = logging_util.get_logger(__name__)
+logging_util.set_log_level(logging_util.INFO)
 
 GymSingleStepResult = Tuple[np.ndarray, float, bool, Dict]
 GymMultiStepResult = Tuple[List[np.ndarray], List[float], List[bool], Dict]
@@ -364,9 +363,8 @@ def _check_agents(self, n_agents: int) -> None:
 
     def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
         n_extra_agents = step_result.n_agents() - self._n_agents
-        if n_extra_agents < 0 or n_extra_agents > self._n_agents:
+        if n_extra_agents < 0:
             # In this case, some Agents did not request a decision when expected
-            # or too many requested a decision
             raise UnityGymException(
                 "The number of agents in the scene does not match the expected number."
             )
@@ -386,6 +384,10 @@ def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
         # only cares about the ordering.
         for index, agent_id in enumerate(step_result.agent_id):
             if not self._previous_step_result.contains_agent(agent_id):
+                if step_result.done[index]:
+                    # If the Agent is already done (e.g. it ended its episode twice in one step)
+                    # Don't try to register it here.
+                    continue
                 # Register this agent, and get the reward of the previous agent that
                 # was in its index, so that we can return it to the gym.
last_reward = self.agent_mapper.register_new_agent_id(agent_id) @@ -528,8 +530,12 @@ def mark_agent_done(self, agent_id: int, reward: float) -> None: """ Declare the agent done with the corresponding final reward. """ - gym_index = self._agent_id_to_gym_index.pop(agent_id) - self._done_agents_index_to_last_reward[gym_index] = reward + if agent_id in self._agent_id_to_gym_index: + gym_index = self._agent_id_to_gym_index.pop(agent_id) + self._done_agents_index_to_last_reward[gym_index] = reward + else: + # Agent was never registered in the first place (e.g. EndEpisode called multiple times) + pass def register_new_agent_id(self, agent_id: int) -> float: """ @@ -581,9 +587,13 @@ def set_initial_agents(self, agent_ids: List[int]) -> None: self._gym_id_order = list(agent_ids) def mark_agent_done(self, agent_id: int, reward: float) -> None: - gym_index = self._gym_id_order.index(agent_id) - self._done_agents_index_to_last_reward[gym_index] = reward - self._gym_id_order[gym_index] = -1 + try: + gym_index = self._gym_id_order.index(agent_id) + self._done_agents_index_to_last_reward[gym_index] = reward + self._gym_id_order[gym_index] = -1 + except ValueError: + # Agent was never registered in the first place (e.g. EndEpisode called multiple times) + pass def register_new_agent_id(self, agent_id: int) -> float: original_index = self._gym_id_order.index(-1) diff --git a/gym-unity/gym_unity/tests/test_gym.py b/gym-unity/gym_unity/tests/test_gym.py index 3ee6e67913..1e691397e0 100644 --- a/gym-unity/gym_unity/tests/test_gym.py +++ b/gym-unity/gym_unity/tests/test_gym.py @@ -129,6 +129,50 @@ def test_sanitize_action_one_agent_done(mock_env): assert expected_agent_id == agent_id +@mock.patch("gym_unity.envs.UnityEnvironment") +def test_sanitize_action_new_agent_done(mock_env): + mock_spec = create_mock_group_spec( + vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3] + ) + mock_step = create_mock_vector_step_result(num_agents=3) + mock_step.agent_id = np.array(range(5)) + setup_mock_unityenvironment(mock_env, mock_spec, mock_step) + env = UnityEnv(" ", use_visual=False, multiagent=True) + + received_step_result = create_mock_vector_step_result(num_agents=7) + received_step_result.agent_id = np.array(range(7)) + # agent #3 (id = 2) is Done + # so is the "new" agent (id = 5) + done = [False] * 7 + done[2] = True + done[5] = True + received_step_result.done = np.array(done) + sanitized_result = env._sanitize_info(received_step_result) + for expected_agent_id, agent_id in zip([0, 1, 6, 3, 4], sanitized_result.agent_id): + assert expected_agent_id == agent_id + + +@mock.patch("gym_unity.envs.UnityEnvironment") +def test_sanitize_action_single_agent_multiple_done(mock_env): + mock_spec = create_mock_group_spec( + vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3] + ) + mock_step = create_mock_vector_step_result(num_agents=1) + mock_step.agent_id = np.array(range(1)) + setup_mock_unityenvironment(mock_env, mock_spec, mock_step) + env = UnityEnv(" ", use_visual=False, multiagent=False) + + received_step_result = create_mock_vector_step_result(num_agents=3) + received_step_result.agent_id = np.array(range(3)) + # original agent (id = 0) is Done + # so is the "new" agent (id = 1) + done = [True, True, False] + received_step_result.done = np.array(done) + sanitized_result = env._sanitize_info(received_step_result) + for expected_agent_id, agent_id in zip([2], sanitized_result.agent_id): + assert expected_agent_id == agent_id + + # Helper methods @@ -200,6 +244,10 @@ def 
test_agent_id_index_mapper(mapper_cls): mapper.mark_agent_done(1001, 42.0) mapper.mark_agent_done(1004, 1337.0) + # Make sure we can handle an unknown agent id being marked done. + # This can happen when an agent ends an episode on the same step it starts. + mapper.mark_agent_done(9999, -1.0) + # Now add new agents, and get the rewards of the agent they replaced. old_reward1 = mapper.register_new_agent_id(2001) old_reward2 = mapper.register_new_agent_id(2002) diff --git a/ml-agents-envs/mlagents_envs/__init__.py b/ml-agents-envs/mlagents_envs/__init__.py index 9da2f8fcca..903e77ce1b 100644 --- a/ml-agents-envs/mlagents_envs/__init__.py +++ b/ml-agents-envs/mlagents_envs/__init__.py @@ -1 +1 @@ -__version__ = "0.15.0" +__version__ = "0.15.1" diff --git a/ml-agents-envs/mlagents_envs/environment.py b/ml-agents-envs/mlagents_envs/environment.py index 00f421cd1c..8cdb48d45b 100644 --- a/ml-agents-envs/mlagents_envs/environment.py +++ b/ml-agents-envs/mlagents_envs/environment.py @@ -1,13 +1,14 @@ import atexit import glob import uuid -import logging import numpy as np import os import subprocess from typing import Dict, List, Optional, Any import mlagents_envs + +from mlagents_envs.logging_util import get_logger from mlagents_envs.side_channel.side_channel import SideChannel, IncomingMessage from mlagents_envs.base_env import ( @@ -47,7 +48,7 @@ import struct -logger = logging.getLogger("mlagents_envs") +logger = get_logger(__name__) class UnityEnvironment(BaseEnv): @@ -142,12 +143,12 @@ def __init__( aca_output = self.send_academy_parameters(rl_init_parameters_in) aca_params = aca_output.rl_initialization_output except UnityTimeOutException: - self._close() + self._close(0) raise unity_communicator_version = aca_params.communication_version if unity_communicator_version != UnityEnvironment.API_VERSION: - self._close() + self._close(0) raise UnityEnvironmentException( f"The communication API version is not compatible between Unity and python. " f"Python API: {UnityEnvironment.API_VERSION}, Unity API: {unity_communicator_version}.\n " @@ -228,7 +229,7 @@ def validate_environment_path(env_path: str) -> Optional[str]: def executable_launcher(self, file_name, docker_training, no_graphics, args): launch_string = self.validate_environment_path(file_name) if launch_string is None: - self._close() + self._close(0) raise UnityEnvironmentException( f"Couldn't launch the {file_name} environment. Provided filename does not match any environments." ) @@ -433,13 +434,21 @@ def close(self): else: raise UnityEnvironmentException("No Unity environment is loaded.") - def _close(self): + def _close(self, timeout: Optional[int] = None) -> None: + """ + Close the communicator and environment subprocess (if necessary). + + :int timeout: [Optional] Number of seconds to wait for the environment to shut down before + force-killing it. Defaults to `self.timeout_wait`. + """ + if timeout is None: + timeout = self.timeout_wait self._loaded = False self.communicator.close() if self.proc1 is not None: # Wait a bit for the process to shutdown, but kill it if it takes too long try: - self.proc1.wait(timeout=self.timeout_wait) + self.proc1.wait(timeout=timeout) signal_name = self.returncode_to_signal_name(self.proc1.returncode) signal_name = f" ({signal_name})" if signal_name else "" return_info = f"Environment shut down with return code {self.proc1.returncode}{signal_name}." 
diff --git a/ml-agents-envs/mlagents_envs/logging_util.py b/ml-agents-envs/mlagents_envs/logging_util.py new file mode 100644 index 0000000000..b768fc28ca --- /dev/null +++ b/ml-agents-envs/mlagents_envs/logging_util.py @@ -0,0 +1,46 @@ +import logging # noqa I251 + +CRITICAL = logging.CRITICAL +FATAL = logging.FATAL +ERROR = logging.ERROR +WARNING = logging.WARNING +INFO = logging.INFO +DEBUG = logging.DEBUG +NOTSET = logging.NOTSET + +_loggers = set() +_log_level = NOTSET +DATE_FORMAT = "%Y-%m-%d %H:%M:%S" +LOG_FORMAT = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + +def get_logger(name: str) -> logging.Logger: + """ + Create a logger with the specified name. The logger will use the log level + specified by set_log_level() + """ + logger = logging.getLogger(name=name) + + # If we've already set the log level, make sure new loggers use it + if _log_level != NOTSET: + logger.setLevel(_log_level) + + # Keep track of this logger so that we can change the log level later + _loggers.add(logger) + return logger + + +def set_log_level(log_level: int) -> None: + """ + Set the ML-Agents logging level. This will also configure the logging format (if it hasn't already been set). + """ + global _log_level + _log_level = log_level + + # Configure the log format. + # In theory, this would be sufficient, but if another library calls logging.basicConfig + # first, it doesn't have any effect. + logging.basicConfig(level=_log_level, format=LOG_FORMAT, datefmt=DATE_FORMAT) + + for logger in _loggers: + logger.setLevel(log_level) diff --git a/ml-agents-envs/mlagents_envs/side_channel/outgoing_message.py b/ml-agents-envs/mlagents_envs/side_channel/outgoing_message.py index 835349cbd2..83bbe3446c 100644 --- a/ml-agents-envs/mlagents_envs/side_channel/outgoing_message.py +++ b/ml-agents-envs/mlagents_envs/side_channel/outgoing_message.py @@ -1,9 +1,9 @@ from typing import List import struct -import logging +from mlagents_envs.logging_util import get_logger -logger = logging.getLogger(__name__) +logger = get_logger(__name__) class OutgoingMessage: diff --git a/ml-agents-envs/mlagents_envs/side_channel/side_channel.py b/ml-agents-envs/mlagents_envs/side_channel/side_channel.py index 52f2a106f1..469cb51eab 100644 --- a/ml-agents-envs/mlagents_envs/side_channel/side_channel.py +++ b/ml-agents-envs/mlagents_envs/side_channel/side_channel.py @@ -1,11 +1,11 @@ from abc import ABC, abstractmethod from typing import List import uuid -import logging from mlagents_envs.side_channel import IncomingMessage, OutgoingMessage +from mlagents_envs.logging_util import get_logger -logger = logging.getLogger(__name__) +logger = get_logger(__name__) class SideChannel(ABC): diff --git a/ml-agents/mlagents/logging_util.py b/ml-agents/mlagents/logging_util.py deleted file mode 100644 index a9478023d7..0000000000 --- a/ml-agents/mlagents/logging_util.py +++ /dev/null @@ -1,10 +0,0 @@ -import logging - - -def create_logger(name, log_level): - date_format = "%Y-%m-%d %H:%M:%S" - log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - - logging.basicConfig(level=log_level, format=log_format, datefmt=date_format) - logger = logging.getLogger(name=name) - return logger diff --git a/ml-agents/mlagents/model_serialization.py b/ml-agents/mlagents/model_serialization.py index fdd322ab0a..ee1bd0c85e 100644 --- a/ml-agents/mlagents/model_serialization.py +++ b/ml-agents/mlagents/model_serialization.py @@ -1,6 +1,5 @@ from distutils.util import strtobool import os -import logging from typing import 
Any, List, Set, NamedTuple from distutils.version import LooseVersion @@ -19,14 +18,16 @@ from tensorflow.python.platform import gfile from tensorflow.python.framework import graph_util + +from mlagents_envs.logging_util import get_logger from mlagents.trainers import tensorflow_to_barracuda as tf2bc if LooseVersion(tf.__version__) < LooseVersion("1.12.0"): # ONNX is only tested on 1.12.0 and later ONNX_EXPORT_ENABLED = False +logger = get_logger(__name__) -logger = logging.getLogger("mlagents.trainers") POSSIBLE_INPUT_NODES = frozenset( [ diff --git a/ml-agents/mlagents/trainers/__init__.py b/ml-agents/mlagents/trainers/__init__.py index 9da2f8fcca..903e77ce1b 100644 --- a/ml-agents/mlagents/trainers/__init__.py +++ b/ml-agents/mlagents/trainers/__init__.py @@ -1 +1 @@ -__version__ = "0.15.0" +__version__ = "0.15.1" diff --git a/ml-agents/mlagents/trainers/agent_processor.py b/ml-agents/mlagents/trainers/agent_processor.py index 97ddf74b85..50a3a4796b 100644 --- a/ml-agents/mlagents/trainers/agent_processor.py +++ b/ml-agents/mlagents/trainers/agent_processor.py @@ -1,5 +1,5 @@ import sys -from typing import List, Dict, Deque, TypeVar, Generic, Tuple, Set +from typing import List, Dict, Deque, TypeVar, Generic, Tuple, Any from collections import defaultdict, Counter, deque from mlagents_envs.base_env import BatchedStepResult, StepResult @@ -66,7 +66,6 @@ def add_experiences( for _entropy in take_action_outputs["entropy"]: self.stats_reporter.add_stat("Policy/Entropy", _entropy) - terminated_agents: Set[str] = set() # Make unique agent_ids that are global across workers action_global_agent_ids = [ get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids @@ -85,6 +84,7 @@ def add_experiences( stored_take_action_outputs = self.last_take_action_outputs.get( global_id, None ) + if stored_agent_step is not None and stored_take_action_outputs is not None: # We know the step is from the same worker, so use the local agent id. obs = stored_agent_step.obs @@ -143,6 +143,8 @@ def add_experiences( traj_queue.put(trajectory) self.experience_buffers[global_id] = [] if curr_agent_step.done: + # Record episode length for agents which have had at least + # 1 step. Done after reset ignored. self.stats_reporter.add_stat( "Environment/Cumulative Reward", self.episode_rewards.get(global_id, 0), @@ -151,7 +153,6 @@ def add_experiences( "Environment/Episode Length", self.episode_steps.get(global_id, 0), ) - terminated_agents.add(global_id) elif not curr_agent_step.done: self.episode_steps[global_id] += 1 @@ -160,9 +161,9 @@ def add_experiences( curr_agent_step, batched_step_result.agent_id_to_index[_id], ) - - for terminated_id in terminated_agents: - self._clean_agent_data(terminated_id) + # Delete all done agents, regardless of if they had a 0-length episode. + if curr_agent_step.done: + self._clean_agent_data(global_id) for _gid in action_global_agent_ids: # If the ID doesn't have a last step result, the agent just reset, @@ -177,14 +178,22 @@ def _clean_agent_data(self, global_id: str) -> None: """ Removes the data for an Agent. 
""" - del self.experience_buffers[global_id] - del self.last_take_action_outputs[global_id] - del self.last_step_result[global_id] - del self.episode_steps[global_id] - del self.episode_rewards[global_id] + self._safe_delete(self.experience_buffers, global_id) + self._safe_delete(self.last_take_action_outputs, global_id) + self._safe_delete(self.last_step_result, global_id) + self._safe_delete(self.episode_steps, global_id) + self._safe_delete(self.episode_rewards, global_id) self.policy.remove_previous_action([global_id]) self.policy.remove_memories([global_id]) + def _safe_delete(self, my_dictionary: Dict[Any, Any], key: Any) -> None: + """ + Safe removes data from a dictionary. If not found, + don't delete. + """ + if key in my_dictionary: + del my_dictionary[key] + def publish_trajectory_queue( self, trajectory_queue: "AgentManagerQueue[Trajectory]" ) -> None: diff --git a/ml-agents/mlagents/trainers/components/reward_signals/__init__.py b/ml-agents/mlagents/trainers/components/reward_signals/__init__.py index 72c719e64f..9c721b4d64 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/__init__.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/__init__.py @@ -1,4 +1,3 @@ -import logging from typing import Any, Dict, List from collections import namedtuple import numpy as np @@ -6,11 +5,13 @@ from mlagents.tf_utils import tf +from mlagents_envs.logging_util import get_logger from mlagents.trainers.exception import UnityTrainerException from mlagents.trainers.policy.tf_policy import TFPolicy from mlagents.trainers.buffer import AgentBuffer -logger = logging.getLogger("mlagents.trainers") + +logger = get_logger(__name__) RewardSignalResult = namedtuple( "RewardSignalResult", ["scaled_reward", "unscaled_reward"] diff --git a/ml-agents/mlagents/trainers/curriculum.py b/ml-agents/mlagents/trainers/curriculum.py index 0eba472265..12eaa7a5e7 100644 --- a/ml-agents/mlagents/trainers/curriculum.py +++ b/ml-agents/mlagents/trainers/curriculum.py @@ -4,9 +4,9 @@ from .exception import CurriculumConfigError, CurriculumLoadingError -import logging +from mlagents_envs.logging_util import get_logger -logger = logging.getLogger("mlagents.trainers") +logger = get_logger(__name__) class Curriculum: diff --git a/ml-agents/mlagents/trainers/distributions.py b/ml-agents/mlagents/trainers/distributions.py index a608f45932..294bad11cb 100644 --- a/ml-agents/mlagents/trainers/distributions.py +++ b/ml-agents/mlagents/trainers/distributions.py @@ -64,6 +64,7 @@ def __init__( act_size: List[int], reparameterize: bool = False, tanh_squash: bool = False, + condition_sigma: bool = True, log_sigma_min: float = -20, log_sigma_max: float = 2, ): @@ -79,7 +80,11 @@ def __init__( :param log_sigma_max: Maximum log standard deviation to clip by. 
""" encoded = self._create_mu_log_sigma( - logits, act_size, log_sigma_min, log_sigma_max + logits, + act_size, + log_sigma_min, + log_sigma_max, + condition_sigma=condition_sigma, ) self._sampled_policy = self._create_sampled_policy(encoded) if not reparameterize: @@ -101,6 +106,7 @@ def _create_mu_log_sigma( act_size: List[int], log_sigma_min: float, log_sigma_max: float, + condition_sigma: bool, ) -> "GaussianDistribution.MuSigmaTensors": mu = tf.layers.dense( @@ -112,14 +118,22 @@ def _create_mu_log_sigma( reuse=tf.AUTO_REUSE, ) - # Policy-dependent log_sigma_sq - log_sigma = tf.layers.dense( - logits, - act_size[0], - activation=None, - name="log_std", - kernel_initializer=ModelUtils.scaled_init(0.01), - ) + if condition_sigma: + # Policy-dependent log_sigma_sq + log_sigma = tf.layers.dense( + logits, + act_size[0], + activation=None, + name="log_std", + kernel_initializer=ModelUtils.scaled_init(0.01), + ) + else: + log_sigma = tf.get_variable( + "log_std", + [act_size[0]], + dtype=tf.float32, + initializer=tf.zeros_initializer(), + ) log_sigma = tf.clip_by_value(log_sigma, log_sigma_min, log_sigma_max) sigma = tf.exp(log_sigma) return self.MuSigmaTensors(mu, log_sigma, sigma) @@ -146,7 +160,7 @@ def _create_entropy( self, encoded: "GaussianDistribution.MuSigmaTensors" ) -> tf.Tensor: single_dim_entropy = 0.5 * tf.reduce_mean( - tf.log(2 * np.pi * np.e) + tf.square(encoded.log_sigma) + tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma ) # Make entropy the right shape return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy @@ -155,8 +169,8 @@ def _do_squash_correction_for_tanh(self, probs, squashed_policy): """ Adjust probabilities for squashed sample before output """ - probs -= tf.log(1 - squashed_policy ** 2 + EPSILON) - return probs + adjusted_probs = probs - tf.log(1 - squashed_policy ** 2 + EPSILON) + return adjusted_probs @property def total_log_probs(self) -> tf.Tensor: diff --git a/ml-agents/mlagents/trainers/env_manager.py b/ml-agents/mlagents/trainers/env_manager.py index 0a20a48203..23288e12b4 100644 --- a/ml-agents/mlagents/trainers/env_manager.py +++ b/ml-agents/mlagents/trainers/env_manager.py @@ -1,16 +1,17 @@ from abc import ABC, abstractmethod -import logging from typing import List, Dict, NamedTuple, Iterable from mlagents_envs.base_env import BatchedStepResult, AgentGroupSpec, AgentGroup from mlagents.trainers.brain import BrainParameters from mlagents.trainers.policy.tf_policy import TFPolicy from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue from mlagents.trainers.action_info import ActionInfo +from mlagents_envs.logging_util import get_logger AllStepResult = Dict[AgentGroup, BatchedStepResult] AllGroupSpec = Dict[AgentGroup, AgentGroupSpec] -logger = logging.getLogger("mlagents.trainers") + +logger = get_logger(__name__) class EnvironmentStep(NamedTuple): diff --git a/ml-agents/mlagents/trainers/ghost/trainer.py b/ml-agents/mlagents/trainers/ghost/trainer.py index 7abf65d1cb..c952e42fbe 100644 --- a/ml-agents/mlagents/trainers/ghost/trainer.py +++ b/ml-agents/mlagents/trainers/ghost/trainer.py @@ -4,8 +4,8 @@ from typing import Deque, Dict, List, Any, cast import numpy as np -import logging +from mlagents_envs.logging_util import get_logger from mlagents.trainers.brain import BrainParameters from mlagents.trainers.policy import Policy from mlagents.trainers.policy.tf_policy import TFPolicy @@ -14,7 +14,8 @@ from mlagents.trainers.trajectory import Trajectory from mlagents.trainers.agent_processor import 
AgentManagerQueue -logger = logging.getLogger("mlagents.trainers") + +logger = get_logger(__name__) class GhostTrainer(Trainer): diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index fdc159997a..b793f433e9 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -1,5 +1,4 @@ # # Unity ML-Agents Toolkit -import logging import argparse import os @@ -31,7 +30,9 @@ from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig from mlagents_envs.exception import UnityEnvironmentException from mlagents_envs.timers import hierarchical_timer, get_timer_tree -from mlagents.logging_util import create_logger +from mlagents_envs import logging_util + +logger = logging_util.get_logger(__name__) def _create_parser(): @@ -338,7 +339,7 @@ def write_timing_tree(summaries_dir: str, run_id: str) -> None: with open(timing_path, "w") as f: json.dump(get_timer_tree(), f, indent=4) except FileNotFoundError: - logging.warning( + logger.warning( f"Unable to save to {timing_path}. Make sure the directory exists" ) @@ -395,7 +396,7 @@ def prepare_for_docker_run(docker_target_name, env_path): shutil.copyfile(src_f, dst_f) os.chmod(dst_f, 0o775) # Make executable except Exception as e: - logging.getLogger("mlagents.trainers").info(e) + logger.info(e) env_path = "/ml-agents/{env_path}".format(env_path=env_path) return env_path @@ -471,16 +472,16 @@ def run_cli(options: RunOptions) -> None: print(get_version_string()) if options.debug: - log_level = logging.DEBUG + log_level = logging_util.DEBUG else: - log_level = logging.INFO + log_level = logging_util.INFO # disable noisy warnings from tensorflow tf_utils.set_warnings_enabled(False) - trainer_logger = create_logger("mlagents.trainers", log_level) + logging_util.set_log_level(log_level) - trainer_logger.debug("Configuration for this run:") - trainer_logger.debug(json.dumps(options._asdict(), indent=4)) + logger.debug("Configuration for this run:") + logger.debug(json.dumps(options._asdict(), indent=4)) run_seed = options.seed if options.cpu: diff --git a/ml-agents/mlagents/trainers/meta_curriculum.py b/ml-agents/mlagents/trainers/meta_curriculum.py index 60ea9cc5c5..699890359f 100644 --- a/ml-agents/mlagents/trainers/meta_curriculum.py +++ b/ml-agents/mlagents/trainers/meta_curriculum.py @@ -3,9 +3,9 @@ from typing import Dict, Set from mlagents.trainers.curriculum import Curriculum -import logging +from mlagents_envs.logging_util import get_logger -logger = logging.getLogger("mlagents.trainers") +logger = get_logger(__name__) class MetaCurriculum: diff --git a/ml-agents/mlagents/trainers/policy/nn_policy.py b/ml-agents/mlagents/trainers/policy/nn_policy.py index 528afa6a8b..a064b9d192 100644 --- a/ml-agents/mlagents/trainers/policy/nn_policy.py +++ b/ml-agents/mlagents/trainers/policy/nn_policy.py @@ -202,6 +202,7 @@ def _create_cc_actor( self.act_size, reparameterize=reparameterize, tanh_squash=tanh_squash, + condition_sigma=condition_sigma_on_obs, ) if tanh_squash: diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py index a92fb34a38..9d0c02a57a 100644 --- a/ml-agents/mlagents/trainers/policy/tf_policy.py +++ b/ml-agents/mlagents/trainers/policy/tf_policy.py @@ -1,10 +1,10 @@ -import logging from typing import Any, Dict, List, Optional import abc import numpy as np from mlagents.tf_utils import tf from mlagents import tf_utils from mlagents_envs.exception import UnityException +from mlagents_envs.logging_util 
import get_logger from mlagents.trainers.policy import Policy from mlagents.trainers.action_info import ActionInfo from mlagents.trainers.trajectory import SplitObservations @@ -13,7 +13,7 @@ from mlagents.trainers.models import ModelUtils -logger = logging.getLogger("mlagents.trainers") +logger = get_logger(__name__) class UnityPolicyException(UnityException): @@ -174,17 +174,6 @@ def get_action( if batched_step_result.n_agents() == 0: return ActionInfo.empty() - agents_done = [ - agent - for agent, done in zip( - batched_step_result.agent_id, batched_step_result.done - ) - if done - ] - - self.remove_memories(agents_done) - self.remove_previous_action(agents_done) - global_agent_ids = [ get_global_agent_id(worker_id, int(agent_id)) for agent_id in batched_step_result.agent_id @@ -379,9 +368,11 @@ def _initialize_tensorflow_references(self): def create_input_placeholders(self): with self.graph.as_default(): - self.global_step, self.increment_step_op, self.steps_to_increment = ( - ModelUtils.create_global_steps() - ) + ( + self.global_step, + self.increment_step_op, + self.steps_to_increment, + ) = ModelUtils.create_global_steps() self.visual_in = ModelUtils.create_visual_input_placeholders( self.brain.camera_resolutions ) diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index 308ee975e3..ecbb30a383 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -2,11 +2,11 @@ # ## ML-Agent Learning (PPO) # Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347 -import logging from collections import defaultdict import numpy as np +from mlagents_envs.logging_util import get_logger from mlagents.trainers.policy.nn_policy import NNPolicy from mlagents.trainers.trainer.rl_trainer import RLTrainer from mlagents.trainers.brain import BrainParameters @@ -16,7 +16,7 @@ from mlagents.trainers.exception import UnityTrainerException -logger = logging.getLogger("mlagents.trainers") +logger = get_logger(__name__) class PPOTrainer(RLTrainer): diff --git a/ml-agents/mlagents/trainers/sac/optimizer.py b/ml-agents/mlagents/trainers/sac/optimizer.py index 963595fb28..72494f4fe9 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer.py +++ b/ml-agents/mlagents/trainers/sac/optimizer.py @@ -1,9 +1,9 @@ -import logging import numpy as np from typing import Dict, List, Optional, Any, Mapping from mlagents.tf_utils import tf +from mlagents_envs.logging_util import get_logger from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer @@ -13,7 +13,7 @@ EPSILON = 1e-6 # Small value to avoid divide by zero -logger = logging.getLogger("mlagents.trainers") +logger = get_logger(__name__) POLICY_SCOPE = "" TARGET_SCOPE = "target_network" @@ -155,7 +155,6 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): "q1_loss": self.q1_loss, "q2_loss": self.q2_loss, "entropy_coef": self.ent_coef, - "entropy": self.policy.entropy, "update_batch": self.update_batch_policy, "update_value": self.update_batch_value, "update_entropy": self.update_batch_entropy, diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index 9ab025de28..f0a58779e5 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -2,7 +2,6 @@ # Contains an 
implementation of SAC as described in https://arxiv.org/abs/1801.01290 # and implemented in https://github.com/hill-a/stable-baselines -import logging from collections import defaultdict from typing import Dict import os @@ -10,6 +9,7 @@ import numpy as np +from mlagents_envs.logging_util import get_logger from mlagents_envs.timers import timed from mlagents.trainers.policy.tf_policy import TFPolicy from mlagents.trainers.policy.nn_policy import NNPolicy @@ -20,7 +20,8 @@ from mlagents.trainers.exception import UnityTrainerException -logger = logging.getLogger("mlagents.trainers") +logger = get_logger(__name__) + BUFFER_TRUNCATE_PERCENT = 0.8 @@ -73,7 +74,6 @@ def __init__( "memory_size", "model_path", "reward_signals", - "vis_encode_type", ] self._check_param_keys() diff --git a/ml-agents/mlagents/trainers/stats.py b/ml-agents/mlagents/trainers/stats.py index 70a417e3a3..53499c0e5c 100644 --- a/ml-agents/mlagents/trainers/stats.py +++ b/ml-agents/mlagents/trainers/stats.py @@ -4,9 +4,12 @@ import abc import csv import os - -from mlagents.tf_utils import tf +from mlagents_envs.logging_util import get_logger from mlagents_envs.timers import set_gauge +from mlagents.tf_utils import tf + + +logger = get_logger(__name__) class StatsSummary(NamedTuple): diff --git a/ml-agents/mlagents/trainers/subprocess_env_manager.py b/ml-agents/mlagents/trainers/subprocess_env_manager.py index 9164ec475f..f24841b0b1 100644 --- a/ml-agents/mlagents/trainers/subprocess_env_manager.py +++ b/ml-agents/mlagents/trainers/subprocess_env_manager.py @@ -1,4 +1,3 @@ -import logging from typing import Dict, NamedTuple, List, Any, Optional, Callable, Set import cloudpickle @@ -8,6 +7,7 @@ from multiprocessing.connection import Connection from queue import Empty as EmptyQueueException from mlagents_envs.base_env import BaseEnv, AgentGroup +from mlagents_envs.logging_util import get_logger from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult from mlagents_envs.timers import ( TimerNode, @@ -26,7 +26,8 @@ from mlagents_envs.side_channel.side_channel import SideChannel from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters -logger = logging.getLogger("mlagents.trainers") + +logger = get_logger(__name__) class EnvironmentCommand(NamedTuple): diff --git a/ml-agents/mlagents/trainers/tests/simple_test_envs.py b/ml-agents/mlagents/trainers/tests/simple_test_envs.py index 66a0abc3e0..d0817b0e60 100644 --- a/ml-agents/mlagents/trainers/tests/simple_test_envs.py +++ b/ml-agents/mlagents/trainers/tests/simple_test_envs.py @@ -13,7 +13,7 @@ VIS_OBS_SIZE = (20, 20, 3) STEP_SIZE = 0.1 -TIME_PENALTY = 0.001 +TIME_PENALTY = 0.01 MIN_STEPS = int(1.0 / STEP_SIZE) + 1 SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY diff --git a/ml-agents/mlagents/trainers/tests/test_agent_processor.py b/ml-agents/mlagents/trainers/tests/test_agent_processor.py index 0a3083bf2b..4ca4146427 100644 --- a/ml-agents/mlagents/trainers/tests/test_agent_processor.py +++ b/ml-agents/mlagents/trainers/tests/test_agent_processor.py @@ -152,6 +152,15 @@ def test_agent_deletion(): assert len(processor.last_take_action_outputs.keys()) == 0 assert len(processor.episode_steps.keys()) == 0 assert len(processor.episode_rewards.keys()) == 0 + assert len(processor.last_step_result.keys()) == 0 + + # check that steps with immediate dones don't add to dicts + processor.add_experiences(mock_done_step, 0, ActionInfo.empty()) + assert len(processor.experience_buffers.keys()) == 0 + assert 
len(processor.last_take_action_outputs.keys()) == 0 + assert len(processor.episode_steps.keys()) == 0 + assert len(processor.episode_rewards.keys()) == 0 + assert len(processor.last_step_result.keys()) == 0 def test_end_episode(): diff --git a/ml-agents/mlagents/trainers/tests/test_distributions.py b/ml-agents/mlagents/trainers/tests/test_distributions.py index 751894bd3f..c27047fd69 100644 --- a/ml-agents/mlagents/trainers/tests/test_distributions.py +++ b/ml-agents/mlagents/trainers/tests/test_distributions.py @@ -53,7 +53,7 @@ def dummy_config(): def test_gaussian_distribution(): with tf.Graph().as_default(): - logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32) + logits = tf.Variable(initial_value=[[1, 1]], trainable=True, dtype=tf.float32) distribution = GaussianDistribution( logits, act_size=VECTOR_ACTION_SPACE, @@ -71,6 +71,14 @@ def test_gaussian_distribution(): assert out.shape[1] == VECTOR_ACTION_SPACE[0] output = sess.run([distribution.total_log_probs]) assert output[0].shape[0] == 1 + # Test entropy is correct + log_std_tensor = tf.get_default_graph().get_tensor_by_name( + "log_std/BiasAdd:0" + ) + feed_dict = {log_std_tensor: [[1.0, 1.0]]} + entropy = sess.run([distribution.entropy], feed_dict=feed_dict) + # Entropy with log_std of 1.0 should be 2.42 + assert pytest.approx(entropy[0], 0.01) == 2.42 def test_tanh_distribution(): diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py index e8ba8e9957..a4280fcf17 100644 --- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py +++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py @@ -29,7 +29,7 @@ lambd: 0.95 learning_rate: 5.0e-3 learning_rate_schedule: constant - max_steps: 2000 + max_steps: 3000 memory_size: 16 normalize: false num_epoch: 3 @@ -89,6 +89,9 @@ def generate_config( # Custom reward processors shuld be built within the test function and passed to _check_environment_trains # Default is average over the last 5 final rewards def default_reward_processor(rewards, last_n_rewards=5): + rewards_to_use = rewards[-last_n_rewards:] + # For debugging tests + print("Last {} rewards:".format(last_n_rewards), rewards_to_use) return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean() @@ -120,7 +123,7 @@ def _check_environment_trains( trainer_config, reward_processor=default_reward_processor, meta_curriculum=None, - success_threshold=0.99, + success_threshold=0.9, env_manager=None, ): # Create controller and begin training. 
@@ -164,7 +167,6 @@ def _check_environment_trains( if ( success_threshold is not None ): # For tests where we are just checking setup and not reward - processed_rewards = [ reward_processor(rewards) for rewards in env.final_rewards.values() ] @@ -220,13 +222,14 @@ def test_visual_advanced_ppo(vis_encode_type, num_visual): def test_recurrent_ppo(use_discrete): env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete) override_vals = { - "max_steps": 3000, + "max_steps": 5000, "batch_size": 64, "buffer_size": 128, + "learning_rate": 1e-3, "use_recurrent": True, } config = generate_config(PPO_CONFIG, override_vals) - _check_environment_trains(env, config) + _check_environment_trains(env, config, success_threshold=0.9) @pytest.mark.parametrize("use_discrete", [True, False]) @@ -274,14 +277,6 @@ def test_visual_advanced_sac(vis_encode_type, num_visual): _check_environment_trains(env, config, success_threshold=0.5) -@pytest.mark.parametrize("use_discrete", [True, False]) -def test_recurrent_sac(use_discrete): - env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete) - override_vals = {"batch_size": 32, "use_recurrent": True, "max_steps": 2000} - config = generate_config(SAC_CONFIG, override_vals) - _check_environment_trains(env, config) - - @pytest.mark.parametrize("use_discrete", [True, False]) def test_simple_ghost(use_discrete): env = Simple1DEnvironment( @@ -319,7 +314,7 @@ def test_simple_ghost_fails(use_discrete): processed_rewards = [ default_reward_processor(rewards) for rewards in env.final_rewards.values() ] - success_threshold = 0.99 + success_threshold = 0.9 assert any(reward > success_threshold for reward in processed_rewards) and any( reward < success_threshold for reward in processed_rewards ) diff --git a/ml-agents/mlagents/trainers/trainer/trainer.py b/ml-agents/mlagents/trainers/trainer/trainer.py index a6c18cae94..9fd7ac0c14 100644 --- a/ml-agents/mlagents/trainers/trainer/trainer.py +++ b/ml-agents/mlagents/trainers/trainer/trainer.py @@ -1,5 +1,4 @@ # # Unity ML-Agents Toolkit -import logging from typing import Dict, List, Deque, Any import time import abc @@ -10,6 +9,7 @@ from collections import deque from mlagents_envs.timers import set_gauge +from mlagents_envs.logging_util import get_logger from mlagents.model_serialization import export_policy_model, SerializationSettings from mlagents.trainers.policy.tf_policy import TFPolicy from mlagents.trainers.stats import StatsReporter @@ -20,7 +20,8 @@ from mlagents.trainers.exception import UnityTrainerException from mlagents_envs.timers import hierarchical_timer -logger = logging.getLogger("mlagents.trainers") + +logger = get_logger(__name__) class Trainer(abc.ABC): diff --git a/ml-agents/mlagents/trainers/trainer_controller.py b/ml-agents/mlagents/trainers/trainer_controller.py index 9a5b50b9b3..0d83161398 100644 --- a/ml-agents/mlagents/trainers/trainer_controller.py +++ b/ml-agents/mlagents/trainers/trainer_controller.py @@ -4,13 +4,13 @@ import os import sys -import logging from typing import Dict, Optional, Set from collections import defaultdict import numpy as np from mlagents.tf_utils import tf +from mlagents_envs.logging_util import get_logger from mlagents.trainers.env_manager import EnvManager from mlagents_envs.exception import ( UnityEnvironmentException, @@ -55,7 +55,7 @@ def __init__( self.trainer_factory = trainer_factory self.model_path = model_path self.summaries_dir = summaries_dir - self.logger = logging.getLogger("mlagents.trainers") + self.logger = get_logger(__name__) self.run_id = 
run_id self.save_freq = save_freq self.train_model = train diff --git a/ml-agents/mlagents/trainers/trainer_util.py b/ml-agents/mlagents/trainers/trainer_util.py index a112da05c5..87849cbdcb 100644 --- a/ml-agents/mlagents/trainers/trainer_util.py +++ b/ml-agents/mlagents/trainers/trainer_util.py @@ -1,8 +1,8 @@ import os import yaml from typing import Any, Dict, TextIO -import logging +from mlagents_envs.logging_util import get_logger from mlagents.trainers.meta_curriculum import MetaCurriculum from mlagents.trainers.exception import TrainerConfigError from mlagents.trainers.trainer import Trainer @@ -11,7 +11,8 @@ from mlagents.trainers.sac.trainer import SACTrainer from mlagents.trainers.ghost.trainer import GhostTrainer -logger = logging.getLogger("mlagents.trainers") + +logger = get_logger(__name__) class TrainerFactory: diff --git a/setup.cfg b/setup.cfg index e728ceb30e..ecd551bdd7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,3 +20,4 @@ ignore = I200, banned-modules = tensorflow = use mlagents.tf_utils instead (it handles tf2 compat). + logging = use mlagents_envs.logging_util instead
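
For reference, a minimal usage sketch of the logging helpers this patch introduces in mlagents_envs/logging_util.py (get_logger / set_log_level, following the pattern used in learn.py above):

    from mlagents_envs.logging_util import get_logger, set_log_level, DEBUG

    logger = get_logger(__name__)  # registers the logger so later level changes apply to it
    set_log_level(DEBUG)           # configures the log format and updates all registered loggers
    logger.debug("Debug output is now visible.")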