Merged
1 change: 1 addition & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -50,6 +50,7 @@ depend on the previous behavior, you can explicitly set the Agent's `InferenceDe
settings. Unfortunately, this may require retraining models if it changes the resulting order of the sensors
or actuators on your system. (#5194)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- Fixed an issue which was causing increased variance when using LSTMs. Also fixed an issue with LSTM when used with POCA and `sequence_length` < `time_horizon`. (#5206)
- Fixed a bug where the SAC replay buffer would not be saved out at the end of a run, even if `save_replay_buffer` was enabled. (#5205)

## [1.9.0-preview] - 2021-03-17
2 changes: 1 addition & 1 deletion config/ppo/Hallway.yaml
@@ -5,7 +5,7 @@ behaviors:
batch_size: 128
buffer_size: 1024
learning_rate: 0.0003
beta: 0.01
beta: 0.03
Contributor

Should we add in the documentation that it may be good practice to increase beta with LSTM? Or do we think that this is a peculiarity of Hallway?

Contributor Author

I think it's because of Hallway's sparse reward and a reward structure that has a bunch of local maxima (at least 2, given the reward curve). Hard to say if we'd need to increase beta for other envs.
Our betas might just be too low across the board.

epsilon: 0.2
lambd: 0.95
num_epoch: 3
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/buffer.py
@@ -180,7 +180,7 @@ def get_batch(
else:
# We want to duplicate the last value in the array, multiplied by the padding_value.
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return [padding] * (training_length - leftover) + self[:]
return self[:] + [padding] * (training_length - leftover)

else:
return self[len(self) - batch_size * training_length :]
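
For illustration, a minimal sketch of the new padding order (not the real `AgentBufferField.get_batch`, which handles more cases): the leftover partial sequence is now padded at its end rather than at the start.

```python
import numpy as np

def pad_trailing_sequence(values, training_length, padding_value=0.0):
    # Toy version of the fixed behavior: a 10-step trajectory with
    # training_length=3 now yields [v0, ..., v9, pad, pad], not [pad, pad, v0, ...].
    leftover = len(values) % training_length
    if leftover == 0:
        return list(values)
    # Duplicate the last value, scaled by padding_value (as in the diff above).
    padding = np.array(values[-1], dtype=np.float32) * padding_value
    return list(values) + [padding] * (training_length - leftover)

# 4 steps with sequences of length 3 -> 2 padded entries appended at the end.
print(pad_trailing_sequence([np.float32(v) for v in (1, 2, 3, 4)], training_length=3))
```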
71 changes: 35 additions & 36 deletions ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
@@ -1,7 +1,7 @@
from typing import Dict, Optional, Tuple, List
from mlagents.torch_utils import torch
import numpy as np
import math
from collections import defaultdict

from mlagents.trainers.buffer import AgentBuffer, AgentBufferField
from mlagents.trainers.trajectory import ObsUtil
@@ -76,53 +76,52 @@ def _evaluate_by_sequence(
"""
num_experiences = tensor_obs[0].shape[0]
all_next_memories = AgentBufferField()
# In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and
# trajectory is of length 10, the 1st sequence is [pad,pad,obs].
# Compute the number of elements in this padded seq.
leftover = num_experiences % self.policy.sequence_length

# Compute values for the potentially truncated initial sequence
seq_obs = []

first_seq_len = leftover if leftover > 0 else self.policy.sequence_length
for _obs in tensor_obs:
first_seq_obs = _obs[0:first_seq_len]
seq_obs.append(first_seq_obs)

# For the first sequence, the initial memory should be the one at the
# beginning of this trajectory.
for _ in range(first_seq_len):
all_next_memories.append(ModelUtils.to_numpy(initial_memory.squeeze()))

init_values, _mem = self.critic.critic_pass(
seq_obs, initial_memory, sequence_length=first_seq_len
)
all_values = {
signal_name: [init_values[signal_name]]
for signal_name in init_values.keys()
}

# When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes,
# that division isn't even, and we must pad the leftover sequence.
# When it is added to the buffer, the last sequence will be padded. So if seq_len = 3 and
# trajectory is of length 10, the last sequence is [obs,pad,pad] once it is added to the buffer.
# Compute the number of elements in this sequence that will end up being padded.
leftover_seq_len = num_experiences % self.policy.sequence_length

all_values: Dict[str, List[np.ndarray]] = defaultdict(list)
_mem = initial_memory
# Evaluate other trajectories, carrying over _mem after each
# trajectory
for seq_num in range(
1, math.ceil((num_experiences) / (self.policy.sequence_length))
):
for seq_num in range(num_experiences // self.policy.sequence_length):
seq_obs = []
for _ in range(self.policy.sequence_length):
all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
start = seq_num * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
end = (seq_num + 1) * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
start = seq_num * self.policy.sequence_length
end = (seq_num + 1) * self.policy.sequence_length

for _obs in tensor_obs:
seq_obs.append(_obs[start:end])
values, _mem = self.critic.critic_pass(
seq_obs, _mem, sequence_length=self.policy.sequence_length
)
for signal_name, _val in values.items():
all_values[signal_name].append(_val)

# Compute values for the potentially truncated last sequence. Note that this
# sequence isn't padded yet, but will be.
seq_obs = []

if leftover_seq_len > 0:
for _obs in tensor_obs:
last_seq_obs = _obs[-leftover_seq_len:]
seq_obs.append(last_seq_obs)

# For the last sequence, the initial memory should be the one at the
# end of this trajectory.
for _ in range(leftover_seq_len):
all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
Contributor

Why does it matter what the memory in the padding is? Can we make this all zeros?

Contributor Author

This isn't in the padding, as the padding isn't added to the trajectory until later. This method only has unpadded data. Added a comment

Contributor

I don't understand that comment:

            # For the last sequence, the initial memory should be the one at the
            # end of this trajectory.

Can you give more info?


last_values, _mem = self.critic.critic_pass(
seq_obs, _mem, sequence_length=leftover_seq_len
)
for signal_name, _val in last_values.items():
all_values[signal_name].append(_val)

# Create one tensor per reward signal
all_value_tensors = {
signal_name: torch.cat(value_list, dim=0)
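
To make the new control flow in `_evaluate_by_sequence` easier to follow, here is a condensed sketch with a stand-in critic (the real critic, memories, and reward-signal dictionaries live elsewhere in ml-agents): full sequences are evaluated in order while the LSTM memory is carried forward, and the truncated leftover sequence, which the buffer pads later, is evaluated last.

```python
import numpy as np

def evaluate_by_sequence(obs, seq_len, initial_memory, critic_pass):
    # Sketch of the fixed flow: whole sequences first (carrying memory forward),
    # then the truncated leftover sequence at the end.
    num_experiences = obs.shape[0]
    leftover = num_experiences % seq_len

    values, next_memories = [], []
    mem = initial_memory
    for seq_num in range(num_experiences // seq_len):
        start, end = seq_num * seq_len, (seq_num + 1) * seq_len
        # Every step of this sequence records the memory it started from.
        next_memories.extend([mem] * seq_len)
        val, mem = critic_pass(obs[start:end], mem)
        values.append(val)

    if leftover > 0:
        # The leftover steps reuse the memory carried to the end of the full sequences.
        next_memories.extend([mem] * leftover)
        val, mem = critic_pass(obs[-leftover:], mem)
        values.append(val)

    return np.concatenate(values, axis=0), next_memories

# Stand-in critic: "value" is the mean of each observation row; memory is passed through.
fake_critic = lambda seq, mem: (seq.mean(axis=1), mem)
vals, mems = evaluate_by_sequence(np.arange(20.0).reshape(10, 2), 4, np.zeros(3), fake_critic)
assert vals.shape == (10,) and len(mems) == 10   # one value and one memory per step
```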
147 changes: 70 additions & 77 deletions ml-agents/mlagents/trainers/poca/optimizer_torch.py
@@ -1,9 +1,9 @@
from typing import Dict, cast, List, Tuple, Optional
from collections import defaultdict
from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import (
ExtrinsicRewardProvider,
)
import numpy as np
import math
from mlagents.torch_utils import torch, default_device

from mlagents.trainers.buffer import (
@@ -381,116 +381,109 @@ def _evaluate_by_sequence_team(
num_experiences = self_obs[0].shape[0]
all_next_value_mem = AgentBufferField()
all_next_baseline_mem = AgentBufferField()
# In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and
# trajectory is of length 10, the 1st sequence is [pad,pad,obs].
# Compute the number of elements in this padded seq.
leftover = num_experiences % self.policy.sequence_length

# Compute values for the potentially truncated initial sequence

first_seq_len = leftover if leftover > 0 else self.policy.sequence_length

self_seq_obs = []
groupmate_seq_obs = []
groupmate_seq_act = []
seq_obs = []
for _self_obs in self_obs:
first_seq_obs = _self_obs[0:first_seq_len]
seq_obs.append(first_seq_obs)
self_seq_obs.append(seq_obs)

for groupmate_obs, groupmate_action in zip(obs, actions):
seq_obs = []
for _obs in groupmate_obs:
first_seq_obs = _obs[0:first_seq_len]
seq_obs.append(first_seq_obs)
groupmate_seq_obs.append(seq_obs)
_act = groupmate_action.slice(0, first_seq_len)
groupmate_seq_act.append(_act)

# For the first sequence, the initial memory should be the one at the
# beginning of this trajectory.
for _ in range(first_seq_len):
all_next_value_mem.append(ModelUtils.to_numpy(init_value_mem.squeeze()))
all_next_baseline_mem.append(
ModelUtils.to_numpy(init_baseline_mem.squeeze())
)

all_seq_obs = self_seq_obs + groupmate_seq_obs
init_values, _value_mem = self.critic.critic_pass(
all_seq_obs, init_value_mem, sequence_length=first_seq_len
)
all_values = {
signal_name: [init_values[signal_name]]
for signal_name in init_values.keys()
}
# When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes,
# that division isn't even, and we must pad the leftover sequence.
# In the buffer, the last sequence is the one that is padded. So if seq_len = 3 and
Contributor

Mention in this comment that this is about LSTM and memories.

# trajectory is of length 10, the last sequence is [obs,pad,pad].
# Compute the number of elements in this padded seq.
leftover_seq_len = num_experiences % self.policy.sequence_length

groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
init_baseline, _baseline_mem = self.critic.baseline(
self_seq_obs[0],
groupmate_obs_and_actions,
init_baseline_mem,
sequence_length=first_seq_len,
)
all_baseline = {
signal_name: [init_baseline[signal_name]]
for signal_name in init_baseline.keys()
}
all_values: Dict[str, List[np.ndarray]] = defaultdict(list)
all_baseline: Dict[str, List[np.ndarray]] = defaultdict(list)
_baseline_mem = init_baseline_mem
_value_mem = init_value_mem

# Evaluate other trajectories, carrying over _mem after each
# trajectory
for seq_num in range(
1, math.ceil((num_experiences) / (self.policy.sequence_length))
):
for seq_num in range(num_experiences // self.policy.sequence_length):
for _ in range(self.policy.sequence_length):
all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
all_next_baseline_mem.append(
ModelUtils.to_numpy(_baseline_mem.squeeze())
)

start = seq_num * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
end = (seq_num + 1) * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
start = seq_num * self.policy.sequence_length
Contributor

This feels like there is a bunch of duplicate code with this part: https://github.com/Unity-Technologies/ml-agents/pull/5206/files#diff-01b42574a4de05de250e29039fc1bbc67b200f3a7c5ed0676b2eee549b943c11R95
Is it possible to combine some of it?

Contributor Author

It's annoyingly similar but just slightly different enough to make it really hard to combine. The POCA version has to operate on group obs and actions as well 🤔

end = (seq_num + 1) * self.policy.sequence_length

self_seq_obs = []
groupmate_seq_obs = []
groupmate_seq_act = []
seq_obs = []
for _self_obs in self_obs:
seq_obs.append(_obs[start:end])
seq_obs.append(_self_obs[start:end])
self_seq_obs.append(seq_obs)

for groupmate_obs, team_action in zip(obs, actions):
for groupmate_obs, groupmate_action in zip(obs, actions):
seq_obs = []
for (_obs,) in groupmate_obs:
first_seq_obs = _obs[start:end]
seq_obs.append(first_seq_obs)
for _obs in groupmate_obs:
sliced_seq_obs = _obs[start:end]
seq_obs.append(sliced_seq_obs)
groupmate_seq_obs.append(seq_obs)
_act = team_action.slice(start, end)
_act = groupmate_action.slice(start, end)
groupmate_seq_act.append(_act)

all_seq_obs = self_seq_obs + groupmate_seq_obs
values, _value_mem = self.critic.critic_pass(
all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length
)
all_values = {
signal_name: [init_values[signal_name]] for signal_name in values.keys()
}
for signal_name, _val in values.items():
all_values[signal_name].append(_val)

groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
baselines, _baseline_mem = self.critic.baseline(
self_seq_obs[0],
groupmate_obs_and_actions,
_baseline_mem,
sequence_length=first_seq_len,
sequence_length=self.policy.sequence_length,
)
for signal_name, _val in baselines.items():
all_baseline[signal_name].append(_val)

# Compute values for the potentially truncated last sequence
if leftover_seq_len > 0:
self_seq_obs = []
groupmate_seq_obs = []
groupmate_seq_act = []
seq_obs = []
for _self_obs in self_obs:
last_seq_obs = _self_obs[-leftover_seq_len:]
seq_obs.append(last_seq_obs)
self_seq_obs.append(seq_obs)

for groupmate_obs, groupmate_action in zip(obs, actions):
seq_obs = []
for _obs in groupmate_obs:
last_seq_obs = _obs[-leftover_seq_len:]
seq_obs.append(last_seq_obs)
groupmate_seq_obs.append(seq_obs)
_act = groupmate_action.slice(len(_obs) - leftover_seq_len, len(_obs))
groupmate_seq_act.append(_act)

# For the last sequence, the initial memory should be the one at the
# end of this trajectory.
seq_obs = []
for _ in range(leftover_seq_len):
all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
all_next_baseline_mem.append(
ModelUtils.to_numpy(_baseline_mem.squeeze())
Comment on lines +467 to +469
Contributor

Why are we padding with this value and not NaN or zeros?

Contributor Author

The sequence at this point isn't padded - it's just the leftover bit that would go into a new sequence. E.g. if there were three sequences of length "2" [[s0, s1],[s2, s3],[s4]], at this point this is just s4. When we pad in the buffer, it will become [s4, 0], but not at this point.

With that said, only the 1st memory of each sequence is used. I'm repeating the memory rather than using zeros or ones so we don't need to allocate a new numpy array every time.

)

all_seq_obs = self_seq_obs + groupmate_seq_obs
last_values, _value_mem = self.critic.critic_pass(
all_seq_obs, _value_mem, sequence_length=leftover_seq_len
)
for signal_name, _val in last_values.items():
all_values[signal_name].append(_val)
groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
last_baseline, _baseline_mem = self.critic.baseline(
self_seq_obs[0],
groupmate_obs_and_actions,
_baseline_mem,
sequence_length=leftover_seq_len,
)
all_baseline = {
signal_name: [baselines[signal_name]]
for signal_name in baselines.keys()
}
for signal_name, _val in last_baseline.items():
all_baseline[signal_name].append(_val)
# Create one tensor per reward signal
all_value_tensors = {
signal_name: torch.cat(value_list, dim=0)
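
Tying back to the review thread above about padding and repeated memories, a small illustration (names here are invented for the example, not the ml-agents API): with `sequence_length = 2` and a 5-step trajectory, `_evaluate_by_sequence_team` only ever sees the unpadded steps, and it repeats the current memory per step rather than allocating zero arrays, since only the first memory of each sequence is read back.

```python
import numpy as np

sequence_length = 2
steps = [f"s{i}" for i in range(5)]               # s0..s4; the buffer pads [s4] to [s4, 0] later
leftover = len(steps) % sequence_length           # 1 leftover step

mem = np.ones(4)                                  # stand-in LSTM state
next_memories = []
for _ in range(len(steps) // sequence_length):    # the two full sequences
    next_memories.extend([mem] * sequence_length)
    mem = mem * 0.9                               # pretend the critic produced a new memory
if leftover:
    next_memories.extend([mem] * leftover)        # repeat, don't zero-fill

assert len(next_memories) == len(steps)           # one memory per *unpadded* step
```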
12 changes: 6 additions & 6 deletions ml-agents/mlagents/trainers/tests/test_buffer.py
@@ -110,9 +110,6 @@ def test_buffer():
np.array(a),
np.array(
[
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[201, 202, 203],
[211, 212, 213],
[221, 222, 223],
@@ -122,17 +119,20 @@
[261, 262, 263],
[271, 272, 273],
[281, 282, 283],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
]
),
)
# Test group entries return Lists of Lists. Make sure to pad properly!
a = agent_2_buffer[BufferKey.GROUP_CONTINUOUS_ACTION].get_batch(
batch_size=None, training_length=4, sequential=True
)
for _group_entry in a[:3]:
assert len(_group_entry) == 0
for _group_entry in a[3:]:
for _group_entry in a[:-3]:
assert len(_group_entry) == 3
for _group_entry in a[-3:]:
assert len(_group_entry) == 0

agent_1_buffer.reset_agent()
assert agent_1_buffer.num_experiences == 0
12 changes: 6 additions & 6 deletions ml-agents/mlagents/trainers/tests/torch/test_poca.py
@@ -60,7 +60,7 @@ def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
}

trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
NetworkSettings.MemorySettings(sequence_length=8, memory_size=10)
if use_rnn
else None
)
@@ -125,7 +125,7 @@ def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete):
optimizer = create_test_poca_optimizer(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
time_horizon = 15
time_horizon = 30
trajectory = make_fake_trajectory(
length=time_horizon,
observation_specs=optimizer.policy.behavior_spec.observation_specs,
@@ -147,14 +147,14 @@ def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete):
)
for key, val in value_estimates.items():
assert type(key) is str
assert len(val) == 15
assert len(val) == time_horizon
for key, val in baseline_estimates.items():
assert type(key) is str
assert len(val) == 15
assert len(val) == time_horizon

if value_memories is not None:
assert len(value_memories) == 15
assert len(baseline_memories) == 15
assert len(value_memories) == time_horizon
assert len(baseline_memories) == time_horizon

(
value_estimates,
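
A quick sanity check on the updated test parameters (a sketch, not part of the test file): with `sequence_length=8` and `time_horizon=30`, the trajectory no longer divides evenly into sequences, so the truncated-final-sequence code path is exercised.

```python
sequence_length = 8
time_horizon = 30

full_sequences, leftover = divmod(time_horizon, sequence_length)
print(full_sequences, leftover)  # 3 full sequences plus a leftover of 6 steps

# The optimizer evaluates 3 full sequences and one truncated sequence of 6 steps;
# the buffer later pads that last sequence with 2 entries to reach length 8.
assert full_sequences * sequence_length + leftover == time_horizon
```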