[bug-fix] Fix POCA LSTM, pad sequences in the back #5206

Merged
14 commits merged on Apr 5, 2021
2 changes: 1 addition & 1 deletion com.unity.ml-agents/CHANGELOG.md
@@ -45,7 +45,7 @@ depend on the previous behavior, you can explicitly set the Agent's `InferenceDevice`
### Bug Fixes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)

- Fixed an issue with LSTM when used with POCA and `sequence_length` < `time_horizon`. Also improved behavior of LSTMs slightly. (#5206)

## [1.9.0-preview] - 2021-03-17
### Major Changes
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/buffer.py
@@ -180,7 +180,7 @@ def get_batch(
else:
# We want to duplicate the last value in the array, multiplied by the padding_value.
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return [padding] * (training_length - leftover) + self[:]
return self[:] + [padding] * (training_length - leftover)

else:
return self[len(self) - batch_size * training_length :]
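To make the change concrete: padding is now appended after the real data rather than prepended. A minimal sketch of the new behavior using a plain list and a hypothetical `padding_value` argument (not the real `AgentBufferField.get_batch`, which also handles batching and memory fields):

```python
import numpy as np

def get_batch_back_padded(data, training_length, padding_value=0.0):
    """Pad a trajectory at the back (the new behavior) so its length is a
    multiple of training_length; pad entries copy the last value scaled by
    padding_value, mirroring the buffer.py change."""
    leftover = len(data) % training_length
    if leftover == 0:
        return list(data)
    # Duplicate the last value in the array, multiplied by the padding_value.
    padding = np.array(data[-1], dtype=np.float32) * padding_value
    return list(data) + [padding] * (training_length - leftover)

# Trajectory of length 10, sequence length 3 -> the last sequence becomes [obs, pad, pad].
batch = get_batch_back_padded([np.float32(i) for i in range(10)], training_length=3)
print(len(batch))  # 12 = 4 sequences of length 3, padding at the end
```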
60 changes: 30 additions & 30 deletions ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
@@ -2,6 +2,7 @@
from mlagents.torch_utils import torch
import numpy as np
import math
from collections import defaultdict

from mlagents.trainers.buffer import AgentBuffer, AgentBufferField
from mlagents.trainers.trajectory import ObsUtil
@@ -76,53 +77,52 @@ def _evaluate_by_sequence(
"""
num_experiences = tensor_obs[0].shape[0]
all_next_memories = AgentBufferField()
# In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and
# trajectory is of length 10, the 1st sequence is [pad,pad,obs].
# In the buffer, the last sequence are the ones that are padded. So if seq_len = 3 and
# trajectory is of length 10, the last sequence is [obs,pad,pad].
# Compute the number of elements in this padded seq.
leftover = num_experiences % self.policy.sequence_length

# Compute values for the potentially truncated initial sequence
seq_obs = []

first_seq_len = leftover if leftover > 0 else self.policy.sequence_length
for _obs in tensor_obs:
first_seq_obs = _obs[0:first_seq_len]
seq_obs.append(first_seq_obs)

# For the first sequence, the initial memory should be the one at the
# beginning of this trajectory.
for _ in range(first_seq_len):
all_next_memories.append(ModelUtils.to_numpy(initial_memory.squeeze()))

init_values, _mem = self.critic.critic_pass(
seq_obs, initial_memory, sequence_length=first_seq_len
)
all_values = {
signal_name: [init_values[signal_name]]
for signal_name in init_values.keys()
}

all_values: Dict[str, List[np.ndarray]] = defaultdict(list)
_mem = initial_memory
# Evaluate other trajectories, carrying over _mem after each
# trajectory
for seq_num in range(
1, math.ceil((num_experiences) / (self.policy.sequence_length))
0, math.floor((num_experiences) / (self.policy.sequence_length))
Contributor suggested change: replace
`0, math.floor((num_experiences) / (self.policy.sequence_length))`
with
`math.floor((num_experiences) / (self.policy.sequence_length))`

Contributor suggested change: replace
`0, math.floor((num_experiences) / (self.policy.sequence_length))`
with
`num_experiences // self.policy.sequence_length`

):
seq_obs = []
for _ in range(self.policy.sequence_length):
all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
start = seq_num * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
end = (seq_num + 1) * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
start = seq_num * self.policy.sequence_length
end = (seq_num + 1) * self.policy.sequence_length

for _obs in tensor_obs:
seq_obs.append(_obs[start:end])
values, _mem = self.critic.critic_pass(
seq_obs, _mem, sequence_length=self.policy.sequence_length
)
for signal_name, _val in values.items():
all_values[signal_name].append(_val)

# Compute values for the potentially truncated last sequence
seq_obs = []

last_seq_len = leftover
if last_seq_len > 0:
Contributor suggested change: replace
`last_seq_len = leftover`
`if last_seq_len > 0:`
with
`if leftover > 0:`

for _obs in tensor_obs:
last_seq_obs = _obs[-last_seq_len:]
Contributor: nit: why use `last_seq_len` instead of `leftover`? Or rename `leftover` when it's first created.

seq_obs.append(last_seq_obs)

# For the last sequence, the initial memory should be the one at the
# end of this trajectory.
for _ in range(last_seq_len):
Contributor (@andrewcoh, Mar 31, 2021): Why do we add the last `_mem` for each extra leftover obs? Why isn't it enough to just append this once?

Contributor Author: We need the mem array to be the same length as all the other ones in the buffer. But you're right, this isn't needed. To address @vincentpierre's comment above, we could also add `_mem` once and add a bunch of zeros/NaNs, but this saves us from having to create/allocate those empty arrays.

all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
Contributor: Why does it matter what the memory in the padding is? Can we make this all zeros?

Contributor Author: This isn't in the padding, as the padding isn't added to the trajectory until later. This method only has unpadded data. Added a comment.

Contributor: I don't understand that comment:

            # For the last sequence, the initial memory should be the one at the
            # end of this trajectory.

Can you give more info?


last_values, _mem = self.critic.critic_pass(
seq_obs, _mem, sequence_length=last_seq_len
)
for signal_name, _val in last_values.items():
all_values[signal_name].append(_val)

# Create one tensor per reward signal
all_value_tensors = {
signal_name: torch.cat(value_list, dim=0)
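The reworked loop walks the full sequences first and evaluates the shorter leftover chunk last, carrying the LSTM memory forward between calls. A hedged sketch of just that indexing and memory bookkeeping, with a stand-in `critic_pass` so it runs on its own (not the real critic API):

```python
import numpy as np

def evaluate_by_sequence(obs, sequence_length, initial_memory, critic_pass):
    """Split a trajectory into full sequences plus a trailing leftover chunk,
    threading the memory through each call (mirrors the new ordering)."""
    num_experiences = obs.shape[0]
    leftover = num_experiences % sequence_length
    all_values, all_next_memories = [], []
    mem = initial_memory
    for seq_num in range(num_experiences // sequence_length):
        start = seq_num * sequence_length
        end = (seq_num + 1) * sequence_length
        # Every step in this sequence records the memory it started from.
        all_next_memories.extend([mem] * sequence_length)
        values, mem = critic_pass(obs[start:end], mem)
        all_values.append(values)
    if leftover > 0:
        # The truncated last sequence starts from the memory at the end of the trajectory.
        all_next_memories.extend([mem] * leftover)
        values, mem = critic_pass(obs[-leftover:], mem)
        all_values.append(values)
    return np.concatenate(all_values), all_next_memories, mem

# Toy critic: value = mean of the chunk, "memory" = running count of steps seen.
fake_critic = lambda chunk, mem: (chunk.mean(axis=-1), mem + len(chunk))
vals, mems, final_mem = evaluate_by_sequence(
    np.arange(10, dtype=np.float32).reshape(10, 1), 3, 0, fake_critic
)
print(len(mems), final_mem)  # 10 10
```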
141 changes: 68 additions & 73 deletions ml-agents/mlagents/trainers/poca/optimizer_torch.py
@@ -1,4 +1,5 @@
from typing import Dict, cast, List, Tuple, Optional
from collections import defaultdict
from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import (
ExtrinsicRewardProvider,
)
@@ -381,116 +382,110 @@ def _evaluate_by_sequence_team(
num_experiences = self_obs[0].shape[0]
all_next_value_mem = AgentBufferField()
all_next_baseline_mem = AgentBufferField()
# In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and
# trajectory is of length 10, the 1st sequence is [pad,pad,obs].
# In the buffer, the last sequence are the ones that are padded. So if seq_len = 3 and
Contributor: Mention in this comment that this is about LSTM and memories.

# trajectory is of length 10, the last sequence is [obs,pad,pad].
# Compute the number of elements in this padded seq.
leftover = num_experiences % self.policy.sequence_length

# Compute values for the potentially truncated initial sequence

first_seq_len = leftover if leftover > 0 else self.policy.sequence_length

self_seq_obs = []
groupmate_seq_obs = []
groupmate_seq_act = []
seq_obs = []
for _self_obs in self_obs:
first_seq_obs = _self_obs[0:first_seq_len]
seq_obs.append(first_seq_obs)
self_seq_obs.append(seq_obs)

for groupmate_obs, groupmate_action in zip(obs, actions):
seq_obs = []
for _obs in groupmate_obs:
first_seq_obs = _obs[0:first_seq_len]
seq_obs.append(first_seq_obs)
groupmate_seq_obs.append(seq_obs)
_act = groupmate_action.slice(0, first_seq_len)
groupmate_seq_act.append(_act)

# For the first sequence, the initial memory should be the one at the
# beginning of this trajectory.
for _ in range(first_seq_len):
all_next_value_mem.append(ModelUtils.to_numpy(init_value_mem.squeeze()))
all_next_baseline_mem.append(
ModelUtils.to_numpy(init_baseline_mem.squeeze())
)

all_seq_obs = self_seq_obs + groupmate_seq_obs
init_values, _value_mem = self.critic.critic_pass(
all_seq_obs, init_value_mem, sequence_length=first_seq_len
)
all_values = {
signal_name: [init_values[signal_name]]
for signal_name in init_values.keys()
}

groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
init_baseline, _baseline_mem = self.critic.baseline(
self_seq_obs[0],
groupmate_obs_and_actions,
init_baseline_mem,
sequence_length=first_seq_len,
)
all_baseline = {
signal_name: [init_baseline[signal_name]]
for signal_name in init_baseline.keys()
}
all_values: Dict[str, List[np.ndarray]] = defaultdict(list)
all_baseline: Dict[str, List[np.ndarray]] = defaultdict(list)
_baseline_mem = init_baseline_mem
_value_mem = init_value_mem

# Evaluate other trajectories, carrying over _mem after each
# trajectory
for seq_num in range(
1, math.ceil((num_experiences) / (self.policy.sequence_length))
0, math.floor((num_experiences) / (self.policy.sequence_length))
Contributor: This can be simplified now.

):
for _ in range(self.policy.sequence_length):
all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
all_next_baseline_mem.append(
ModelUtils.to_numpy(_baseline_mem.squeeze())
)

start = seq_num * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
end = (seq_num + 1) * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
start = seq_num * self.policy.sequence_length
Contributor: This feels like there is a bunch of duplicate code with this part: https://github.com/Unity-Technologies/ml-agents/pull/5206/files#diff-01b42574a4de05de250e29039fc1bbc67b200f3a7c5ed0676b2eee549b943c11R95
Is it possible to combine some of it?

Contributor Author: It's annoyingly similar but just slightly different enough to make it really hard to combine. The POCA version has to operate on group obs and actions as well 🤔

end = (seq_num + 1) * self.policy.sequence_length

self_seq_obs = []
groupmate_seq_obs = []
groupmate_seq_act = []
seq_obs = []
for _self_obs in self_obs:
seq_obs.append(_obs[start:end])
seq_obs.append(_self_obs[start:end])
self_seq_obs.append(seq_obs)

for groupmate_obs, team_action in zip(obs, actions):
for groupmate_obs, groupmate_action in zip(obs, actions):
seq_obs = []
for (_obs,) in groupmate_obs:
first_seq_obs = _obs[start:end]
seq_obs.append(first_seq_obs)
for _obs in groupmate_obs:
sliced_seq_obs = _obs[start:end]
seq_obs.append(sliced_seq_obs)
groupmate_seq_obs.append(seq_obs)
_act = team_action.slice(start, end)
_act = groupmate_action.slice(start, end)
groupmate_seq_act.append(_act)

all_seq_obs = self_seq_obs + groupmate_seq_obs
values, _value_mem = self.critic.critic_pass(
all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length
)
all_values = {
signal_name: [init_values[signal_name]] for signal_name in values.keys()
}
for signal_name, _val in values.items():
all_values[signal_name].append(_val)

groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
baselines, _baseline_mem = self.critic.baseline(
self_seq_obs[0],
groupmate_obs_and_actions,
_baseline_mem,
sequence_length=first_seq_len,
sequence_length=self.policy.sequence_length,
)
for signal_name, _val in baselines.items():
all_baseline[signal_name].append(_val)

# Compute values for the potentially truncated initial sequence
last_seq_len = leftover
if last_seq_len > 0:
self_seq_obs = []
groupmate_seq_obs = []
groupmate_seq_act = []
seq_obs = []
for _self_obs in self_obs:
last_seq_obs = _self_obs[-last_seq_len:]
seq_obs.append(last_seq_obs)
self_seq_obs.append(seq_obs)

for groupmate_obs, groupmate_action in zip(obs, actions):
seq_obs = []
for _obs in groupmate_obs:
last_seq_obs = _obs[-last_seq_len:]
seq_obs.append(last_seq_obs)
groupmate_seq_obs.append(seq_obs)
_act = groupmate_action.slice(len(_obs) - last_seq_len, len(_obs))
groupmate_seq_act.append(_act)

# For the last sequence, the initial memory should be the one at the
# beginning of this trajectory.
seq_obs = []
last_seq_len = leftover
for _ in range(last_seq_len):
all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
all_next_baseline_mem.append(
ModelUtils.to_numpy(_baseline_mem.squeeze())
Comment on lines +467 to +469

Contributor: Why are we padding with this value and not NaN or zeros?

Contributor Author: The sequence at this point isn't padded - it's just the leftover bit that would go into a new sequence. E.g. if there were three sequences of length "2" [[s0, s1],[s2, s3],[s4]], at this point this is just s4. When we pad in the buffer, it will become [s4, 0], but not at this point.

With that said, only the 1st memory of each sequence is used. I'm repeating the memory rather than using zeros or ones so we don't need to allocate a new numpy array every time.

)

all_seq_obs = self_seq_obs + groupmate_seq_obs
last_values, _value_mem = self.critic.critic_pass(
all_seq_obs, _value_mem, sequence_length=last_seq_len
)
for signal_name, _val in last_values.items():
all_values[signal_name].append(_val)
groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
last_baseline, _baseline_mem = self.critic.baseline(
self_seq_obs[0],
groupmate_obs_and_actions,
_baseline_mem,
sequence_length=last_seq_len,
)
all_baseline = {
signal_name: [baselines[signal_name]]
for signal_name in baselines.keys()
}
for signal_name, _val in last_baseline.items():
all_baseline[signal_name].append(_val)
# Create one tensor per reward signal
all_value_tensors = {
signal_name: torch.cat(value_list, dim=0)
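To illustrate the point from the padding discussion above: only the memory stored at the first step of each sequence is ever consumed, so repeating the current memory for the leftover steps just keeps the buffer field the same length as the rest of the trajectory. A small sketch under the author's assumptions (sequence length 2, trajectory s0..s4); `use_sequence_memories` is a hypothetical helper, not ml-agents API:

```python
SEQ_LEN = 2
trajectory = ["s0", "s1", "s2", "s3", "s4"]     # 2 full sequences + leftover [s4]
step_memories = ["m0", "m0", "m2", "m2", "m4"]  # memory repeated within each sequence

def use_sequence_memories(memories, seq_len):
    """Only the memory stored at the first step of each sequence is consumed;
    the repeats exist solely to keep this buffer field the same length as the
    rest of the trajectory so back-padding stays uniform."""
    return [memories[i] for i in range(0, len(memories), seq_len)]

print(use_sequence_memories(step_memories, SEQ_LEN))  # ['m0', 'm2', 'm4']
```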
4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/tests/torch/test_poca.py
@@ -60,7 +60,7 @@ def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
}

trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
NetworkSettings.MemorySettings(sequence_length=8, memory_size=10)
if use_rnn
else None
)
@@ -125,7 +125,7 @@ def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete):
optimizer = create_test_poca_optimizer(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
time_horizon = 15
time_horizon = 30
trajectory = make_fake_trajectory(
length=time_horizon,
observation_specs=optimizer.policy.behavior_spec.observation_specs,
8 changes: 5 additions & 3 deletions ml-agents/mlagents/trainers/tests/torch/test_ppo.py
@@ -197,7 +197,9 @@ def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete):
optimizer = create_test_ppo_optimizer(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
time_horizon = 15
# Time horizon is longer than sequence length, make sure to test
# process trajectory on multiple sequences in trajectory + some padding
time_horizon = 30
trajectory = make_fake_trajectory(
length=time_horizon,
observation_specs=optimizer.policy.behavior_spec.observation_specs,
@@ -214,9 +216,9 @@ def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete):

for key, val in run_out.items():
assert type(key) is str
assert len(val) == 15
assert len(val) == time_horizon
if all_memories is not None:
assert len(all_memories) == 15
assert len(all_memories) == time_horizon

run_out, final_value_out, _ = optimizer.get_trajectory_value_estimates(
trajectory.to_agentbuffer(), trajectory.next_obs, done=True
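The test changes raise `time_horizon` above the sequence length so the fake trajectory covers several full sequences plus a padded tail. A quick illustrative split of the POCA test settings (sequence_length=8, time_horizon=30); the helper below is for illustration only, not part of the test suite:

```python
def sequence_split(time_horizon, sequence_length):
    """How a trajectory of time_horizon steps is split for an LSTM:
    full sequences, a leftover chunk, and padding added at the back."""
    full = time_horizon // sequence_length
    leftover = time_horizon % sequence_length
    pad = (sequence_length - leftover) % sequence_length
    return full, leftover, pad

# POCA test settings after this PR: sequence_length=8, time_horizon=30.
print(sequence_split(30, 8))  # (3, 6, 2): 3 full sequences, leftover of 6, padded with 2
```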