
Merge pull request #8 from djbyrne/feature/vpg_base
Feature/vpg base
djbyrne authored Jun 5, 2020
2 parents 78201b5 + 64c9183 commit c1a2fa2
Showing 12 changed files with 599 additions and 23 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.pyc
*.py[cod]
*$py.class

8 changes: 5 additions & 3 deletions algos/common/experience.py
@@ -5,10 +5,10 @@
from torch.utils.data import IterableDataset

from algos.common.agents import Agent
from algos.common.memory import Experience
from algos.common.memory import Experience, MeanBuffer


class OnPolicyExperienceStream(IterableDataset):
class EpisodicExperienceStream(IterableDataset):
"""
Basic experience stream that iteratively yields the current experience of the agent in the env
@@ -17,12 +17,13 @@ class OnPolicyExperienceStream(IterableDataset):
agent: Agent being used to make decisions
"""

def __init__(self, env: Env, agent: Agent, episodes: int = 1):
def __init__(self, env: Env, agent: Agent, episodes: int = 1, reward_buffer_size=1000000):
self.env = env
self.agent = agent
self.state = self.env.reset()
self.episodes = episodes
self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
self.reward_buffer = MeanBuffer(reward_buffer_size)

def __getitem__(self, item):
return item
@@ -50,6 +51,7 @@ def step(self) -> Experience:
"""Carries out a single step in the environment"""
action = self.agent(self.state, self.device)
new_state, reward, done, _ = self.env.step(action)
self.reward_buffer.add(reward)
experience = Experience(state=self.state, action=action, reward=reward, new_state=new_state, done=done)
self.state = new_state

23 changes: 23 additions & 0 deletions algos/common/memory.py
@@ -1,6 +1,7 @@
"""Series of memory buffers sued"""

# Named tuple for storing experience steps gathered in training
import collections
from typing import Tuple, List
from collections import deque, namedtuple

@@ -127,6 +128,28 @@ def sample(self, batch_size: int) -> Tuple:
raise Exception('Buffer length is less than the batch size')


class MeanBuffer:
"""
Stores a deque of items and calculates the mean
"""
def __init__(self, capacity):
self.capacity = capacity
self.deque = collections.deque(maxlen=capacity)
self.sum = 0.0

def add(self, val: float) -> None:
"""Add to the buffer"""
if len(self.deque) == self.capacity:
self.sum -= self.deque[0]
self.deque.append(val)
self.sum += val

def mean(self) -> float:
"""Retrieve the mean"""
if not self.deque:
return 0.0
return self.sum / len(self.deque)

class PERBuffer:
"""simple list based Prioritized Experience Replay Buffer"""

21 changes: 7 additions & 14 deletions algos/reinforce/model.py
@@ -10,22 +10,22 @@
see the metrics:
tensorboard --logdir default
"""
import argparse
from collections import OrderedDict
from copy import deepcopy
from itertools import chain
from typing import Tuple, List
import argparse
from collections import OrderedDict

import gym
import torch
from torch import Tensor
import torch.optim as optim
from torch import Tensor
from torch.nn.functional import log_softmax
from torch.optim import Optimizer
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import gym

from algos.common.agents import PolicyAgent
from algos.common.experience import OnPolicyExperienceStream
from algos.common.experience import EpisodicExperienceStream
from algos.common.memory import Experience
from algos.common.networks import MLP
from algos.common.wrappers import ToTensor
@@ -172,13 +172,6 @@ def loss(self, batch_qvals: List[Tensor], batch_states: List[Tensor], batch_acti
Returns:
loss
"""
assert len(batch_states.shape) == 2
assert isinstance(batch_states, Tensor)
assert len(batch_qvals.shape) == 1
assert isinstance(batch_qvals, Tensor)
assert len(batch_actions.shape) == 1
assert isinstance(batch_actions, Tensor)

logits = self.net(batch_states)
log_prob = log_softmax(logits, dim=1)
log_prob_actions = batch_qvals * log_prob[range(len(batch_states)), batch_actions]
@@ -231,7 +224,7 @@ def configure_optimizers(self) -> List[Optimizer]:

def _dataloader(self) -> DataLoader:
"""Initialize the Replay Buffer dataset used for retrieving experiences"""
dataset = OnPolicyExperienceStream(self.env, self.agent, episodes=self.hparams.batch_episodes)
dataset = EpisodicExperienceStream(self.env, self.agent, episodes=self.hparams.batch_episodes)
dataloader = DataLoader(dataset=dataset)
return dataloader

158 changes: 158 additions & 0 deletions algos/vanilla_policy_gradient/README.md
@@ -0,0 +1,158 @@
# N Step DQN

N Step DQN was introduced in [Learning to Predict by the Methods of Temporal Differences](http://incompleteideas.net/papers/sutton-88-with-erratum.pdf). This method improves upon the original DQN by updating
our Q values with the expected reward from multiple steps in the future, as opposed to the expected reward from the
immediate next state. With a single step, the Q value for a state-action pair looks like this:

<img src="https://latex.codecogs.com/svg.latex?\Large&space;Q(s_t,a_t)=r_t+{\gamma}\max_{a'}Q(s_{t+1},a')"/>

Because the Q function is recursive, we can continue to roll this expansion out over multiple steps, looking at the
expected return for each step further into the future.

<img src="https://latex.codecogs.com/svg.latex?\Large&space;Q(s_t,a_t)=r_t+{\gamma}r_{t+1}+{\gamma}^2\max_{a'}Q(s_{t+2},a')"/>
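
Rolling this out to an arbitrary n steps (the general form of the equations above) gives:

<img src="https://latex.codecogs.com/svg.latex?\Large&space;Q(s_t,a_t)=\sum_{k=0}^{n-1}{\gamma}^kr_{t+k}+{\gamma}^n\max_{a'}Q(s_{t+n},a')"/>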

The 2-step example above could be rolled out all the way to the end of the episode, which is just
Monte Carlo learning. Although we could do a Monte Carlo update and look forward to the end of the episode, it
wouldn't be a good idea. Every time we take another step into the future, we are basing our approximation on our
current policy. For a large portion of training, our policy is going to be less than optimal. For example, at the start
of training, our policy will be in a state of high exploration and will be little better than random.

---
**NOTE**

For each rollout step, the discount factor must be scaled according to the number of steps taken. As you can see from the
equation above, the second gamma value is raised to the power of 2. If we rolled this out one step further, we would use
gamma to the power of 3, and so on. A small sketch of this accumulation follows the note.

---
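
To make the scaling concrete, here is a minimal sketch of how the discount accumulates over a rollout (the helper name `n_step_return` is hypothetical and not part of the repository):

````python
def n_step_return(rewards, gamma: float = 0.99) -> float:
    """Accumulate a discounted n-step return: r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ..."""
    return sum((gamma ** k) * reward for k, reward in enumerate(rewards))


# Example: a 3-step rollout with rewards [1.0, 0.0, 2.0] and gamma = 0.9
# gives 1.0 + 0.9 * 0.0 + 0.81 * 2.0 = 2.62
n_step_return([1.0, 0.0, 2.0], gamma=0.9)
````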

So if we are approximating future rewards off a bad policy, chances are those approximations are going to be pretty
bad, and the further we unroll our update equation, the worse it will get. The fact that we are using an off-policy method
like DQN with a large replay buffer makes this even worse, as there is a high chance that we will be training on
experiences gathered with an old policy that was worse than our current policy.

So we need to strike a balance between looking far enough ahead to improve the convergence of our agent, but not so far
that our updates become unstable. In general, small values of n in the range 2-4 work best.

### Benefits

- Multi-step learning is capable of learning faster than typical 1-step learning methods.
- Note that this method introduces a new hyperparameter n, although n=4 is generally a good starting point and provides
  good results across the board.

### Implementation

#### Multi Step Buffer

The only thing we need to change for the N Step DQN is the buffer. We need a multi-step
buffer that combines n steps into a single experience. This requires the following:

##### N Step Buffer:

Unlike the standard buffer, we need to use two buffers: one to store the n-step rollouts
and another to hold the accumulated multi-step experiences. A rough sketch of this layout is shown below.
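
As an illustration, the two-buffer layout might be set up as follows (the class name and constructor signature here are assumptions for the sketch; the attribute names match the snippet further down):

````python
import collections


class MultiStepBuffer:
    """Minimal sketch of a buffer that folds n single steps into one experience."""

    def __init__(self, capacity: int, n_step: int = 4) -> None:
        self.n_step = n_step
        # rolling window holding the last n raw single-step experiences
        self.n_step_buffer = collections.deque(maxlen=n_step)
        # main buffer holding the accumulated multi-step experiences used for sampling
        self.buffer = collections.deque(maxlen=capacity)
````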

##### Append:
The append function needs to be changed. If the n_step_buffer is not full, i.e. we don't have
enough experiences to make a multi-step experience, then we just append the current experience
to the n_step_buffer.

If the n_step_buffer has enough experiences, then we can take the last n steps and form an
accumulated multi-step experience to be added to the buffer.

The multi-step experience will look like the following:

- State = the state at the start of the n_step_buffer
- Action = the action at the start of the n_step_buffer
- Reward = the accumulated discounted reward over the last n steps
- Next State = the next state at the end of the n_step_buffer
- Done = the done flag at the end of the n_step_buffer

````python

def append(self, experience) -> None:
"""
add an experience to the buffer by collecting n steps of experiences
Args:
experience: tuple (state, action, reward, done, next_state)
"""
self.n_step_buffer.append(experience)

if len(self.n_step_buffer) >= self.n_step:
reward, next_state, done = self.get_transition_info()
first_experience = self.n_step_buffer[0]
multi_step_experience = Experience(first_experience.state,
first_experience.action,
reward,
done,
next_state)

self.buffer.append(multi_step_experience)

def get_transition_info(self, gamma=0.9) -> Tuple[np.float, np.array, np.int]:
"""
get the accumulated transition info for the n_step_buffer
Args:
gamma: discount factor
Returns:
multi step reward, final observation and done
"""
last_experience = self.n_step_buffer[-1]
final_state = last_experience.new_state
done = last_experience.done
reward = last_experience.reward

# calculate reward
# in reverse order, go through all the experiences up till the first experience
for experience in reversed(list(self.n_step_buffer)[:-1]):
reward_t = experience.reward
new_state_t = experience.new_state
done_t = experience.done

reward = reward_t + gamma * reward * (1 - done_t)
final_state, done = (new_state_t, done_t) if done_t else (final_state, done)

return reward, final_state, done
````

##### Sample

The sample function will behave as in a standard replay buffer, drawing experiences uniformly from the accumulated
multi-step buffer. A snippet of the code for the N Step Replay Buffer is shown above; a rough sketch of the sampling
step follows below.
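
As an illustration only, a uniform sample might look like this minimal sketch (the field order matches the `Experience(state, action, reward, done, next_state)` construction in the snippet above; the exact repository implementation may differ):

````python
import numpy as np


def sample(self, batch_size: int) -> tuple:
    """Draw a uniform batch of accumulated multi-step experiences from self.buffer."""
    indices = np.random.choice(len(self.buffer), batch_size, replace=False)
    states, actions, rewards, dones, next_states = zip(*(self.buffer[idx] for idx in indices))
    return (np.array(states), np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.bool_),
            np.array(next_states))
````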

## Results

As expected, the N-Step DQN converges much faster than the standard DQN; however, it also adds more instability to the
loss of the agent. This can be seen in the following experiments.

### Pong

#### N-Step DQN

The N-Step DQN shows the greatest increase in performance with respect to the other DQN variations. After less than 150k steps the agent begins to
consistently win games and achieves the top score after ~170k steps. This is reflected in the sharp peak of the
total episode steps and, of course, the total episode rewards.

![N-Step DQN Baseline Results](../../docs/images/pong_nstep_dqn_1.png)

#### DQN vs N-Step DQN

This improvement is in stark contrast to the base DQN, which only begins to win games after 250k steps and
requires over twice as many steps (450k) as the N-Step agent to achieve the high score of 21. One important thing to
notice is the large increase in the loss of the N-Step agent. This is expected, as the agent is building
its expected reward off approximations of the future states. The larger the size of n, the greater the instability.
Previous literature, listed below, shows the best results for the Pong environment with an n step between 3 and 5. For these
experiments I opted for an n step of 4.

![N-Step DQN Baseline Results vs DQN Baseline Results](../../docs/images/pong_nstep_dqn_2.png)


## References
- [Learning to Predict by the Methods of Temporal Differences](http://incompleteideas.net/papers/sutton-88-with-erratum.pdf)
- [Deep Reinforcement Learning Hands On: Second Edition - Chapter 08](https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition)
- [Rainbow Is All You Need](https://github.com/Curt-Park/rainbow-is-all-you-need/blob/master/07.n_step_learning.ipynb)
Empty file.
