#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Vanilla policy gradient implementation.

Not intended for production use.
"""
import numpy as np

from models import MultilayerPerceptron
from utils import discount_rewards, run_episode
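

# discount_rewards(rewards, gamma) is imported from utils, whose source is not
# shown here. The helper below is only an illustrative sketch of the
# computation implied by the return definition in the VPG docstring,
# R_t = Σ_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}; it is an assumption about, not a
# copy of, the utils implementation, and the class below keeps using the
# imported version.
def _reference_discount_rewards(rewards, gamma):
    """Illustrative reference for the discounting step (assumed, see above)."""
    returns = np.zeros(len(rewards), dtype=float)
    running = 0.0
    # Walk the episode backwards, accumulating the discounted sum of rewards.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns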


class VPG:
    """Vanilla policy gradient.

    The objective we want to maximize is E[R(tau)], the expected return of
    trajectory tau under the current policy. By the Policy Gradient Theorem,
    the gradient of this objective takes the form

        ∇ E[R(tau)] = E[ Σ_{t=0}^{T-1} ∇ log pi(a_t|s_t) R_t ]

    where R_t is the total discounted reward following action a_t:

        R_t = Σ_{t'=t}^{T-1} gamma^(t'-t) * R(s_{t'}, a_{t'})

    We estimate the expectation in the policy gradient via Monte Carlo
    (i.e. we take N rollouts and average their gradients) and then perform
    gradient ascent. A usage sketch appears at the bottom of this module.

    Attributes:
        env: Gym environment.
        policy: Policy network (i.e. actor network).
        lr: Learning rate.
        gamma: Discount factor.
        use_baseline: Whether to subtract a learned value baseline for
            variance reduction.
        max_episodes: Maximum number of episodes to train for.
        value: Value network (i.e. critic network) used as the baseline.
    """

    def __init__(
            self,
            env,
            policy,
            lr=0.002,
            gamma=0.99,
            use_baseline=True,
            max_episodes=2000):
        self.env = env
        self.policy = policy
        self.lr = lr
        self.gamma = gamma
        self.use_baseline = use_baseline
        self.max_episodes = max_episodes
        # Critic: a small MLP that predicts the state value and serves as the
        # baseline when use_baseline is True.
        self.value = MultilayerPerceptron(
            input_units=env.observation_space.shape[0], output_units=1,
            hidden_units=4, num_hidden=1, lr=lr)

    def train(self):
        episode_returns = []
        for i in range(self.max_episodes):
            # Collect one on-policy rollout, then update the actor and critic.
            total_reward, rewards, observations, actions, probs = run_episode(
                self.env, self.policy)
            episode_returns.append(total_reward)
            self.update(rewards, observations, actions)
            if i % 100 == 0:
                print('Episode #' + str(i) + '\tScore: ' + str(total_reward))
        return episode_returns

    def update(self, rewards, obs, actions):
        # Gradient of log pi(a_t|s_t) with respect to the policy parameters,
        # one row per timestep of the rollout.
        grad_log_p = np.array([self.policy.grad_log_p(ob)[action]
                               for ob, action in zip(obs, actions)])
        # Empirical discounted returns R_t and critic estimates V(s_t).
        discounted_returns = discount_rewards(rewards, self.gamma)
        values = self.value.predict(obs)
        # If using a baseline for variance reduction, subtract the value
        # estimates from the empirical returns; otherwise use the returns
        # directly.
        if self.use_baseline:
            gradient = grad_log_p.T @ (discounted_returns - values.T).ravel()
        else:
            gradient = grad_log_p.T @ discounted_returns
        # Actor training step: gradient ascent on the policy parameters.
        self.policy.theta += self.lr * gradient
        # Critic training step: regress the value network towards the
        # empirical returns.
        self.value.update(values, discounted_returns[:, None])
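

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original training code). It assumes a
# discrete-action Gym environment and that MultilayerPerceptron can also act
# as the policy network, i.e. that it exposes the grad_log_p/theta interface
# VPG.update relies on; those assumptions are illustrative, not confirmed by
# this file.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')
    policy = MultilayerPerceptron(
        input_units=env.observation_space.shape[0],
        output_units=env.action_space.n,  # one output per discrete action
        hidden_units=4, num_hidden=1, lr=0.002)
    agent = VPG(env, policy, lr=0.002, gamma=0.99, use_baseline=True,
                max_episodes=2000)
    returns = agent.train()
    print('Mean return over the last 100 episodes: '
          + str(np.mean(returns[-100:])))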