# utils.py
from config import *
import numpy as np

# if default_config['TrainMethod'] in ['PPO', 'ICM', 'RND']:
#     num_step = int(ppo_config['NumStep'])
# else:
#     num_step = int(default_config['NumStep'])

# Training settings read from the parsed config (default_config comes from the config module).
use_gae = default_config.getboolean('UseGAE')
lam = float(default_config['Lambda'])
train_method = default_config['TrainMethod']


def make_train_data(reward, done, value, next_value):
    """Build return targets and advantages from one worker's rollout (1-D arrays of length num_step)."""
    num_step = len(reward)
    discounted_return = np.empty([num_step])

    # Local settings; these shadow the values read from the config above.
    use_gae = True
    use_standardization = False
    gamma = 0.99
    lam = 0.95
    stable_eps = 1e-30

    # Discounted Return
    if use_gae:
        # Generalized Advantage Estimation: accumulate lambda-discounted TD errors backwards in time.
        gae = 0
        for t in range(num_step - 1, -1, -1):
            delta = reward[t] + gamma * next_value[t] * (1 - done[t]) - value[t]
            gae = delta + gamma * lam * (1 - done[t]) * gae
            discounted_return[t] = gae + value[t]

        # For Actor
        adv = discounted_return - value
    else:
        # One-step bootstrapped (TD) targets.
        for t in range(num_step - 1, -1, -1):
            running_add = reward[t] + gamma * next_value[t] * (1 - done[t])
            discounted_return[t] = running_add

        # For Actor
        adv = discounted_return - value

    if use_standardization:
        adv = (adv - adv.mean()) / (adv.std() + stable_eps)

    return discounted_return, adv
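

# A minimal usage sketch for make_train_data (illustrative only; the numbers are made up).
# It assumes 1-D per-step arrays from a single worker, with next_value[t] being the
# critic's estimate for the state reached after step t.
def _example_make_train_data():
    rewards = np.array([0.0, 0.0, 1.0, 0.0])
    dones = np.array([0, 0, 0, 1])
    values = np.array([0.5, 0.6, 0.9, 0.1])
    next_values = np.array([0.6, 0.9, 0.1, 0.0])
    returns, advantages = make_train_data(rewards, dones, values, next_values)
    # Both outputs have shape (4,): returns are critic targets, advantages feed the actor.
    return returns, advantages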


def make_train_data_icm(reward, done, value, gamma, num_step, num_worker):
    """Batched version: expects worker-major arrays of shape (num_worker, num_step);
    value carries one extra bootstrap column, i.e. shape (num_worker, num_step + 1)."""
    discounted_return = np.empty([num_worker, num_step])
    use_gae = True
    lam = 0.95

    # Discounted Return
    if use_gae:
        gae = np.zeros(num_worker)  # one running GAE accumulator per worker
        for t in range(num_step - 1, -1, -1):
            delta = reward[:, t] + gamma * value[:, t + 1] * (1 - done[:, t]) - value[:, t]
            gae = delta + gamma * lam * (1 - done[:, t]) * gae
            discounted_return[:, t] = gae + value[:, t]

        # For Actor
        adv = discounted_return - value[:, :-1]
    else:
        # Full discounted return, bootstrapped from the last value column.
        running_add = value[:, -1]
        for t in range(num_step - 1, -1, -1):
            running_add = reward[:, t] + gamma * running_add * (1 - done[:, t])
            discounted_return[:, t] = running_add

        # For Actor
        adv = discounted_return - value[:, :-1]

    return discounted_return.reshape([-1]), adv.reshape([-1])
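

# A minimal usage sketch for make_train_data_icm (illustrative only). Note the shapes:
# reward/done are (num_worker, num_step), value is (num_worker, num_step + 1) because its
# last column is the bootstrap value; the outputs come back flattened to 1-D.
def _example_make_train_data_icm():
    num_worker, num_step, gamma = 2, 3, 0.99
    rewards = np.zeros((num_worker, num_step))
    dones = np.zeros((num_worker, num_step))
    values = np.ones((num_worker, num_step + 1))
    returns, advantages = make_train_data_icm(rewards, dones, values, gamma, num_step, num_worker)
    # Both outputs have shape (num_worker * num_step,) = (6,).
    return returns, advantages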


class RunningMeanStd(object):
    # Running estimate of mean and variance, updated batch by batch.
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        self.update_from_moments(batch_mean, batch_var, batch_count)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        # Parallel-variance update: merge the batch moments into the running moments.
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count

        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        M2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        new_var = M2 / tot_count
        new_count = tot_count

        self.mean = new_mean
        self.var = new_var
        self.count = new_count
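

# A minimal usage sketch for RunningMeanStd (illustrative only), e.g. normalizing a batch
# of observations with the running statistics; the 1e-8 term here is only to avoid
# division by zero.
def _example_running_mean_std():
    obs_rms = RunningMeanStd(shape=(4,))
    batch = np.random.randn(32, 4)  # 32 four-dimensional observations
    obs_rms.update(batch)
    normalized = (batch - obs_rms.mean) / np.sqrt(obs_rms.var + 1e-8)
    return normalized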


class RewardForwardFilter(object):
    # Keeps a gamma-discounted running sum of the rewards passed to update().
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems
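

# A minimal usage sketch (illustrative only) of the common RND-style pattern: feed the
# discounted reward sums from RewardForwardFilter into a RunningMeanStd, then divide the
# raw intrinsic rewards by the running standard deviation.
def _example_reward_normalization():
    rff = RewardForwardFilter(gamma=0.99)
    reward_rms = RunningMeanStd()
    intrinsic_reward = np.abs(np.random.randn(4))  # one intrinsic reward per worker
    reward_rms.update(rff.update(intrinsic_reward))
    normalized = intrinsic_reward / np.sqrt(reward_rms.var + 1e-8)
    return normalized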


def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis]  # necessary step to do broadcasting
    e_x = np.exp(z - s)  # subtract the row max for numerical stability
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis]  # ditto
    return e_x / div
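

# A minimal usage sketch for softmax (illustrative only): it expects a 2-D array of
# logits with shape (batch, num_actions) and returns row-wise probabilities.
def _example_softmax():
    logits = np.array([[1.0, 2.0, 3.0],
                       [0.0, 0.0, 0.0]])
    probs = softmax(logits)
    # Each row of probs sums to 1.
    return probs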