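"""PPO agent.

Defines an actor-critic Agent trained with Proximal Policy Optimization:
rollouts are collected into a memory buffer, advantages are estimated with
GAE, the actor is updated with the clipped surrogate objective, and the
critic is regressed onto the GAE returns. The class also handles evaluation,
live plotting of training curves, and saving of weights and training history.
"""
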
import json
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output
from networks import Actor, Critic
from ppo_memory import Memory


class Agent(object):
    def __init__(self,
                 params: dict,
                 env,
                 obs_dim,
                 act_dim,
                 score_thresh: float,
                 is_evaluate: bool,
                 plot_interval: int,
                 train_history_path: str):
        self.params = params
        self.env = env
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        # agent neural networks
        self.actor = Actor(
            input_dim=self.obs_dim,
            output_dim=self.act_dim,
            hidden_units=self.params["ACTOR_HU"]
        )
        self.critic = Critic(
            input_dim=self.obs_dim,
            hidden_units=self.params["CRITIC_HU"]
        )
        # neural network optimizers
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.params["ACTOR_LR"], epsilon=1e-7)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=self.params["CRITIC_LR"], epsilon=1e-7)
        # memory buffer
        self.agent_mem = Memory()
        # logging
        self.actor_loss_history = []
        self.critic_loss_history = []
        self.score_history = []
        self.plot_interval = plot_interval
        self.train_history_path = train_history_path
        self.is_evaluate = is_evaluate
        self.score_threshold = score_thresh

    def _choose_action(self, state: np.ndarray) -> float:
        """Choose action based on observation. If currently not evaluating, store step info in buffer
        :param state: observation returned from the environment
        :return: action
        """
        state = tf.convert_to_tensor(state, tf.float32)
        action, distribution = self.actor(state)
        if not self.is_evaluate:
            value = self.critic(state)
            # store data of current time step of the episode
            self.agent_mem.states.append(state)
            self.agent_mem.actions.append(action)
            self.agent_mem.log_probs.append(distribution.log_prob(action))
            self.agent_mem.values.append(value)
        return list(action.numpy()).pop()

    def _step(self, action: float):
        nxt_state, reward, done, info = self.env.step(action)
        # add fake dim to match dimensions with batch size
        nxt_state = np.reshape(nxt_state, (1, -1)).astype(np.float64)
        reward = np.reshape(reward, (1, -1)).astype(np.float64)
        done = np.reshape(done, (1, -1))
        if not self.is_evaluate:
            # convert ndarray returned by step to tf.Tensors and
            # store data of current time step of the episode
            self.agent_mem.rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
            self.agent_mem.dones.append(tf.convert_to_tensor((1 - done), dtype=tf.float32))
        return nxt_state, reward, done, info

    def train(self):
        score = 0
        state = self.env.reset()
        state = np.reshape(state, (1, -1))
        for update in range(self.params["NUM_ITERATIONS"]):
            start_time = time.time()
            # perform rollout
            for _ in range(self.params["EPISODE_STEPS"]):
                action = self._choose_action(state)
                next_state, reward, done, info = self._step(action)
                state = next_state
                score += reward[0][0]
                if done[0][0]:
                    self.score_history.append(score)
                    score = 0
                    state = self.env.reset()
                    state = np.reshape(state, (1, -1))
            # update plot
            if update % self.plot_interval == 0:
                self._plot_train_history()
            # check if the model is good enough to stop training
            if np.mean(self.score_history[-self.plot_interval:]) > self.score_threshold:
                print(f"Solved before {self.params['NUM_ITERATIONS']} iterations!")
                break
            # actually train the agent: bootstrap the value of the last state, then update
            value = self.critic(tf.convert_to_tensor(next_state, dtype=tf.float32))
            self.agent_mem.values.append(value)
            self._update_agent()
            end_time = time.time()
            # print some useful info
            print(f"Update {update}, Total timesteps completed (iteration*T): {(update + 1) * self.params['EPISODE_STEPS']}\n \
                Last episode score: {self.score_history[-1]:.3f}\n \
                Last {self.plot_interval} episodes score average: {np.mean(self.score_history[-self.plot_interval:]):.3f}\n \
                Elapsed update time: {end_time - start_time:.2f} seconds\n")
        self._save_train_history()
        self.env.close()

    def _update_agent(self):
        actor_losses, critic_losses = [], []
        returns = self._get_gae(
            self.agent_mem.rewards,
            self.agent_mem.values,
            self.agent_mem.dones
        )
        # flatten the lists of tf.Tensors collected during the rollout into arrays
        states = tf.reshape(tf.concat(self.agent_mem.states, axis=0), shape=(-1, self.obs_dim)).numpy()
        actions = tf.concat(self.agent_mem.actions, axis=0).numpy()
        returns = tf.concat(returns, axis=0).numpy()
        log_probs = tf.concat(self.agent_mem.log_probs, axis=0).numpy()
        values = tf.concat(self.agent_mem.values, axis=0).numpy()
        # values holds one extra entry (the bootstrap value of the last state), hence values[:-1]
        advantages = returns - values[:-1]
        # loop over minibatches and perform the gradient updates
        for state, action, return_, old_log_prob, old_value, advantage in self._batch_generator(
                states,
                actions,
                returns,
                log_probs,
                values,
                advantages
        ):
            with tf.GradientTape() as tape1:
                # compute policy ratio pi_new(a|s) / pi_old(a|s) in log space
                _, distribution = self.actor(state)
                current_log_prob = distribution.log_prob(action)
                ratio = tf.exp(current_log_prob - old_log_prob)
                clipped_ratio = tf.clip_by_value(ratio, 1 - self.params["CLIP"], 1 + self.params["CLIP"])
                # entropy bonus
                entropy = tf.reduce_mean(distribution.entropy())
                # compute actor loss (clipped surrogate objective)
                surrogate = ratio * advantage
                surrogate_ = clipped_ratio * advantage
                actor_loss = - tf.reduce_mean(tf.minimum(surrogate, surrogate_)) \
                             - self.params["ENTROPY_LOSS_COEF"] * entropy
            actor_losses.append(actor_loss.numpy())  # numpy() because actor_loss is a tf.Tensor
            actor_gradients = tape1.gradient(actor_loss, self.actor.trainable_weights)
            self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_weights))
            with tf.GradientTape() as tape2:
                # compute critic loss: mean squared error between returns and current value estimates
                current_value = self.critic(state)
                critic_loss = tf.reduce_mean(tf.pow(return_ - current_value, 2))
            critic_losses.append(critic_loss.numpy())  # numpy() because critic_loss is a tf.Tensor
            critic_gradients = tape2.gradient(critic_loss, self.critic.trainable_weights)
            self.critic_optimizer.apply_gradients(zip(critic_gradients, self.critic.trainable_weights))
        # clear the memory of the episode
        self.agent_mem.clear_mem()
        # update logs
        self.actor_loss_history.append(sum(actor_losses) / len(actor_losses))
        self.critic_loss_history.append(sum(critic_losses) / len(critic_losses))

    def test(self):
        self.is_evaluate = True
        # load model weights
        self.actor.load_weights(self.params["ENV_ID"] + '/' + self.params["EXP_NAME"] + "/actor")
        self.critic.load_weights(self.params["ENV_ID"] + '/' + self.params["EXP_NAME"] + "/critic")
        ### ROLLOUT ###
        for _ in range(5):  # repeat rollout 5 times
            state = self.env.reset()
            state = np.reshape(state, (1, -1))
            done = False
            while not done:
                self.env.render()
                action = self._choose_action(state)
                next_state, _, done, info = self.env.step(action)
                state = next_state
                state = np.reshape(state, (1, -1))
        self.env.close()

    def _plot_train_history(self):
        data = [
            self.score_history,
            self.actor_loss_history,
            self.critic_loss_history]
        labels = [
            f"score: {np.mean(self.score_history[-self.plot_interval:]):.3f}",
            f"actor loss: {np.mean(self.actor_loss_history[-self.plot_interval:]):.4f}",
            f"critic loss: {np.mean(self.critic_loss_history[-self.plot_interval:]):.4f}"
        ]
        clear_output(True)
        with plt.style.context("seaborn-dark-palette"):
            fig, axes = plt.subplots(3, 1, figsize=(6, 8))
            for i, ax in enumerate(axes):
                ax.plot(data[i], c="crimson")
                ax.set_title(labels[i])
            plt.tight_layout()
            plt.show()

    def _save_train_history(self):
        # save actor and critic weights
        self.actor.save_weights(self.params["ENV_ID"] + '/' + self.params["EXP_NAME"] + "/actor")
        self.critic.save_weights(self.params["ENV_ID"] + '/' + self.params["EXP_NAME"] + "/critic")
        # save loss and score histories as csv files
        pd.DataFrame(
            {"actor_loss": self.actor_loss_history,
             "critic_loss": self.critic_loss_history}
        ).to_csv("loss_logs.csv")
        pd.DataFrame(
            {"scores": self.score_history}
        ).to_csv("score_logs.csv")
        # save the run configuration into a json file
        run_config = {
            "environment id": self.params["ENV_ID"],
            "experiment name": self.params["EXP_NAME"],
            "actor hidden layers": self.params["ACTOR_HU"],
            "critic hidden layers": self.params["CRITIC_HU"],
            "actor learning rate": self.params["ACTOR_LR"],
            "critic learning rate": self.params["CRITIC_LR"],
            "num iterations": self.params["NUM_ITERATIONS"],
            "duration of an episode": self.params["EPISODE_STEPS"],
            "num train epochs": self.params["NUM_TRAIN_EPOCHS"],
            "minibatch size": self.params["MINIBATCH_SIZE"],
            "discount factor": self.params["GAMMA"],
            "clip parameter": self.params["CLIP"],
            "value loss coefficient": self.params["VALUE_LOSS_COEF"],
            "entropy loss coefficient": self.params["ENTROPY_LOSS_COEF"],
            "gae lambda": self.params["LAMBDA"],
        }
        j = json.dumps(run_config)
        with open("run_config.json", 'w') as f:
            f.write(j)

    def _get_gae(self, rewards, values, dones):
        """Compute the GAE-based return (advantage estimate + value) for each timestep of the episode
        :param rewards: list of rewards
        :param values: list of value-function estimates (one extra entry for the bootstrap value)
        :param dones: list of (1 - done) terminal-state masks
        :return: list of returns (GAE advantage + value) per timestep
        """
        # GAE recursion (dones already stores 1 - done):
        #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        #   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        # and the critic target is return_t = A_t + V(s_t)
        gae = 0
        returns = []
        for i in reversed(range(len(rewards))):
            delta = rewards[i] + self.params["GAMMA"] * values[i + 1] * dones[i] - values[i]
            gae = delta + self.params["GAMMA"] * self.params["LAMBDA"] * dones[i] * gae
            returns.insert(0, gae + values[i])  # traversing in reverse, so prepend to keep chronological order
        return returns

    def _batch_generator(self,
                         states,
                         actions,
                         returns,
                         log_probs,
                         values,
                         advantages):
        """Generate minibatches for training the agent. Note that the memory must be sampled randomly
        """
        data_len = len(states)  # we are interested in the first dimension
        for _ in range(self.params["NUM_TRAIN_EPOCHS"]):
            for _ in range(data_len // self.params["MINIBATCH_SIZE"]):
                # indices are drawn with replacement (np.random.choice default)
                idxs = np.random.choice(data_len, self.params["MINIBATCH_SIZE"])
                yield states[idxs, :], actions[idxs], returns[idxs], log_probs[idxs], values[idxs], advantages[idxs]

    # TODO implement _get_success_rate
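

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original module).
# It assumes a classic Gym environment (gym <= 0.25) whose step() returns the
# 4-tuple (obs, reward, done, info) that Agent._step expects; the environment
# id, hyperparameter values, and paths below are placeholders, not the
# author's actual configuration.
if __name__ == "__main__":
    import gym

    params = {
        "ENV_ID": "Pendulum-v1",      # placeholder environment
        "EXP_NAME": "ppo_demo",       # placeholder experiment name
        "ACTOR_HU": [64, 64],
        "CRITIC_HU": [64, 64],
        "ACTOR_LR": 3e-4,
        "CRITIC_LR": 1e-3,
        "NUM_ITERATIONS": 200,
        "EPISODE_STEPS": 2048,
        "NUM_TRAIN_EPOCHS": 10,
        "MINIBATCH_SIZE": 64,
        "GAMMA": 0.99,
        "CLIP": 0.2,
        "VALUE_LOSS_COEF": 0.5,
        "ENTROPY_LOSS_COEF": 0.01,
        "LAMBDA": 0.95,
    }
    env = gym.make(params["ENV_ID"])
    agent = Agent(
        params=params,
        env=env,
        obs_dim=env.observation_space.shape[0],
        act_dim=env.action_space.shape[0],
        score_thresh=-200.0,          # stop once the rolling score average beats this
        is_evaluate=False,
        plot_interval=10,
        train_history_path="./train_history",
    )
    agent.train()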