Agent.py
import numpy as np
import torch
from networks import ActorCritic, LinearModel
import os


class Agent(torch.nn.Module):
    def __init__(self, num_actions, learning_rate, rollout_horizon=2048, entropy_c2=0.01, vf_c1=1,
                 gae_lambda=0.95, discount_gamma=0.99, epsilon_clip=0.2, batch_size=32, n_epochs=5):
        super(Agent, self).__init__()
        self.use_cuda = torch.cuda.is_available()  # autodetect CUDA
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        print('Device:', self.device)
        # Create a single model that contains both the actor and the critic;
        # .to(device) moves and/or casts its parameters and buffers
        self.model = ActorCritic(num_actions).to(self.device)
        # ========= HYPERPARAMETERS =========
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)  # Adam optimizer
        self.batch_size = batch_size
        self.entropy_c2 = entropy_c2            # entropy bonus coefficient (c2 in the PPO paper)
        self.vf_c1 = vf_c1                      # value-function loss coefficient (c1 in the PPO paper)
        self.n_epochs = n_epochs                # optimization epochs per rollout
        self.rollout_horizon = rollout_horizon  # number of steps collected before each update
        self.gae_lambda = gae_lambda
        self.discount_gamma = discount_gamma
        self.epsilon_clip = epsilon_clip        # clipping range for the probability ratio
        self.learning_rate = learning_rate
        # Initialize agent memory
        self.observations, self.values, self.rewards, self.dones, self.log_probs, self.actions = [], [], [], [], [], []

    def learn(self):
        """Update the agent from the stored rollout: compute generalized advantage
        estimates (GAE), then optimize the clipped policy loss, the value loss and
        the entropy bonus over several epochs of minibatches."""
        gae = 0
        returns = []
        # Generalized advantage estimation accumulates the TD errors backwards in time,
        # following the equations in the PPO paper (https://arxiv.org/pdf/1707.06347.pdf):
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        #   A_t = delta_t + (gamma * lambda) * delta_{t+1} + (gamma * lambda)^2 * delta_{t+2} + ...
        for step in reversed(range(len(self.rewards))):  # walk the rollout from the last step backwards
            delta = self.rewards[step] + self.discount_gamma * self.values[step + 1] * (1 - self.dones[step]) - self.values[step]
            gae = delta + self.discount_gamma * self.gae_lambda * (1 - self.dones[step]) * gae
            returns.insert(0, gae + self.values[step])
        # Drop the bootstrap value stored for the final observation so values aligns with returns
        self.values = self.values[:-1]
        returns = torch.cat(returns).detach()  # concatenate along the existing dimension and detach so no gradients flow through the targets
        self.log_probs = torch.cat(self.log_probs).detach()
        self.values = torch.cat(self.values).detach()
        self.observations = torch.cat(self.observations)
        self.actions = torch.cat(self.actions)
        advantages = returns - self.values  # advantage estimates: A_t = R_t - V(s_t)
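        # A quick numerical illustration of the recursion above (the numbers are made up for
        # illustration, not taken from a real run): with gamma = 0.99, lambda = 0.95,
        # rewards = [1, 1], values = [0.5, 0.6, 0.7] (the last entry being the bootstrap value)
        # and no terminal steps:
        #   delta_1 = 1 + 0.99 * 0.7 - 0.6 = 1.093  ->  A_1 = 1.093,                              R_1 = A_1 + 0.6 = 1.693
        #   delta_0 = 1 + 0.99 * 0.6 - 0.5 = 1.094  ->  A_0 = 1.094 + 0.99 * 0.95 * 1.093 = 2.122, R_0 = A_0 + 0.5 = 2.622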
        # Build a dataset over the whole rollout so minibatches can be sampled for several epochs
        dataset = torch.utils.data.TensorDataset(self.log_probs, self.observations, self.actions, returns, advantages)
        for _ in range(self.n_epochs):
            data_loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
            for minibatch in data_loader:
                log_probs_batch, observations_batch, actions_batch, returns_batch, advantages_batch = minibatch
                # Normalize advantages at the minibatch level
                advantages_batch -= advantages_batch.mean()
                advantages_batch /= (advantages_batch.std() + 1e-8)
                dist, values = self.model(observations_batch)
                # Entropy of the action probability distribution (encourages exploration)
                entropy = dist.entropy().mean()
                actions_batch = actions_batch.reshape(1, len(actions_batch))  # arrange the actions as one row so dist.log_prob yields one log-probability per sample
                new_log_probs = dist.log_prob(actions_batch)
                new_log_probs = new_log_probs.reshape(new_log_probs.shape[1], 1)  # back to a column so the ratio lines up with log_probs_batch
                ratio = (new_log_probs - log_probs_batch).exp()  # probability ratio r_t = pi_new(a|s) / pi_old(a|s)
                # Clipped surrogate objective from the PPO paper:
                #   L_CLIP = E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ]
                surr1 = ratio * advantages_batch
                surr2 = torch.clamp(ratio, 1.0 - self.epsilon_clip, 1.0 + self.epsilon_clip) * advantages_batch
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (returns_batch - values).pow(2).mean()
                # Combined loss: minimizing it maximizes L_CLIP - c1 * L_VF + c2 * entropy
                loss = self.vf_c1 * critic_loss + actor_loss - self.entropy_c2 * entropy
                self.optimizer.zero_grad()  # in PyTorch, gradients must be zeroed before each backward pass
                loss.backward()
                self.optimizer.step()
        self.clear_memory()

    def store_rollout(self, observation, action, log_prob, value, reward, done):
        """Store one transition of the current rollout."""
        self.observations.append(observation)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.values.append(value)
        self.rewards.append(reward)
        self.dones.append(done)

    def save_models(self, episode):
        """Save a checkpoint of the actor-critic model."""
        file_path = f'./checkpoints/actor-critic/{episode}/model.pth'
        # Create the checkpoint directory if it doesn't exist
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        # Save the current state_dict to a file
        torch.save(self.model.state_dict(), file_path)

    def restore_models(self, episode):
        """Load a previously saved checkpoint into the model."""
        state_dict = torch.load(f'./checkpoints/actor-critic/{episode}/model.pth')
        # Load the state_dict into the model
        self.model.load_state_dict(state_dict)

    def clear_memory(self):
        """Clear the rollout memory after each on-policy update."""
        self.observations, self.values, self.rewards, self.dones, self.log_probs, self.actions = [], [], [], [], [], []
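

# A minimal usage sketch. It assumes a Gym-style environment with a discrete action space
# and that ActorCritic maps a batch of observations to (action distribution, state value),
# which is how learn() above consumes the model; the environment name, tensor shapes and
# update schedule are illustrative placeholders, not the project's actual training loop.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v1")
    agent = Agent(num_actions=env.action_space.n, learning_rate=2.5e-4)

    observation = env.reset()
    for update in range(10):  # a few PPO updates, just to show the control flow
        for _ in range(agent.rollout_horizon):
            obs_tensor = torch.FloatTensor(observation).unsqueeze(0).to(agent.device)
            with torch.no_grad():
                dist, value = agent.model(obs_tensor)
            action = dist.sample()
            next_observation, reward, done, _ = env.step(action.item())
            # Store tensors shaped so that learn() can torch.cat them along dim 0
            agent.store_rollout(obs_tensor,
                                action,                               # sampled action, shape (1,)
                                dist.log_prob(action).reshape(1, 1),  # old log pi(a|s)
                                value,                                # critic estimate V(s)
                                torch.FloatTensor([[reward]]).to(agent.device),
                                torch.FloatTensor([[float(done)]]).to(agent.device))
            observation = env.reset() if done else next_observation
        # learn() expects one extra bootstrap value for the observation after the last step
        with torch.no_grad():
            _, next_value = agent.model(torch.FloatTensor(observation).unsqueeze(0).to(agent.device))
        agent.values.append(next_value)
        agent.learn()
        agent.save_models(update)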