double dqn.py
import random

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from utils import *  # expected to provide ReplayBuffer and plotProgress

# Hyperparameters
ACTIONS_DIM = 2
OBSERVATIONS_DIM = 4
MAX_ITERATIONS = 10**6
LEARNING_RATE = 1e-2
NUM_EPOCHS = 50
GAMMA = 0.99
REPLAY_MEMORY_SIZE = 1000
NUM_EPISODES = 10000
TARGET_UPDATE_FREQ = 100
MINIBATCH_SIZE = 100
RANDOM_ACTION_DECAY = 0.99
INITIAL_RANDOM_ACTION = 1
#We'll be using this array for plotting our rewards
reward_progress = []
# Computes model(observation), returning a [batch, ACTIONS_DIM] tensor
def get_out_tensor(model, observation):
    np_obs = np.reshape(observation, [-1, OBSERVATIONS_DIM])
    return model(torch.from_numpy(np_obs).float())
# Runs one gradient step on a batch of observations and their targets
def train(model, observations, targets, criterion, optimizer):
    optimizer.zero_grad()
    out_tensor = get_out_tensor(model, observations)
    target_tensor = torch.tensor(np.array(targets), dtype=torch.float32)
    loss = criterion(out_tensor, target_tensor)
    loss.backward()
    optimizer.step()
# Q-network: maps an observation to one Q-value per action
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(OBSERVATIONS_DIM, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, ACTIONS_DIM),
        )

    def forward(self, x):
        return self.model(x)
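# Double DQN target rule (implemented in update_action below):
#   a* = argmax_a Q_online(s', a)        -- the online network selects the action
#   y  = r + GAMMA * Q_target(s', a*)    -- the target network evaluates it
# Decoupling selection from evaluation reduces the overestimation bias of
# vanilla DQN, which uses max_a Q_target(s', a) for both steps.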
# Builds Double DQN targets from sampled transitions and trains the online model
def update_action(action_model, target_model, sample_transitions, criterion, optimizer):
    random.shuffle(sample_transitions)
    batch_observations = []
    batch_targets = []
    for sample_transition in sample_transitions:
        old_observation, action, reward, observation = sample_transition
        # Start from the online network's current estimates so that only the
        # taken action's target is changed
        targets = np.reshape(get_out_tensor(action_model, old_observation).detach().numpy(), ACTIONS_DIM)
        targets[action] = reward
        if observation is not None:
            # Online network selects the next action...
            online_q = get_out_tensor(action_model, observation).detach().numpy()
            new_action = np.argmax(online_q[0])
            # ...and the target network evaluates it (Double DQN)
            target_q = get_out_tensor(target_model, observation).detach().numpy()
            targets[action] += GAMMA * target_q[0, new_action]
        batch_observations.append(old_observation)
        batch_targets.append(targets)
    train(action_model, batch_observations, batch_targets, criterion, optimizer)
def main():
    random_action_probability = INITIAL_RANDOM_ACTION
    # Replay memory
    replay = ReplayBuffer(REPLAY_MEMORY_SIZE)
    # Online model, target model, loss function, optimizer
    model = Net()
    target_model = Net()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Create the environment
    env = gym.make('CartPole-v0')
    # Accumulates reward so we can plot it later
    total_reward = 0
    for episode in range(NUM_EPISODES):
        # Reset the environment
        observation = env.reset()
        # Collect experiences
        for iteration in range(MAX_ITERATIONS):
            # Epsilon-greedy action selection with decay, floored at 0.1
            random_action_probability *= RANDOM_ACTION_DECAY
            random_action_probability = max(random_action_probability, 0.1)
            old_observation = observation
            if np.random.random() < random_action_probability:
                action = np.random.choice(ACTIONS_DIM)
            else:
                q_values = get_out_tensor(model, observation).detach().numpy()
                action = np.argmax(q_values)
            observation, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                print("Episode", episode, "Score:", total_reward)
                reward_progress.append(total_reward)
                plotProgress(reward_progress)
                # Penalize termination and store the transition with no next state
                reward = -200
                total_reward = 0
                replay.add(old_observation, action, reward, None)
                break
            replay.add(old_observation, action, reward, observation)
            # Once we have enough experiences, train the online model
            if replay.size() >= MINIBATCH_SIZE:
                sample_transitions = replay.sample(MINIBATCH_SIZE)
                update_action(model, target_model, sample_transitions, criterion, optimizer)
            # Sync the target network only every TARGET_UPDATE_FREQ steps so it
            # stays decoupled from the online network between syncs
            if iteration % TARGET_UPDATE_FREQ == 0:
                target_model.load_state_dict(model.state_dict())


if __name__ == '__main__':
    main()
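The `from utils import *` line above assumes a companion `utils.py` that provides `ReplayBuffer` and `plotProgress`; that file is not shown on this page. The sketch below is one minimal implementation compatible with how this script calls them (an `add`/`size`/`sample` buffer and a non-blocking plotting helper), not the repository's actual utils.py.

# utils.py (assumed sketch)
import random
from collections import deque

import matplotlib.pyplot as plt  # assumed plotting dependency


class ReplayBuffer:
    """Fixed-size FIFO store of (old_observation, action, reward, next_observation) tuples."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, old_observation, action, reward, observation):
        self.buffer.append((old_observation, action, reward, observation))

    def size(self):
        return len(self.buffer)

    def sample(self, batch_size):
        # Uniform sampling without replacement
        return random.sample(list(self.buffer), batch_size)


def plotProgress(reward_progress):
    # Redraw the episode-reward curve without blocking training
    plt.clf()
    plt.plot(reward_progress)
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.pause(0.001)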