train_best_A2C.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 2 16:03:09 2021
@author: tennismichel
"""
# import os
import numpy as np
import matplotlib.pyplot as plt
import gym
import torch
# from networks import NNetwork, NeuralNetworkPolicy
from agents import Q_Agent, Q_DQN_Agent, SARSA_Agent, SARSA_DQN_Agent  # not used in this script
from agents_nnp import A2C_Agent, MC_PolGrad_Agent
#%% ENVIRONMENT
env = gym.make('Acrobot-v1')
# env = gym.make('MountainCar-v0')
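# Acrobot-v1 yields a reward of -1 per step until the goal is reached
# (episodes are capped at 500 steps), so returns closer to 0 are better.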
#%% SEED
random_seed = 1234
np.random.seed(random_seed)
torch.manual_seed(random_seed)
env.seed(random_seed)
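# note: env.seed() follows the classic gym API used here; newer gym/gymnasium
# releases seed the environment via env.reset(seed=random_seed) instead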
#%% TRAINING
training_results = list() # A list for storing the hyperparameters and the corresponding results
MAX_EPISODES = 4000   # maximum number of training episodes
MAX_STEPS = 500       # maximum steps per episode
LOG_INTERVAL = 100    # episodes between progress logs
LR_POL = 0.001        # learning rate of the policy optimizer
GAMMA = 0.99          # discount factor
DROPOUT = 0           # dropout probability of the policy network
HIDDEN_DIM = 512      # hidden-layer dimension of the policy network
#%% Train A2C-Agent ###
agent_results = list()
hyperparam_dict = {'name': 'A2C (DROPOUT:0, NN$_{dim}$:512)'}
agent = A2C_Agent(env, num_episodes=MAX_EPISODES, num_steps=MAX_STEPS, learning_rate=LR_POL,
                  gamma=GAMMA, hidden_dim=HIDDEN_DIM, dropout=DROPOUT, log_interval=LOG_INTERVAL)
ep_rewards, running_rewards = agent.train()
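# ep_rewards: total reward per episode; running_rewards: exponential moving
# average of the episode rewards (training stops once it reaches about -75,
# cf. the plot legend below)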
# agent_results.append(running_rewards)
# agent_results = np.array(agent_results)
# rewards_mu = agent_results.mean(axis=0)
# rewards_sigma = agent_results.std(axis=0)
training_results.append((hyperparam_dict, ep_rewards, running_rewards))
#%% VISUALIZATION
plt.rcParams.update({'font.size': 9})
width = 170/25.4  # 170 mm ≈ 6.7 in (figure width)
# FIGSIZE = (width,width*1/3)
FIGSIZE = (width,width*3/8)
# Plot the results
fig, ax = plt.subplots(figsize=FIGSIZE)
i = 1
for result in training_results:
    # i += 1
    # if not i%2==0:
    #     continue
    hp = result[0]
    ep_reward = result[1]
    running_reward = result[2]
    plt.plot(range(len(ep_reward)), ep_reward, lw=1.2, label='A2C (DROPOUT: 0, NN$_{dim}$: 512)')
    plt.plot(range(len(running_reward)), running_reward, lw=1.2, label='EMA (stopped @ -75)')
    # ax.fill_between(range(len(mu)), mu+sigma, mu-sigma, alpha=0.5)
    # plt.plot(range(len(ep_rewards)), ep_rewards, lw=2, color="red", label=hp['name'])
    # title_str = "Acrobot-v1 ($hiddenDim_{qnet}$: " + str(HIDDEN_DIM_QNET) + ", $hiddenDim_{pol}$: " + str(HIDDEN_DIM_POL) + ")"
    # plt.title(title_str)
ax.set_xlim(0,2500)
plt.grid()
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.legend(loc='lower right', ncol=1) # ncol=1
fig.tight_layout()
plt.show()
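# save the figure as a PDF (assumes an 'images' directory exists next to this script)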
fig.savefig('images/A2C_agent_232.pdf')
#%% Save the agent's policy network
agent.save_pol_network(save_dir="models", file_name="A2C_v2.pt")