cliff_walk_sarsa.py
# -*- coding:utf-8 -*-
# Train Sarsa in cliff-walking environment
import math, os, time, sys
import numpy as np
import random
import gym
from agent import SarsaAgent
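# (agent.py is not included here; a rough, assumption-labeled sketch of what SarsaAgent might look like is appended at the end of this file)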
##### START CODING HERE #####
# This code block is optional. You can import other libraries or define your utility functions if necessary.
from torch.utils.tensorboard import SummaryWriter
lr = 0.1         # learning rate for the SARSA update
gamma = 0.9      # discount factor
epsilon = 1.0    # initial exploration rate of the epsilon-greedy policy
e_decay = 0.008  # per-episode epsilon decay parameter (applied by agent.epsilon_decay())
writer = SummaryWriter(log_dir='./sarsaruns/lr={}-gamma={}-epsilon={}-e_decay={}'.format(lr,gamma,epsilon,e_decay))
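# training curves can be inspected with: tensorboard --logdir ./sarsaruns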
def begin_video(episode):
    # record videos only for the last 50 of the 1000 training episodes
    return episode >= 950
##### END CODING HERE #####
# construct the environment
env = gym.make("CliffWalking-v0")
env = gym.wrappers.RecordVideo(env, './video', episode_trigger=begin_video)
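# RecordVideo saves videos of the triggered episodes (see begin_video above) to ./video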
# get the size of the action space
num_actions = env.action_space.n
all_actions = np.arange(num_actions)
# set random seeds to make the results reproducible
RANDOM_SEED = 0
env.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
####### START CODING HERE #######
# construct the intelligent agent.
agent = SarsaAgent(all_actions, lr, gamma, epsilon, e_decay)
# start training
for episode in range(1000):
    # record the cumulative reward of this episode
    episode_reward = 0
    # reset the environment
    s = env.reset()
    # render the env; remove all render() calls to turn off the GUI and speed up training
    # env.render()
    # choose the first action
    a = agent.choose_action(s)
    # the agent interacts with the environment for at most 500 steps
    for it in range(500):
        # take the action
        s_, r, isdone, info = env.step(a)
        # env.render()
        # choose the next action (SARSA is on-policy: it learns from the action it will actually take)
        a_ = agent.choose_action(s_)
        # update the episode reward
        episode_reward += r
        # the agent learns from the transition (s, a, r, s', a')
        agent.learn(s, a, s_, a_, r)
        s = s_
        a = a_
        if isdone:
            time.sleep(0.1)
            break
    print('episode:', episode, 'episode_reward:', episode_reward, 'epsilon:', agent.epsilon)
    agent.epsilon_decay()
    writer.add_scalar('episode reward', episode_reward, episode)
    writer.add_scalar('epsilon', agent.epsilon, episode)
print('\ntraining over\n')
# close the render window after training.
env.close()
writer.close()
####### END CODING HERE #######
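
##### REFERENCE SKETCH #####
# agent.py is not shown in this file. The class below is only a minimal sketch of
# what SarsaAgent *might* look like, reconstructed from the calls made above: the
# constructor arguments and the choose_action / learn / epsilon_decay signatures
# are taken from this script, while the tabular Q-array, the epsilon-greedy rule,
# the linear decay schedule, and the 0.01 exploration floor are all assumptions.
class SarsaAgentSketch:
    def __init__(self, all_actions, lr, gamma, epsilon, e_decay):
        self.all_actions = all_actions
        self.lr = lr                  # learning rate
        self.gamma = gamma            # discount factor
        self.epsilon = epsilon        # exploration rate
        self.e_decay = e_decay        # per-episode decay amount (assumed linear)
        # CliffWalking-v0 has a 4x12 grid, i.e. 48 discrete states
        self.Q = np.zeros((48, len(all_actions)))

    def choose_action(self, s):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.all_actions)
        return int(np.argmax(self.Q[s]))

    def learn(self, s, a, s_, a_, r):
        # on-policy SARSA update: Q(s,a) += lr * (r + gamma * Q(s',a') - Q(s,a))
        td_target = r + self.gamma * self.Q[s_, a_]
        self.Q[s, a] += self.lr * (td_target - self.Q[s, a])

    def epsilon_decay(self):
        # assumed schedule: linear decay with a small exploration floor
        self.epsilon = max(self.epsilon - self.e_decay, 0.01)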