-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
Copy path04_frozenlake_nonslippery.py
executable file
·129 lines (107 loc) · 4.25 KB
/
04_frozenlake_nonslippery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
import random
import gym
import gym.spaces
import gym.wrappers
import gym.envs.toy_text.frozen_lake
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter
import torch
import torch.nn as nn
import torch.optim as optim
HIDDEN_SIZE = 128
BATCH_SIZE = 100
PERCENTILE = 30
GAMMA = 0.9
class DiscreteOneHotWrapper(gym.ObservationWrapper):
def __init__(self, env):
super(DiscreteOneHotWrapper, self).__init__(env)
assert isinstance(env.observation_space, gym.spaces.Discrete)
self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), dtype=np.float32)
def observation(self, observation):
res = np.copy(self.observation_space.low)
res[observation] = 1.0
return res
class Net(nn.Module):
def __init__(self, obs_size, hidden_size, n_actions):
super(Net, self).__init__()
self.net = nn.Sequential(
nn.Linear(obs_size, hidden_size),
nn.ReLU(),
nn.Linear(hidden_size, n_actions)
)
def forward(self, x):
return self.net(x)
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])
def iterate_batches(env, net, batch_size):
batch = []
episode_reward = 0.0
episode_steps = []
obs = env.reset()
sm = nn.Softmax(dim=1)
while True:
obs_v = torch.FloatTensor([obs])
act_probs_v = sm(net(obs_v))
act_probs = act_probs_v.data.numpy()[0]
action = np.random.choice(len(act_probs), p=act_probs)
next_obs, reward, is_done, _ = env.step(action)
episode_reward += reward
episode_steps.append(EpisodeStep(observation=obs, action=action))
if is_done:
batch.append(Episode(reward=episode_reward, steps=episode_steps))
episode_reward = 0.0
episode_steps = []
next_obs = env.reset()
if len(batch) == batch_size:
yield batch
batch = []
obs = next_obs
def filter_batch(batch, percentile):
disc_rewards = list(map(lambda s: s.reward * (GAMMA ** len(s.steps)), batch))
reward_bound = np.percentile(disc_rewards, percentile)
train_obs = []
train_act = []
elite_batch = []
for example, discounted_reward in zip(batch, disc_rewards):
if discounted_reward > reward_bound:
train_obs.extend(map(lambda step: step.observation, example.steps))
train_act.extend(map(lambda step: step.action, example.steps))
elite_batch.append(example)
return elite_batch, train_obs, train_act, reward_bound
if __name__ == "__main__":
random.seed(12345)
env = gym.envs.toy_text.frozen_lake.FrozenLakeEnv(is_slippery=False)
env = gym.wrappers.TimeLimit(env, max_episode_steps=100)
env = DiscreteOneHotWrapper(env)
# env = gym.wrappers.Monitor(env, directory="mon", force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
net = Net(obs_size, HIDDEN_SIZE, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)
writer = SummaryWriter(comment="-frozenlake-nonslippery")
full_batch = []
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
reward_mean = float(np.mean(list(map(lambda s: s.reward, batch))))
full_batch, obs, acts, reward_bound = filter_batch(full_batch + batch, PERCENTILE)
if not full_batch:
continue
obs_v = torch.FloatTensor(obs)
acts_v = torch.LongTensor(acts)
full_batch = full_batch[-500:]
optimizer.zero_grad()
action_scores_v = net(obs_v)
loss_v = objective(action_scores_v, acts_v)
loss_v.backward()
optimizer.step()
print("%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f, batch=%d" % (
iter_no, loss_v.item(), reward_mean, reward_bound, len(full_batch)))
writer.add_scalar("loss", loss_v.item(), iter_no)
writer.add_scalar("reward_mean", reward_mean, iter_no)
writer.add_scalar("reward_bound", reward_bound, iter_no)
if reward_mean > 0.8:
print("Solved!")
break
writer.close()