policy_search.py
import numpy as np
from task import Task


class PolicySearch_Agent():
    def __init__(self, task):
        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Weights for a simple linear policy: state_space x action_space,
        # scaled so the initial actions fall in a reasonable range
        self.w = np.random.normal(
            size=(self.state_size, self.action_size),
            scale=(self.action_range / (2 * self.state_size)))

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        # Clear per-episode accumulators and reset the environment
        self.total_reward = 0.0
        self.count = 0
        state = self.task.reset()
        return state

    def step(self, reward, done):
        # Save experience / reward
        self.total_reward += reward
        self.count += 1

        # Learn, if at end of episode
        if done:
            self.learn()

    def act(self, state):
        # Choose action based on given state and policy
        action = np.dot(state, self.w)  # simple linear policy
        return action

    def learn(self):
        # Learn by random policy search (hill climbing with adaptive noise),
        # using the average reward per step as the episode score
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            # Improvement: keep these weights and shrink the search radius
            self.best_score = self.score
            self.best_w = self.w
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        else:
            # No improvement: revert to the best weights and widen the search
            self.w = self.best_w
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape)  # equal noise in all directions
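

# Minimal usage sketch: a training loop driving the agent, assuming (as in
# the accompanying task.py) that Task() is constructible with its defaults
# and that task.step(action) returns (next_state, reward, done).
if __name__ == "__main__":
    task = Task()
    agent = PolicySearch_Agent(task)
    num_episodes = 100  # illustrative episode budget

    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)
            state, reward, done = task.step(action)
            agent.step(reward, done)
        # agent.score / agent.best_score are set by learn() at episode end
        print("Episode {:3d}: score = {:8.3f} (best = {:8.3f}), noise_scale = {:.3f}".format(
            i_episode, agent.score, agent.best_score, agent.noise_scale))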