import numpy as np

# our helpers
from navigation.dqn.utils import replaybuffer
from navigation.dqn.utils import prioritybuffer

# debugging helpers
from IPython.core.debugger import set_trace

class IDqnAgent( object ) :

    def __init__( self, agentConfig, modelConfig, modelBuilder, backendInitializer ) :
        """Constructs a generic Dqn agent, given configuration information

        Args:
            agentConfig (DqnAgentConfig)  : config object with agent parameters
            modelConfig (DqnModelConfig)  : config object with model parameters
            modelBuilder (function)       : factory function to instantiate the model
            backendInitializer (function) : function called to initialize specifics of each DL library
        """
        super( IDqnAgent, self ).__init__()

        # environment state and action space info
        self._stateDim = agentConfig.stateDim
        self._nActions = agentConfig.nActions

        # random seed
        self._seed = agentConfig.seed
        np.random.seed( self._seed )

        # parameters for the epsilon schedules (linear and geometric)
        self._epsStart = agentConfig.epsilonStart
        self._epsEnd = agentConfig.epsilonEnd
        self._epsSteps = agentConfig.epsilonSteps
        self._epsDecay = agentConfig.epsilonDecay
        self._epsSchedule = agentConfig.epsilonSchedule
        self._epsilon = self._epsStart

        # learning rate and related parameters
        self._lr = agentConfig.lr
        self._minibatchSize = agentConfig.minibatchSize
        self._learningStartsAt = agentConfig.learningStartsAt
        self._learningUpdateFreq = agentConfig.learningUpdateFreq
        self._learningUpdateTargetFreq = agentConfig.learningUpdateTargetFreq
        self._learningMaxSteps = agentConfig.learningMaxSteps

        # size of the replay buffer
        self._replayBufferSize = agentConfig.replayBufferSize

        # discount factor gamma
        self._gamma = agentConfig.discount

        # tau factor for soft-updates of the target model
        self._tau = agentConfig.tau

        # some counters used by the agent's logic
        self._istep = 0
        self._iepisode = 0

        # whether or not we are using a convolutional-based model
        self._useConvolutionalBasedModel = agentConfig.useConvolutionalBasedModel

        # improvements to vanilla dqn
        self._useDoubleDqn = agentConfig.useDoubleDqn
        self._usePrioritizedExpReplay = agentConfig.usePrioritizedExpReplay
        self._useDuelingDqn = agentConfig.useDuelingDqn

        # copy some parameters from the agent config into the model config
        modelConfig._lr = self._lr
        modelConfig._useImpSampling = self._usePrioritizedExpReplay
        modelConfig._useConvolutionalBasedModel = self._useConvolutionalBasedModel

        # create both actor and target models accordingly
        self._qmodel_actor = modelBuilder( 'actor_model', modelConfig, True )
        self._qmodel_target = modelBuilder( 'target_model', modelConfig, False )

        # initialize backend-specific functionality
        _initInfo = backendInitializer()

        # initialize both actor and target models
        self._qmodel_actor.initialize( _initInfo )
        self._qmodel_target.initialize( _initInfo )

        # start the target model from the actor model
        self._qmodel_target.clone( self._qmodel_actor, tau = 1.0 )

        # replay buffer (prioritized or uniform)
        if self._usePrioritizedExpReplay :
            self._rbuffer = prioritybuffer.DqnPriorityBuffer( self._replayBufferSize,
                                                              self._seed )
        else :
            self._rbuffer = replaybuffer.DqnReplayBuffer( self._replayBufferSize,
                                                          self._seed )

        # states (current and next) for the model representation
        self._currState = None
        self._nextState = None

        # agent action descriptions
        self._actionsDescs = ['???'] * self._nActions

        self._printConfig()

    def save( self, filename ) :
        """Saves the learned (actor) model to disk

        Args:
            filename (str) : filepath where we want to save our model
        """
        if self._qmodel_actor :
            self._qmodel_actor.save( filename )

    def load( self, filename ) :
        """Loads a trained model from disk

        Args:
            filename (str) : filepath where we want to load our model from
        """
        if self._qmodel_actor :
            self._qmodel_actor.load( filename )
            self._qmodel_target.clone( self._qmodel_actor, tau = 1.0 )

    def act( self, state, inference = False ) :
        """Returns an action to take from the given state

        Args:
            state (object)   : state|observation coming from the simulator
            inference (bool) : whether or not we are in inference mode

        Returns:
            int : action to take (assuming a discrete action space)
        """
        if inference or np.random.rand() > self._epsilon :
            if self._useConvolutionalBasedModel :
                _processedState = self._preprocess( state )
                # add a batch dimension before evaluating the conv-based model
                return np.argmax( self._qmodel_actor.eval( _processedState.reshape( (1,) + _processedState.shape ) ) )
            else :
                return np.argmax( self._qmodel_actor.eval( self._preprocess( state ) ) )
        else :
            return np.random.choice( self._nActions )

    def step( self, transition ) :
        """Does one step of the learning algorithm, from Mnih et al.
        https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
        """
        # grab information from this transition
        _s, _a, _snext, _r, _done = transition

        # preprocess the raw states
        self._nextState = self._preprocess( _snext )
        if self._currState is None :
            self._currState = self._preprocess( _s ) # for the first step

        # store the transition in the replay buffer
        self._rbuffer.add( self._currState, _a, self._nextState, _r, _done )

        # check whether we can do a training step
        if self._istep > self._learningStartsAt and \
           self._istep % self._learningUpdateFreq == 0 and \
           len( self._rbuffer ) >= self._minibatchSize :
            self._learn()

        # update the parameters of the target model (every learningUpdateTargetFreq steps)
        if self._istep > self._learningStartsAt and \
           self._istep % self._learningUpdateTargetFreq == 0 :
            self._qmodel_target.clone( self._qmodel_actor, tau = self._tau )

        # save the next state (where we currently are in the environment) as the current state
        self._currState = self._nextState

        # update the agent's step counter
        self._istep += 1
        # if we finished an episode, update the episode counter and reset the
        # stored states (not resetting them was the source of an earlier bug)
        if _done :
            self._iepisode += 1
            self._currState = None
            self._nextState = None

        # check the epsilon update schedule and update accordingly
        if self._epsSchedule == 'linear' :
            # update epsilon using a linear schedule
            _epsFactor = 1. - ( max( 0, self._istep - self._learningStartsAt ) / self._epsSteps )
            _epsDelta = max( 0, ( self._epsStart - self._epsEnd ) * _epsFactor )
            self._epsilon = self._epsEnd + _epsDelta
        elif self._epsSchedule == 'geometric' :
            if _done :
                # update epsilon with a geometric decay given by a decay factor
                _epsFactor = self._epsDecay if self._istep >= self._learningStartsAt else 1.0
                self._epsilon = max( self._epsEnd, self._epsilon * _epsFactor )

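    # Note on the epsilon schedules handled in step() above (illustrative
    # numbers only, not values taken from the original configs): with a
    # 'linear' schedule and, say, epsStart=1.0, epsEnd=0.01, epsSteps=100000,
    # epsilon decays from 1.0 down to 0.01 over the first 100000 steps after
    # learningStartsAt and then stays at epsEnd; with a 'geometric' schedule
    # and, say, epsDecay=0.98, epsilon is multiplied by 0.98 at the end of
    # every episode (once learning has started) until it reaches epsEnd.
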
    def _preprocess( self, rawState ) :
        """Preprocesses a raw state into an appropriate state representation

        Args:
            rawState (np.ndarray) : raw state to be transformed

        Returns:
            np.ndarray : preprocessed state in the appropriate representation
        """
        # OVERRIDE this method with your specific preprocessing
        raise NotImplementedError( 'IDqnAgent::_preprocess> virtual method' )

    def _learn( self ) :
        """Makes a learning step using the DQN algorithm from Mnih et al.
        https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
        """
        # get a minibatch from the replay buffer
        _minibatch = self._rbuffer.sample( self._minibatchSize )
        if self._usePrioritizedExpReplay :
            _states, _actions, _nextStates, _rewards, _dones, _indices, _impSampWeights = _minibatch
        else :
            _states, _actions, _nextStates, _rewards, _dones = _minibatch

        # compute targets (in a vectorized way). Recall:
        #
        #              |-> _reward                                  if s' is terminal
        #   q-target = |
        #              |-> _reward + gamma * max_a' Q(s', a')       otherwise
        #
        # or, in vectorized form (recall Q(s') computes all q-values):
        #
        #   qtargets = _rewards + (1 - _dones) * gamma * max( Q(_nextStates), batchAxis )
        #
        # Notes (for nnetwork models):
        #   * we assume that in this call to Q the targets generated do not
        #     depend on the weights of the network (no gradients should be
        #     taken into account here, nor should the targets be part of the
        #     computation graph); basically the targets act like training
        #     data from a 'dataset'
        _qtargets = None
        if self._useDoubleDqn :
            # targets are computed in the following way:
            #
            #   q-target = r + gamma * Q( s', argmax_a' Q(s', a'; theta_actor); theta_target )
            #
            # i.e. the greedy action is picked with the actor model, and its
            # q-value is evaluated with the target model

            # compute q-values over the next states from both actor and target models
            _qvals_actorModel_s = self._qmodel_actor.eval( _nextStates )
            _qvals_targetModel_s = self._qmodel_target.eval( _nextStates )

            # greedy actions according to the actor model
            _argmaxActions = np.argmax( _qvals_actorModel_s, 1 ).reshape( -1, 1 )

            # compute q-targets from the q-values of the target network,
            # using the greedy actions from the actor network
            _qtargets = _rewards + ( 1 - _dones ) * self._gamma * \
                        np.squeeze( np.take_along_axis( _qvals_targetModel_s, _argmaxActions, axis = 1 ) )
        else :
            # targets are computed using just the target network
            _qtargets = _rewards + ( 1 - _dones ) * self._gamma * \
                        np.max( self._qmodel_target.eval( _nextStates ), 1 )

        _qtargets = _qtargets.astype( np.float32 )

        # make the learning call to the model (similar to a supervised setting)
        if self._usePrioritizedExpReplay :
            # train using the importance-sampling weights as well
            _absBellmanErrors = self._qmodel_actor.train( _states, _actions, _qtargets, _impSampWeights )
            # and update the priorities using the new bellman errors
            self._rbuffer.updatePriorities( _indices, _absBellmanErrors )
        else :
            # train using the normal required data
            self._qmodel_actor.train( _states, _actions, _qtargets )

    @property
    def epsilon( self ) :
        return self._epsilon

    @property
    def seed( self ) :
        return self._seed

    @property
    def learningMaxSteps( self ) :
        return self._learningMaxSteps

    @property
    def actorModel( self ) :
        return self._qmodel_actor

    @property
    def targetModel( self ) :
        return self._qmodel_target

    @property
    def replayBuffer( self ) :
        return self._rbuffer

    @property
    def actionsDescs( self ) :
        return self._actionsDescs

    def _printConfig( self ) :
        print( '#############################################################' )
        print( '#                                                           #' )
        print( '#                   Agent configuration                     #' )
        print( '#                                                           #' )
        print( '#############################################################' )
        print( 'state space dimension                       : ', self._stateDim )
        print( 'number of actions                           : ', self._nActions )
        print( 'seed                                        : ', self._seed )
        print( 'epsilon start value                         : ', self._epsStart )
        print( 'epsilon end value                           : ', self._epsEnd )
        print( 'epsilon schedule type                       : ', self._epsSchedule )
        print( 'epsilon linear decay steps                  : ', self._epsSteps )
        print( 'epsilon geom. decay factor                  : ', self._epsDecay )
        print( 'learning rate                               : ', self._lr )
        print( 'minibatch size                              : ', self._minibatchSize )
        print( 'learning starting step for training         : ', self._learningStartsAt )
        print( 'learning updateFreq (training actor-model)  : ', self._learningUpdateFreq )
        print( 'learning updateTargetFreq (target-model)    : ', self._learningUpdateTargetFreq )
        print( 'learning max steps                          : ', self._learningMaxSteps )
        print( 'replay buffer size                          : ', self._replayBufferSize )
        print( 'gamma (discount factor)                     : ', self._gamma )
        print( 'tau (target model soft-updates)             : ', self._tau )
        print( '#############################################################' )
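
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a concrete agent is expected to subclass
# IDqnAgent and override _preprocess. The names GymDqnAgent, env, agentConfig,
# modelConfig, modelBuilder and backendInitializer below are hypothetical
# placeholders for whatever the surrounding project provides; the sketch only
# shows the intended act()/step() interaction during training.
#
#   class GymDqnAgent( IDqnAgent ) :
#
#       def _preprocess( self, rawState ) :
#           # identity preprocessing for low-dimensional observations
#           return np.array( rawState, dtype = np.float32 )
#
#   agent = GymDqnAgent( agentConfig, modelConfig, modelBuilder, backendInitializer )
#   state = env.reset()
#   for _ in range( agent.learningMaxSteps ) :
#       action = agent.act( state, inference = False )
#       nextState, reward, done, _ = env.step( action )
#       agent.step( ( state, action, nextState, reward, done ) )
#       state = nextState if not done else env.reset()
# ---------------------------------------------------------------------------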