USE_CUDA = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
+
class Variable(autograd.Variable):
    def __init__(self, data, *args, **kwargs):
        if USE_CUDA:
@@ -27,6 +28,16 @@ def __init__(self, data, *args, **kwargs):

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])

+# Check that the parameters of the model update accordingly by returning the total L2 norm of their gradients.
+def check_norm(model):
+    total_norm = 0.0
+    for p in model.parameters():
+        param_norm = p.grad.data.norm(2.0)
+        total_norm += param_norm ** 2.0
+    total_norm = total_norm ** (1.0 / 2.0)
+    return total_norm
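+# For reference, nn.utils.clip_grad_norm (called further below) should itself return the
+# total pre-clipping norm of the parameters, so its return value can be used to
+# cross-check what check_norm reports.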
+
+
def dqn_learing(
    env,
    q_func,
@@ -94,10 +105,10 @@ def dqn_learing(

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
-        input_shape = env.observation_space.shape
+        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
-        input_shape = (img_h, img_w, frame_history_len * img_c)
+        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
@@ -106,14 +117,14 @@ def select_epilson_greedy_action(model, obs, t):
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
-            # Detach variable from the current graph since we don't want gradients to propagated
-            return model(Variable(obs)).detach().data.max(1)[1].cpu()
+            # Use volatile=True if the variable is only used in inference mode, i.e. don't save the history
+            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])
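+        # Note: volatile was removed in later PyTorch releases; in newer code the greedy
+        # forward pass above would instead be wrapped in torch.no_grad().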

    # Initialize target q function and q function
-    Q = q_func(input_shape[2], num_actions).type(dtype)
-    target_Q = q_func(input_shape[2], num_actions).type(dtype)
+    Q = q_func(input_arg, num_actions).type(dtype)
+    target_Q = q_func(input_arg, num_actions).type(dtype)
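+    # target_Q is a delayed copy of Q: its weights are overwritten from Q every
+    # target_update_freq parameter updates (see below) so the bootstrap targets stay stable.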

    # Construct optimizer with adaptive learning rate
    # https://discuss.pytorch.org/t/adaptive-learning-rate/320
@@ -148,6 +159,7 @@ def construct_optimizer(t):
        # previous frames.
        # recent_observations: shape (img_h, img_w, frame_history_len) are input to the model
        recent_observations = replay_buffer.encode_recent_observation().transpose(2, 0, 1)
+        # recent_observations = replay_buffer.encode_recent_observation()
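+        # transpose(2, 0, 1) reorders the (H, W, C) frame stack returned by the replay
+        # buffer into the (C, H, W) layout expected by PyTorch convolution layers.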

        # Choose a random action if learning has not started yet
        if t > learning_starts:
@@ -176,9 +188,11 @@ def construct_optimizer(t):
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for calculation
            obs_batch = Variable(torch.from_numpy(obs_batch.transpose(0, 3, 1, 2)).type(dtype) / 255.0)
+            # obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch.transpose(0, 3, 1, 2)).type(dtype) / 255.0, volatile=True)
+            # next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0, volatile=True)
            done_mask = torch.from_numpy(done_mask)
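+            # done_mask is nonzero for transitions that ended the episode; those entries keep a
+            # zero bootstrap value below, since there is no next state to evaluate.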

            if USE_CUDA:
@@ -190,22 +204,36 @@ def construct_optimizer(t):
            # We choose Q based on the action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            # Compute next Q value, based on which action gives max Q values
-            next_max_Q_values = Variable(torch.zeros(batch_size))
-            next_max_Q_values[done_mask == 0] = target_Q(next_obs_batch).max(1)[0]
+            next_max_Q_values = Variable(torch.zeros(batch_size).type(dtype))
+            # Detach variable from the current graph since we don't want gradients to propagate
+            next_max_Q_values[done_mask == 0] = target_Q(next_obs_batch).detach().max(1)[0]
            # Compute Bellman error, use Huber loss to mitigate outlier impact
            bellman_error = F.smooth_l1_loss(current_Q_values, rew_batch + (gamma * next_max_Q_values))
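+            # The regression target is y = r + gamma * max_a' Q_target(s', a') for non-terminal
+            # transitions and y = r for terminal ones; smooth_l1_loss is the Huber loss between
+            # current_Q_values and this target.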
            # Run backward pass and clip the gradient
+            Q.zero_grad()
            bellman_error.backward()
-            nn.utils.clip_grad_norm(Q.parameters(), grad_norm_clipping)
+
+            if check_norm(Q) > grad_norm_clipping:
+                print('Before clipping gradient:')
+                print('total_norm: ', check_norm(Q))
+                nn.utils.clip_grad_norm(Q.parameters(), grad_norm_clipping)
+                print('After clipping gradient:')
+                print('total_norm: ', check_norm(Q))
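+            # clip_grad_norm rescales all gradients in place so their total norm is at most
+            # grad_norm_clipping, keeping a single large Bellman error from destabilizing the update.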
            # Perform the update
            optimizer = construct_optimizer(t)
            optimizer.step()
+            # print('After update Q:')
+            # check_norm(Q)
            num_param_updates += 1

            # Periodically update the target network by copying the Q network into the target Q network
            if num_param_updates % target_update_freq == 0:
+                # print('Before update target:')
+                # check_norm(target_Q)
                for target_param, param in zip(target_Q.parameters(), Q.parameters()):
                    target_param.data = param.data.clone()
+                # print('After update target:')
+                # check_norm(target_Q)
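+                # The parameter-by-parameter copy above is a hard update; an equivalent
+                # one-liner would be target_Q.load_state_dict(Q.state_dict()).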

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()