USE_CUDA = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
+
class Variable(autograd.Variable):
    def __init__(self, data, *args, **kwargs):
        if USE_CUDA:
@@ -27,6 +28,16 @@ def __init__(self, data, *args, **kwargs):

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])

+# Check that the parameters of the model update accordingly by returning the total L2 norm of their gradients.
+def check_norm(model):
+    total_norm = 0.0
+    for p in model.parameters():
+        param_norm = p.grad.data.norm(2.0)
+        total_norm += param_norm ** 2.0
+    total_norm = total_norm ** (1.0 / 2.0)
+    return total_norm
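+# For reference, nn.utils.clip_grad_norm (called further below) should itself return the
+# total pre-clipping norm of the parameters, so its return value can be used to
+# cross-check what check_norm reports.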
+
+
def dqn_learing(
    env,
    q_func,
@@ -94,10 +105,10 @@ def dqn_learing(

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
-        input_shape = env.observation_space.shape
+        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
-        input_shape = (img_h, img_w, frame_history_len * img_c)
+        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
@@ -106,14 +117,14 @@ def select_epilson_greedy_action(model, obs, t):
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
-            # Detach variable from the current graph since we don't want gradients to propagated
-            return model(Variable(obs)).detach().data.max(1)[1].cpu()
+            # Use volatile=True if the variable is only used in inference mode, i.e. don't save the history
+            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])
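+        # Note: volatile was removed in later PyTorch releases; in newer code the greedy
+        # forward pass above would instead be wrapped in torch.no_grad().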

    # Initialize target q function and q function
-    Q = q_func(input_shape[2], num_actions).type(dtype)
-    target_Q = q_func(input_shape[2], num_actions).type(dtype)
+    Q = q_func(input_arg, num_actions).type(dtype)
+    target_Q = q_func(input_arg, num_actions).type(dtype)
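+    # target_Q is a delayed copy of Q: its weights are overwritten from Q every
+    # target_update_freq parameter updates (see below) so the bootstrap targets stay stable.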

    # Construct optimizer with adaptive learning rate
    # https://discuss.pytorch.org/t/adaptive-learning-rate/320
@@ -148,6 +159,7 @@ def construct_optimizer(t):
        # previous frames.
        # recent_observations: shape (img_h, img_w, frame_history_len) are input to the model
        recent_observations = replay_buffer.encode_recent_observation().transpose(2, 0, 1)
+        # recent_observations = replay_buffer.encode_recent_observation()
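+        # transpose(2, 0, 1) reorders the (H, W, C) frame stack returned by the replay
+        # buffer into the (C, H, W) layout expected by PyTorch convolution layers.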

        # Choose a random action if learning has not started yet
        if t > learning_starts:
@@ -176,9 +188,11 @@ def construct_optimizer(t):
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for calculation
            obs_batch = Variable(torch.from_numpy(obs_batch.transpose(0, 3, 1, 2)).type(dtype) / 255.0)
+            # obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch.transpose(0, 3, 1, 2)).type(dtype) / 255.0, volatile=True)
+            # next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0, volatile=True)
            done_mask = torch.from_numpy(done_mask)
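+            # done_mask is nonzero for transitions that ended the episode; those entries keep a
+            # zero bootstrap value below, since there is no next state to evaluate.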

            if USE_CUDA:
@@ -190,22 +204,36 @@ def construct_optimizer(t):
            # We choose Q based on the action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            # Compute next Q value, based on which action gives max Q values
-            next_max_Q_values = Variable(torch.zeros(batch_size))
-            next_max_Q_values[done_mask == 0] = target_Q(next_obs_batch).max(1)[0]
+            next_max_Q_values = Variable(torch.zeros(batch_size).type(dtype))
+            # Detach variable from the current graph since we don't want gradients to propagate
+            next_max_Q_values[done_mask == 0] = target_Q(next_obs_batch).detach().max(1)[0]
            # Compute Bellman error, use Huber loss to mitigate outlier impact
            bellman_error = F.smooth_l1_loss(current_Q_values, rew_batch + (gamma * next_max_Q_values))
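+            # The regression target is y = r + gamma * max_a' Q_target(s', a') for non-terminal
+            # transitions and y = r for terminal ones; smooth_l1_loss is the Huber loss between
+            # current_Q_values and this target.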
            # Run backward pass and clip the gradient
+            Q.zero_grad()
            bellman_error.backward()
-            nn.utils.clip_grad_norm(Q.parameters(), grad_norm_clipping)
+
+            if check_norm(Q) > grad_norm_clipping:
+                print('Before clipping gradient:')
+                print('total_norm: ', check_norm(Q))
+                nn.utils.clip_grad_norm(Q.parameters(), grad_norm_clipping)
+                print('After clipping gradient:')
+                print('total_norm: ', check_norm(Q))
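+            # clip_grad_norm rescales all gradients in place so their total norm is at most
+            # grad_norm_clipping, keeping a single large Bellman error from destabilizing the update.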
            # Perform the update
            optimizer = construct_optimizer(t)
            optimizer.step()
+            # print('After update Q:')
+            # check_norm(Q)
            num_param_updates += 1

            # Periodically update the target network by copying the Q network into the target Q network
            if num_param_updates % target_update_freq == 0:
+                # print('Before update target:')
+                # check_norm(target_Q)
                for target_param, param in zip(target_Q.parameters(), Q.parameters()):
                    target_param.data = param.data.clone()
+                # print('After update target:')
+                # check_norm(target_Q)
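+                # The parameter-by-parameter copy above is a hard update; an equivalent
+                # one-liner would be target_Q.load_state_dict(Q.state_dict()).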

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()