werner-duvaud · ahainaut · Feb 9, 2021 · Jan 15, 2021
diff --git a/replay_buffer.py b/replay_buffer.py
@@ -79,8 +79,7 @@ def get_batch(self):
         ) = ([], [], [], [], [], [], [])
         weight_batch = [] if self.config.PER else None
 
-        for _ in range(self.config.batch_size):
-            game_id, game_history, game_prob = self.sample_game()
+        for game_id, game_history, game_prob in self.sample_n_games(self.config.batch_size):
             game_pos, pos_prob = self.sample_position(game_history)
 
             values, rewards, policies, actions = self.make_target(
@@ -154,6 +153,24 @@ def sample_game(self, force_uniform=False):
 
         return game_id, self.buffer[game_id], game_prob
 
+    def sample_n_games(self, n_games, force_uniform=False):
+        """
+        Sample n_games games (with replacement) either uniformly or according to
+        game priority. Returns a list of (game_id, game_history, game_prob) tuples,
+        where game_prob is None when sampling uniformly.
+        """
+        game_id_list = list(self.buffer.keys())
+        game_probs, game_prob_dict = None, {}
+        if self.config.PER and not force_uniform:
+            priorities = [self.buffer[game_id].game_priority for game_id in game_id_list]
+            game_probs = numpy.array(priorities, dtype="float32")
+            game_probs /= numpy.sum(game_probs)
+            game_prob_dict = dict(zip(game_id_list, game_probs))
+        # With p=None, numpy.random.choice falls back to uniform sampling.
+        selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs)
+        return [(game_id, self.buffer[game_id], game_prob_dict.get(game_id))
+                for game_id in selected_games]
+
     def sample_position(self, game_history, force_uniform=False):
         """
         Sample position from game either uniformly or according to some priority.