# Q_Learning_Tic_Tac_Toe.py
# Forked from khpeek/Q-learning-Tic-Tac-Toe
import numpy as np
import tkinter as tk
import copy
import pickle   # In Python 3 the C-accelerated pickle implementation is used automatically when available; "cPickle" existed only in Python 2

class Game:
    def __init__(self, master, player1, player2, Q_learn=None, Q=None, alpha=0.3, gamma=0.9):
        frame = tk.Frame(master)
        frame.grid()
        self.master = master
        master.title("Tic Tac Toe")

        self.player1 = player1
        self.player2 = player2
        self.current_player = player1
        self.other_player = player2

        self.empty_text = ""
        self.board = Board()

        self.buttons = [[None for _ in range(3)] for _ in range(3)]
        for i in range(3):
            for j in range(3):
                self.buttons[i][j] = tk.Button(frame, height=3, width=3, text=self.empty_text,
                                               command=lambda i=i, j=j: self.callback(self.buttons[i][j]))
                self.buttons[i][j].grid(row=i, column=j)

        self.reset_button = tk.Button(master, text="Reset", command=self.reset)
        self.reset_button.grid(row=3)

        self.Q_learn = Q_learn
        if self.Q_learn:
            self.Q = Q if Q is not None else {}     # Default to None rather than a mutable {} that would be shared across instances
            self.alpha = alpha      # Learning rate
            self.gamma = gamma      # Discount rate
            self.share_Q_with_players()
    @property
    def Q_learn(self):
        if self._Q_learn is not None:
            return self._Q_learn
        return isinstance(self.player1, QPlayer) or isinstance(self.player2, QPlayer)   # Q-learning defaults to "on" whenever a QPlayer is participating

    @Q_learn.setter
    def Q_learn(self, _Q_learn):
        self._Q_learn = _Q_learn

    def share_Q_with_players(self):     # The action-value table Q is shared with the QPlayers to inform their move decisions
        if isinstance(self.player1, QPlayer):
            self.player1.Q = self.Q
        if isinstance(self.player2, QPlayer):
            self.player2.Q = self.Q
    def callback(self, button):
        if self.board.over():
            pass        # Do nothing if the game is already over
        else:
            if isinstance(self.current_player, HumanPlayer) and isinstance(self.other_player, HumanPlayer):
                if self.empty(button):
                    move = self.get_move(button)
                    self.handle_move(move)
            elif isinstance(self.current_player, HumanPlayer) and isinstance(self.other_player, ComputerPlayer):
                computer_player = self.other_player
                if self.empty(button):
                    human_move = self.get_move(button)
                    self.handle_move(human_move)
                    if not self.board.over():       # Trigger the computer's next move
                        computer_move = computer_player.get_move(self.board)
                        self.handle_move(computer_move)

    def empty(self, button):
        return button["text"] == self.empty_text

    def get_move(self, button):
        info = button.grid_info()
        move = (int(info["row"]), int(info["column"]))      # Get the move coordinates from the button's grid metadata
        return move
    def handle_move(self, move):
        if self.Q_learn:
            self.learn_Q(move)
        i, j = move                                                     # Row and column number of the corresponding button
        self.buttons[i][j].configure(text=self.current_player.mark)    # Change the label on the button to the current player's mark
        self.board.place_mark(move, self.current_player.mark)          # Update the board
        if self.board.over():
            self.declare_outcome()
        else:
            self.switch_players()

    def declare_outcome(self):
        if self.board.winner() is None:
            print("Cat's game.")
        else:
            print("The game is over. The player with mark {mark} won!".format(mark=self.current_player.mark))
    def reset(self):
        print("Resetting...")
        for i in range(3):
            for j in range(3):
                self.buttons[i][j].configure(text=self.empty_text)
        self.board = Board(grid=np.ones((3, 3)) * np.nan)
        self.current_player = self.player1
        self.other_player = self.player2
        # np.random.seed(seed=0)        # Fix the random seed to watch the Q-learning "in action" or for debugging
        self.play()

    def switch_players(self):
        if self.current_player == self.player1:
            self.current_player = self.player2
            self.other_player = self.player1
        else:
            self.current_player = self.player1
            self.other_player = self.player2
    def play(self):
        if isinstance(self.player1, HumanPlayer) and isinstance(self.player2, HumanPlayer):
            pass        # For human vs. human, play relies on the callbacks from button presses
        elif isinstance(self.player1, HumanPlayer) and isinstance(self.player2, ComputerPlayer):
            pass        # The human moves first, so the computer is triggered from the button callback
        elif isinstance(self.player1, ComputerPlayer) and isinstance(self.player2, HumanPlayer):
            first_computer_move = self.player1.get_move(self.board)     # If player 1 is a computer, it must be triggered to make the first move
            self.handle_move(first_computer_move)
        elif isinstance(self.player1, ComputerPlayer) and isinstance(self.player2, ComputerPlayer):
            while not self.board.over():        # Make the two computer players play against each other without button presses
                self.play_turn()

    def play_turn(self):
        move = self.current_player.get_move(self.board)
        self.handle_move(move)
    def learn_Q(self, move):    # When Q-learning is on, "learn_Q" is called after receiving a move from a Player instance and before applying the move with Board's "place_mark" method
        state_key = QPlayer.make_and_maybe_add_key(self.board, self.current_player.mark, self.Q)
        next_board = self.board.get_next_board(move, self.current_player.mark)
        reward = next_board.give_reward()
        next_state_key = QPlayer.make_and_maybe_add_key(next_board, self.other_player.mark, self.Q)
        if next_board.over():
            expected = reward
        else:
            next_Qs = self.Q[next_state_key]    # The Q values are the expected future rewards for player X for each available move in the next state (after this move has been made)
            if self.current_player.mark == "X":
                expected = reward + (self.gamma * min(next_Qs.values()))    # If the current player is X, the next player is O, who will choose the move with the minimum Q value under our sign convention
            elif self.current_player.mark == "O":
                expected = reward + (self.gamma * max(next_Qs.values()))    # If the current player is O, the next player is X, who will choose the move with the maximum Q value
        change = self.alpha * (expected - self.Q[state_key][move])
        self.Q[state_key][move] += change
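        # Note: the update above is the standard temporal-difference rule,
        #     Q(s, a) <- Q(s, a) + alpha * (reward + gamma * V(s') - Q(s, a)),
        # where V(s') is the minimum of Q(s', .) when O moves next and the maximum
        # when X moves next -- a minimax variant of Q-learning in which a single
        # table is shared by both players.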

class Board:
    def __init__(self, grid=None):
        if grid is None:        # Create a fresh empty grid per instance instead of a mutable default argument
            grid = np.ones((3, 3)) * np.nan
        self.grid = grid

    def winner(self):
        rows = [self.grid[i, :] for i in range(3)]
        cols = [self.grid[:, j] for j in range(3)]
        diag = [np.array([self.grid[i, i] for i in range(3)])]
        cross_diag = [np.array([self.grid[2 - i, i] for i in range(3)])]
        lanes = np.concatenate((rows, cols, diag, cross_diag))      # A "lane" is a row, column, diagonal, or cross-diagonal
        any_lane = lambda x: any([np.array_equal(lane, x) for lane in lanes])   # True if any lane equals the input argument "x"
        if any_lane(np.ones(3)):
            return "X"
        elif any_lane(np.zeros(3)):
            return "O"

    def over(self):             # The game is over if there is a winner or if no empty squares remain (cat's game)
        return (not np.any(np.isnan(self.grid))) or (self.winner() is not None)

    def place_mark(self, move, mark):       # Place a mark on the board
        num = Board.mark2num(mark)
        self.grid[tuple(move)] = num

    @staticmethod
    def mark2num(mark):         # Converts a player's mark to the number stored in the NumPy array representing the board. The mark must be either "X" or "O".
        d = {"X": 1, "O": 0}
        return d[mark]

    def available_moves(self):
        return [(i, j) for i in range(3) for j in range(3) if np.isnan(self.grid[i][j])]

    def get_next_board(self, move, mark):
        next_board = copy.deepcopy(self)
        next_board.place_mark(move, mark)
        return next_board
    def make_key(self, mark):   # For Q-learning: return a 10-character string encoding the board state plus the mark of the player whose turn it is
        fill_value = 9
        filled_grid = copy.deepcopy(self.grid)
        np.place(filled_grid, np.isnan(filled_grid), fill_value)
        return "".join(map(str, map(int, filled_grid.flatten()))) + mark
    def give_reward(self):      # Assign the reward from player X's perspective for the current board position
        if self.over():
            if self.winner() is not None:
                if self.winner() == "X":
                    return 1.0      # Player X won -> positive reward
                elif self.winner() == "O":
                    return -1.0     # Player O won -> negative reward
            else:
                return 0.5          # A smaller positive reward for a cat's game
        else:
            return 0.0              # No reward while the game is unfinished
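        # For illustration: under this sign convention X's win is worth +1.0, O's win
        # -1.0, and a draw +0.5, which is why X learns to maximize Q values while O
        # learns to minimize them (see Game.learn_Q and QPlayer.get_move).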

class Player(object):
    def __init__(self, mark):
        self.mark = mark

    @property
    def opponent_mark(self):
        if self.mark == 'X':
            return 'O'
        elif self.mark == 'O':
            return 'X'
        else:
            print("The player's mark must be either 'X' or 'O'.")


class HumanPlayer(Player):
    pass


class ComputerPlayer(Player):
    pass

class RandomPlayer(ComputerPlayer):
    @staticmethod
    def get_move(board):
        moves = board.available_moves()
        if moves:       # "moves" is empty only when the board is full (cat's game)
            return moves[np.random.choice(len(moves))]      # Choose a random index; passing the list of tuples to np.random.choice directly would fail, since NumPy would treat it as a 2D array

class THandPlayer(ComputerPlayer):
    def __init__(self, mark):
        super(THandPlayer, self).__init__(mark=mark)

    def get_move(self, board):
        moves = board.available_moves()
        if moves:
            for move in moves:          # First, take a winning move if one exists
                if THandPlayer.next_move_winner(board, move, self.mark):
                    return move
            for move in moves:          # Otherwise, block an immediate winning move of the opponent
                if THandPlayer.next_move_winner(board, move, self.opponent_mark):
                    return move
            return RandomPlayer.get_move(board)     # Otherwise, fall back to a random move

    @staticmethod
    def next_move_winner(board, move, mark):
        return board.get_next_board(move, mark).winner() == mark

class QPlayer(ComputerPlayer):
    def __init__(self, mark, Q=None, epsilon=0.2):
        super(QPlayer, self).__init__(mark=mark)
        self.Q = Q if Q is not None else {}     # Default to None rather than a mutable {} that would be shared across instances
        self.epsilon = epsilon

    def get_move(self, board):
        if np.random.uniform() < self.epsilon:      # With probability epsilon, choose a move at random ("epsilon-greedy" exploration)
            return RandomPlayer.get_move(board)
        else:
            state_key = QPlayer.make_and_maybe_add_key(board, self.mark, self.Q)
            Qs = self.Q[state_key]
            if self.mark == "X":
                return QPlayer.stochastic_argminmax(Qs, max)    # X maximizes the Q values under the sign convention
            elif self.mark == "O":
                return QPlayer.stochastic_argminmax(Qs, min)    # O minimizes them

    @staticmethod
    def make_and_maybe_add_key(board, mark, Q):     # Make a dictionary key for the current state (board + player turn); if Q does not yet contain it, add it
        default_Qvalue = 1.0                        # Encourages exploration
        state_key = board.make_key(mark)
        if Q.get(state_key) is None:
            moves = board.available_moves()
            Q[state_key] = {move: default_Qvalue for move in moves}     # The available moves in each state are initially given this optimistic default value
        return state_key

    @staticmethod
    def stochastic_argminmax(Qs, min_or_max):       # Returns the argmin or argmax of Qs; ties are broken at random
        min_or_maxQ = min_or_max(list(Qs.values()))
        if list(Qs.values()).count(min_or_maxQ) > 1:    # If several moves share the extreme Q value, choose among them at random
            best_options = [move for move in list(Qs.keys()) if Qs[move] == min_or_maxQ]
            move = best_options[np.random.choice(len(best_options))]
        else:
            move = min_or_max(Qs, key=Qs.get)
        return move
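
# A minimal usage sketch, not part of the original snippet: train two QPlayers
# against each other and pickle the learned Q table. The episode count, epsilon
# values, and the "Q_table.p" file name are illustrative assumptions.
if __name__ == "__main__":
    root = tk.Tk()
    player1 = QPlayer(mark="X", epsilon=0.9)    # High epsilon: mostly random moves, heavy exploration
    player2 = QPlayer(mark="O", epsilon=0.9)
    game = Game(root, player1, player2)

    N_episodes = 1000       # Illustrative; more episodes give a stronger player
    game.play()             # Play the first training game...
    for _ in range(N_episodes - 1):
        game.reset()        # ...then reset() clears the board and immediately plays the next one

    with open("Q_table.p", "wb") as handle:     # Persist the learned Q table
        pickle.dump(game.Q, handle)

    root.mainloop()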