# tictactoe.py
import numpy as np
import copy
import itertools
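
# Self-play tic-tac-toe learner: both players share one tabular action-value
# estimate and an epsilon-greedy policy per state, updated after every episode.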

class State:
    def __init__(self, grid, o_turn):
        self.grid = np.array(grid)   # 3x3 board: 0 = empty, 1 = O, 2 = X
        self.o_turn = o_turn         # True if it is O's (player 1's) turn
        self.winner = -1             # -1 = in progress, 0 = draw, 1/2 = winning player
    def __eq__(self, other):
        return np.array_equal(self.grid, other.grid) and self.o_turn == other.o_turn
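
# Global tables, all indexed by the position of a State in `states`:
#   actions - the 9 board squares as (row, col) pairs
#   probs   - per-state action probabilities (the epsilon-greedy policy)
#   q       - per-state action-value estimates
#   r       - fixed rewards for (state, action) pairs found to be illegal moves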
starting_state = State([[0,0,0],[0,0,0],[0,0,0]], True)
states = [starting_state]
actions = np.array(list(itertools.product((0,1,2),(0,1,2))))
probs = {}
probs[0] = np.full(len(actions), 1/len(actions))
q = {}
r = {}
q[0] = np.full(len(actions),0.0, dtype=float)
a = 0.03  # step size (learning rate) for the q updates

def get_state_index(state: State):
    # Return the index of `state` in the global `states` list, or None if unseen.
    try:
        return states.index(state)
    except ValueError:
        return None

def draw_state(state: State):
    # Print the board row by row (0 = empty, 1 = O, 2 = X).
    for row in state.grid:
        print("".join(str(cell) for cell in row))
    print("")

def check_winner(grid):
    # The 8 lines that can decide the game: rows, columns, and both diagonals.
    options = [[[0,0],[0,1],[0,2]],[[1,0],[1,1],[1,2]],[[2,0],[2,1],[2,2]],
               [[0,0],[1,0],[2,0]],[[0,1],[1,1],[2,1]],[[0,2],[1,2],[2,2]],
               [[0,0],[1,1],[2,2]],[[0,2],[1,1],[2,0]]]
    blocked_options = 0
    for option in options:
        values = [grid[a, b] for a, b in option]
        if len(set(values)) == 1 and values[0] != 0:
            return values[0]          # three identical non-empty marks: that player wins
        if (len(set(values)) == 2 and 0 not in values) or len(set(values)) == 3:
            blocked_options += 1      # line contains both marks, so it can never be won
    if blocked_options == len(options):
        return 0                      # every line blocked: draw
    return -1                         # game still in progress

def do_step(state: State, action):
    # Apply `action` to `state`. Newly seen states are appended to `states`
    # and given a uniform policy and zero-initialised action values.
    old_state_index = get_state_index(state)
    new_state = copy.deepcopy(state)
    if old_state_index is None:
        # Store an independent copy so later mutation of new_state cannot
        # corrupt the entry kept in `states`.
        states.append(copy.deepcopy(state))
        old_state_index = get_state_index(state)
    if old_state_index not in probs:
        probs[old_state_index] = np.full(len(actions), 1/len(actions), dtype=float)
        q[old_state_index] = np.full(len(actions), 0, dtype=float)
    if new_state.grid[action[0], action[1]] != 0:
        # Illegal move (square already occupied): record a fixed -10 reward,
        # make the policy for this state greedy and return the state unchanged.
        action_index = actions.tolist().index(action.tolist())
        r[(old_state_index, action_index)] = -10
        q[old_state_index][action_index] = -10
        maximizing_indices = np.argwhere(q[old_state_index] == np.amax(q[old_state_index])).flatten()
        probs[old_state_index] = np.full(len(actions), 0, dtype=float)
        probs[old_state_index][maximizing_indices] = 1/len(maximizing_indices)
        return new_state
    # Legal move: place the mark, pass the turn and check for a winner.
    new_state.grid[action[0], action[1]] = 1 if new_state.o_turn else 2
    new_state.o_turn = not new_state.o_turn
    new_state.winner = check_winner(new_state.grid)
    new_state_index = get_state_index(new_state)
    if new_state_index is None:
        states.append(copy.deepcopy(new_state))
        new_state_index = get_state_index(new_state)
    if new_state_index not in probs:
        probs[new_state_index] = np.full(len(actions), 1/len(actions), dtype=float)
        q[new_state_index] = np.full(len(actions), 0, dtype=float)
    return new_state

def do_episode(draw = False):
    # Run one self-play episode, then update q and the epsilon-greedy policy
    # for every (state, action) pair visited, working backwards from the end.
    current_state = copy.deepcopy(starting_state)
    state_index = get_state_index(current_state)
    chosen_action = actions[np.random.choice(len(actions), p=probs[state_index])]
    sa = [[copy.deepcopy(current_state), copy.deepcopy(chosen_action)]]
    winner = current_state.winner
    while winner == -1:
        current_state = do_step(current_state, chosen_action)
        # Sample the next action from the policy of the state we just reached.
        state_index = get_state_index(current_state)
        chosen_action = actions[np.random.choice(len(actions), p=probs[state_index])]
        sa.append([copy.deepcopy(current_state), copy.deepcopy(chosen_action)])
        winner = current_state.winner
    sa.reverse()
    for state, action in sa:
        state_index = get_state_index(state)
        action_index = actions.tolist().index(action.tolist())
        if draw: draw_state(state)
        if (winner == 1 and state.o_turn == False) or (winner == 2 and state.o_turn == True):
            reward = 1
        elif (winner == 1 and state.o_turn == True) or (winner == 2 and state.o_turn == False):
            reward = -1
        else:
            reward = 0
        # A penalty recorded for an illegal move overrides the game-outcome reward.
        if (state_index, action_index) in r:
            reward = r[(state_index, action_index)]
        # Move the action-value estimate towards the observed reward with step size a.
        q[state_index][action_index] += a * (reward - q[state_index][action_index])
        # Epsilon-greedy policy improvement (epsilon = 0.1): split 0.9 among the
        # best actions and spread the remaining 0.1 uniformly over all actions.
        maximizing_indices = np.argwhere(q[state_index] == np.amax(q[state_index])).flatten()
        probs[state_index] = np.full(len(actions), 0, dtype=float)
        probs[state_index][maximizing_indices] = (1 - 0.1)/len(maximizing_indices)
        probs[state_index] += 0.1/len(probs[state_index])

# Train the shared policy by self-play.
for i in range(100):
    print(f"episode {i}")
    do_episode(draw=False)

def play():
    # Play one interactive game against the learned policy. The opponent moves
    # first; enter your move as two digits "rc" (row then column), e.g. "02".
    state = copy.deepcopy(starting_state)
    draw_state(state)
    while state.winner == -1:
        state_index = get_state_index(state)
        if state_index is None or state_index not in probs:
            # Position never seen during training: fall back to a uniform policy.
            p = np.full(len(actions), 1/len(actions))
        else:
            p = probs[state_index]
        opponent_action = actions[np.random.choice(len(actions), p=p)]
        print(f"Opponent action: {opponent_action}")
        state = do_step(state, opponent_action)
        draw_state(state)
        if state.winner != -1:
            break
        choice = input("Your move: ")
        action = [int(choice[0]), int(choice[1])]
        # Apply the move on a copy so the State object stored in `states` is not mutated.
        state = copy.deepcopy(state)
        state.grid[action[0], action[1]] = 1 if state.o_turn else 2
        state.o_turn = not state.o_turn
        state.winner = check_winner(state.grid)
    print("winner: ", state.winner)

play()