# figures.py
from collections import Counter
import cv2
from datetime import datetime
from inspect import signature
from lmfit import Model, Parameter
import matplotlib.pyplot as plt
import numpy as np
import os
from input import COLORS, NUMBERS, PLT_STYLE
class Games:
    """Container for analysed games plus statistics and matplotlib plots.

    The two attributes are described where they are created in ``__init__``.
    """

    __slots__ = ('data', 'general')

    def __init__(self):
        # one list per checkmate type (index 0-5); every entry is a tuple
        # (time played, [names of both players], [known ratings])
        self.data: list[list[tuple]] = [[] for _ in range(6)]
        # one tuple per log line: (month, float(column 2), *ints of columns 3+)
        self.general: list[tuple] = []

    @staticmethod
    def _from_input(inp) -> "list[int] | range":
        """Normalise a type selection to indexes into ``self.data``.

        ``None`` selects all six types, a ``range`` is returned unchanged,
        a single int/str or a list/set/tuple of them becomes a list of ints.
        Strings are translated through ``NUMBERS.index``.
        """
        if inp is None:
            return range(6)
        if isinstance(inp, range):
            return inp
        items = list(inp) if isinstance(inp, (list, set, tuple)) else [inp]
        return [NUMBERS.index(i) if isinstance(i, str) else i for i in items]

    @staticmethod
    def gaussian(x, sigma: float, mu: float):
        """Normal probability density with deviation ``sigma`` and mean ``mu`` (the fitting function)."""
        return np.exp(-(x - mu)**2/(2 * sigma**2))/(sigma * np.sqrt(2 * np.pi))

    @staticmethod
    def linmap(x: float, from_range: tuple[float, float], to_range: tuple[float, float]) -> float:
        """Map ``x`` linearly from ``from_range`` onto ``to_range``."""
        return to_range[0] + (x - from_range[0]) * (to_range[1] - to_range[0]) / (from_range[1] - from_range[0])

    @staticmethod
    def _rows(path: str) -> list:
        """Return the comma-split data rows of a file, skipping the header line and blank lines."""
        with open(path, 'r') as f:
            lines = f.read().splitlines()
        return [line.split(',') for line in lines[1:] if line.strip()]

    @staticmethod
    def _format_param(param) -> str:
        """Format a fitted parameter as 'value ± stderr' (value only if no stderr is available)."""
        if param.stderr is None:  # lmfit could not estimate the uncertainty
            return f'{round(param.value, 2)}'
        return f'{round(param.value, 2)} ± {round(param.stderr, 2)}'

    def _fit(self, x, y, mu: float) -> "tuple[Parameter, Parameter]":
        """Fit ``gaussian`` to (x, y) and return σ and μ as lmfit Parameter objects."""
        model = Model(self.gaussian)  # use gaussian as fitting function
        result = model.fit(y, x=x, sigma=300, mu=mu)  # σ = 300 seems to be a good initial value
        return result.params['sigma'], result.params['mu']

    def player_most(self, n: int = 1, inp=None) -> list:
        """Return the ``n`` most common player names as (name, count) tuples for the selected types."""
        names = (name for i in self._from_input(inp) for game in self.data[i] for name in game[1])
        return Counter(names).most_common(n)

    def ratings(self, inp=None, start: datetime = None, end: datetime = None) -> list:
        """Return one flat list of ratings per selected type.

        Only games with ``start <= time played < end`` are included; without
        bounds the window is 2000-01-01 to 3000-01-01.
        """
        start = datetime(2000, 1, 1) if start is None else start
        end = datetime(3000, 1, 1) if end is None else end
        return [
            [rating for game in self.data[i] if start <= game[0] < end for rating in game[2]]
            for i in self._from_input(inp)
        ]

    def read(self, games_path: str, log_path: str) -> None:
        """Read the games .csv and the .log file."""
        self.read_games(games_path)
        self.read_log(log_path)

    def read_games(self, path: str) -> None:
        """Read a .csv with the games into ``self.data``; the first line is treated as a header."""
        for row in self._rows(path):
            names = row[4:6]  # the name of black and white
            ratings = [int(rating) for rating in row[6:8] if rating != '?']  # unknown ratings ('?') are dropped
            played = datetime.strptime(row[2] + ' ' + row[3], "%Y.%m.%d %H:%M:%S")
            self.data[int(row[0])].append((played, names, ratings))  # column 0 is the type of checkmate
        # sort the games of every type by time; the outer list must keep its
        # order because its index is the checkmate type
        for games in self.data:
            games.sort(key=lambda game: game[0])

    def read_log(self, path: str) -> None:
        """Read a .log into ``self.general``; the first line is treated as a header."""
        for row in self._rows(path):
            month = datetime.strptime(row[1], '%Y-%m')  # column 1 holds the month as 'YYYY-MM'
            self.general.append((month, float(row[2])) + tuple(int(i) for i in row[3:]))

    def p_ratings(self, start: datetime = None, end: datetime = None, inp=None, fit: bool = False, dots: bool = True,
                  rating_interval: int = 20, llim: float = None, rlim: float = None,
                  ylim: tuple[float, float] = None, show=True) -> list:
        """Plot the share of players over the rating (approximately a normal distribution).

        :param start: first moment included (default 2013-01-01)
        :param end: first moment excluded (default 2022-01-01)
        :param inp: type selection, see ``_from_input``
        :param fit: fit and draw a gaussian per type
        :param dots: draw the averaged data points
        :param rating_interval: width (in rating points) of the bins averaged for the dots
        :param llim: left x-limit (default: lowest rating - 100)
        :param rlim: right x-limit (default: highest rating + 100)
        :param ylim: optional (bottom, top) y-limits
        :param show: True shows the figure, False does nothing, a tuple
            ``(path, width, height)`` saves the figure with that size in inches
        :return: per type ``[[(x_points, y_points), (sigma, mu)], label, color]``;
            entries that were not computed stay ``(None, None)``
        """
        plt.close()
        plt.style.use(PLT_STYLE)
        if ylim is not None:
            plt.ylim(*ylim)
        start = datetime(2013, 1, 1) if start is None else start
        end = datetime(2022, 1, 1) if end is None else end
        indexes = self._from_input(inp)  # resolve the selection once so ratings and labels line up
        ratings = self.ratings(indexes, start, end)
        plt.xlabel('rating')
        plt.ylabel('probability')
        populated = [r for r in ratings if r]  # types without any rating can't contribute to the limits
        if llim is None and populated:
            llim = min(min(r) for r in populated) - 100
        if rlim is None and populated:
            rlim = max(max(r) for r in populated) + 100
        plt.xlim(llim, rlim)  # a limit that is still None is left untouched by matplotlib
        step = int(rating_interval)  # range() and slicing below need an int
        # '(None, None), (None, None)' so that every entry can be unpacked below even if 'dots' or 'fit' is False
        data = [[[(None, None), (None, None)], NUMBERS[i], COLORS[i]] for i in indexes]
        for i, r in enumerate(ratings):
            if not r:  # no games of this type in the time window
                continue
            min_r, max_r = min(r), max(r)
            _len = max_r - min_r + 1
            x, y = np.arange(min_r, max_r + 1, 1), np.zeros(_len)  # one slot per rating from min_r to max_r
            for rating, count in Counter(r).items():
                y[rating - min_r] = count
            # since the starting rating on lichess is 1500 there is a spike at that rating, so it is replaced
            # by the average of its neighbours - only possible when both neighbours are inside the range
            if min_r < 1500 < max_r:
                y[1500 - min_r] = (y[1499 - min_r] + y[1501 - min_r])/2
            y /= y.sum()  # normalize such that the area of the gaussian is 1
            if dots:
                x_plot, y_plot = [], []
                for j in range(0, _len, step):
                    y_val = y[j:j + step].sum()/step  # average over [j, j + step) to reduce noise
                    if y_val == 0:  # no players with a rating in this interval
                        continue
                    x_plot.append(j + min_r)
                    y_plot.append(y_val)
                data[i][0][0] = (x_plot, y_plot)
            if fit:
                # pass the mean as initial value for μ, otherwise lmfit raises an error caused by NaN values
                data[i][0][1] = self._fit(x, y, x.mean())
        for ((x_p, y_p), (sigma, mu)), label, color in data:
            if dots and x_p is not None:
                plt.plot(x_p, y_p, 'o', ms=6, label=label, color=color, markeredgecolor='w', markeredgewidth=1)
            if fit and sigma is not None:
                x_f = np.linspace(*plt.xlim(), 800)
                y_f = self.gaussian(x_f, sigma.value, mu.value)
                plt.plot(x_f, y_f, lw=2, color=color,
                         label=f'{label}: σ = {self._format_param(sigma)}, μ = {self._format_param(mu)}')
        plt.legend()
        if isinstance(show, tuple):  # must be tested before truthiness: a non-empty tuple is truthy
            path, *size = show
            plt.title(start.year)
            fig = plt.gcf()
            fig.set_size_inches(*size)
            fig.tight_layout()
            fig.savefig(path, dpi=100, transparent=True)
        elif show:
            plt.show()
        return data

    def games_sum(self, first_year: int = 2013, last_year: int = 2021, inp=None) -> None:
        """Show a bar chart with the number of games per type from ``first_year`` to ``last_year`` (inclusive)."""
        plt.style.use(PLT_STYLE)
        plt.ylabel('games')
        plt.title(f'{first_year} - {last_year}')
        start = datetime(first_year, 1, 1)
        end = datetime(last_year + 1, 1, 1)
        for i in self._from_input(inp):
            # the count of type i sits at index i + 5 of a log tuple
            total = sum(month[i + 5] for month in self.general if start <= month[0] < end)
            plt.bar(NUMBERS[i], total, color=COLORS[i])
        plt.tight_layout()
        plt.show()

    def games_t(self, inp=None, absolute: bool = False, show: bool = True) -> tuple:
        """Collect (and optionally plot) the monthly number of games per type.

        :param inp: type selection, see ``_from_input``
        :param absolute: True gives raw counts, False gives parts per million
            of the total number of games analyzed in that month
        :param show: plot and show the series
        :return: ``(data, years)`` where data is a list of
            ``(values, label, color)`` and years the matching x-values
        """
        years = [2013 + i * 1 / 12 for i in range(len(self.general))]  # one x-value per month, starting 2013
        indexes = self._from_input(inp)
        data = [([], NUMBERS[i], COLORS[i]) for i in indexes]
        for month in self.general:
            for series, j in zip(data, indexes):
                count = month[j + 5]
                # plain numbers in both modes so callers can take min()/max() of a series
                series[0].append(count if absolute else 1000000 * count / month[4])
        if show:
            plt.close()
            plt.style.use(PLT_STYLE)
            plt.xlabel('year')
            plt.ylabel('games per month' if absolute else 'games per month [ppm]')
            for d, label, color in data:
                plt.plot(years, d, label=label, c=color)
            plt.legend()
            plt.show()
        return data, years

    # Defined last on purpose: inside the class body this name shadows the
    # builtin ``str`` for every annotation that is evaluated after it.
    @staticmethod
    def str(x: int, n: int) -> str:
        """Convert an int to a str that is left-padded with zeros to at least ``n`` characters."""
        return str(x).rjust(n, '0')
class Figures(Games):
    """Games read from disk on construction, with helpers that write figures to image files."""

    def __init__(self, games_path: str, log_path: str):
        """Read the games .csv and the .log file right away."""
        super().__init__()
        self.read(games_path, log_path)

    def save_plt(self, path: str) -> None:
        """Save the current pyplot figure to ``path`` (18.5 x 10.5 inches, dpi 100, transparent)."""
        fig = plt.gcf()
        fig.set_size_inches(18.5, 10.5)
        fig.tight_layout()
        fig.savefig(path, dpi=100, transparent=True)
        print(f'[saved] {path}')

    def rating_probability__year(self, inp=None, last_year: int = 2020, ylim: tuple[float, float] = None,
                                 fit: bool = True, dots: bool = True, rating_interval: float = 20):
        """Save one rating distribution per type and year as 'p_<type>__<year>.png'.

        Covers the years 2013 to ``last_year`` (inclusive). The x-limits are
        taken from all ratings of the selected types so they are the same in
        every figure. Nothing is plotted when there are no ratings at all.
        """
        plt.style.use(PLT_STYLE)
        ratings = [rating for per_type in self.ratings(inp) for rating in per_type]  # every rating in one flat list
        if not ratings:
            return
        llim, rlim = min(ratings) - 100, max(ratings) + 100  # shared x-boundaries
        for year in range(2013, last_year + 1):  # last_year should be included
            for i in self._from_input(inp):
                # the stored times are datetime objects, so the window has to be given as datetime as well
                self.p_ratings(start=datetime(year, 1, 1), end=datetime(year + 1, 1, 1), inp=i, fit=fit, dots=dots,
                               rating_interval=rating_interval, llim=llim, rlim=rlim, ylim=ylim, show=False)
                plt.title(year)
                self.save_plt(f'p_{NUMBERS[i]}__{year}.png')  # also prints the '[saved] ...' line

    def player_most__type(self, inp=None, n: int = 10):
        """Save one bar chart per type with its most frequent players as 'n_<type>__player.png'.

        At least ``n`` players are shown; players tied with the n-th one are
        shown as well.
        """
        plt.style.use(PLT_STYLE)
        for i in self._from_input(inp):
            names = (name for game in self.data[i] for name in game[1])  # every occurrence of every name
            last_m = 0
            for j, (player, m) in enumerate(Counter(names).most_common()):  # (name, count), most frequent first
                # m != last_m keeps players that are past index n but tied with the one at index n - 1
                if j >= n and m != last_m:
                    break
                last_m = m
                plt.bar(player, m, color=COLORS[j % 6])
            plt.ylabel('games')
            self.save_plt(f'n_{NUMBERS[i]}__player.png')
            plt.clf()  # clf() to remove the existing bars

    def number_games__month(self, inp=None, absolute: bool = False, month_interval: int = 3):
        """Save frames that animate the development of the monthly games, one every ``month_interval`` months."""
        plt.style.use(PLT_STYLE)
        plt.xlabel('year')
        plt.ylabel('games per month' if absolute else 'games per month [ppm]')
        data, t = self.games_t(inp, absolute, False)  # y-series per type and the shared x-values
        plt.xlim(t[0] - 0.3, t[-1] + 0.3)  # fixed boundaries so that they stay
        plt.ylim(min(0, min(min(d[0]) for d in data)), 1.1 * max(max(d[0]) for d in data))  # constant in every frame
        for n in range(0, len(data[0][0]), month_interval):
            for d, label, color in data:
                plt.plot(t[:n], d[:n], label=label, c=color)  # number of games until the n-th month
            month = round(12 * (t[n] % 1)) + 1  # fractional part of the year -> month 1-12
            # NOTE(review): assumes one file per frame, named after the last plotted label - confirm
            self.save_plt(f'{label}__month_{int(t[n])}-{month:02d}')
def year(a: int, b: int = None) -> tuple[datetime, datetime]:
    """Return the datetime window covering the years ``a`` to ``b``.

    The window starts at January 1st of ``a`` and ends at January 1st of
    the year after ``b``; without ``b`` it covers ``a`` only.
    """
    last = a if b is None else b
    first_day = datetime(a, 1, 1)
    day_after = datetime(last + 1, 1, 1)
    return first_day, day_after
if __name__ == '__main__':
    # Script entry point: create an empty Games container and fill it from disk.
    # NOTE(review): DATA_CSV and LOG_FILE are neither defined nor imported in the
    # visible part of this file - confirm where they are supposed to come from.
    games = Games()
    games.read(DATA_CSV, LOG_FILE)  # first the games .csv, then the .log