-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
executable file
·116 lines (91 loc) · 3.44 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python3
import html
import json
import os
import re
import time
from collections import defaultdict
# Root folder scanned by main(); it holds one sub-folder per year, and
# parse_year() reads the per-day HTML pages found inside each of them.
DATA_DIR = "data"

# Folder that receives the generated "<year>.json" files and "meta.json".
OUT_DIR = "docs/json"

# (year, day) pairs for which parse_year() still records ranks but adds no
# points. The list is also exported verbatim under "no_points" in meta.json.
NO_POINTS = [
    (2020, 1),
    (2018, 6),
]
# Patterns and markers used to take a saved leaderboard page apart.
_DAY_FILE_RE = re.compile(r"day(\d\d)\.html")
_USER_ID_RE = re.compile(r'data-user-id="(\d+)"')
_USER_IMG_RE = re.compile(r'<img src="(.+?)"')
_BOTH_STARS_HEADING = (
    '<p>First hundred users to get <span class="leaderboard-daydesc-both">both stars</span>'
)
_FIRST_STAR_HEADING = (
    '<p>First hundred users to get the <span class="leaderboard-daydesc-first">first star</span>'
)
_ANONYMOUS_MARKER = "anonymous user"
_SPAN_CLOSE = "</span>"


def _extract_name(line):
    """Return the HTML-unescaped display name found in one entry line."""
    name_start = line.rfind(_ANONYMOUS_MARKER)
    if name_start != -1:
        # Anonymous entries: keep the text from "anonymous user" up to the
        # next ")" (the closing parenthesis itself is excluded).
        name_end = line.find(")", name_start)
    else:
        # Named entries: the name is the text between the last "</span>" on
        # the line and the next tag.
        name_start = line.rfind(_SPAN_CLOSE) + len(_SPAN_CLOSE)
        name_end = line.find("<", name_start)
    return html.unescape(line[name_start:name_end])


def _record_entries(lines, stars, day, award_points, points, ranks, users, source):
    """Fold one ordered list of entry lines into points, ranks and users."""
    for position, line in enumerate(lines, start=1):
        found = _USER_ID_RE.search(line)
        if found is None:
            raise ValueError(
                f"{source}: entry {position} of the {stars}-star list has no data-user-id"
            )
        user_id = found[1]
        if award_points:
            # Position 1 is worth 100 points, position 2 is worth 99, ...
            points[user_id] += 101 - position
        ranks[user_id].append((day, stars, position))
        # setdefault lets callers pass either a plain dict or a defaultdict.
        profile = users.setdefault(user_id, {})
        profile["name"] = _extract_name(line)
        img = _USER_IMG_RE.search(line)
        if img:
            profile["img"] = img[1]


def parse_year(year, users, *, data_dir=None, out_dir=None, no_points=None):
    """Parse every saved day page of one year and write "<year>.json".

    Each "dayNN.html" file inside "<data_dir>/<year>" is split into the
    "both stars" list and the "first star" list. Every entry line yields a
    user id, a rank tuple ``(day, stars, position)`` and, unless the day is
    listed in ``no_points``, ``101 - position`` points. The per-user totals
    and ranks are dumped to "<out_dir>/<year>.json" and the output path is
    printed.

    Files are processed in sorted name order so that the output (and the
    name kept for a user that appears several times) does not depend on the
    directory listing order. Files whose name is not "dayNN.html" are
    skipped.

    Args:
        year: Year to parse; also the name of the sub-folder of ``data_dir``.
        users: Mapping of user id to profile dict, updated in place with the
            "name" and, when an image is present, "img" of every user seen.
        data_dir: Folder holding the per-year folders; defaults to DATA_DIR.
        out_dir: Existing folder for the JSON output; defaults to OUT_DIR.
        no_points: Iterable of (year, day) pairs that award no points;
            defaults to NO_POINTS.

    Raises:
        ValueError: If a day page lacks one of the expected markers, has them
            out of order, or contains an entry line without a user id.
        OSError: If the year folder or the output file cannot be accessed.
    """
    data_dir = DATA_DIR if data_dir is None else data_dir
    out_dir = OUT_DIR if out_dir is None else out_dir
    no_points = NO_POINTS if no_points is None else no_points
    unscored_days = {tuple(pair) for pair in no_points}

    points = defaultdict(int)
    ranks = defaultdict(list)
    year_dir = os.path.join(data_dir, str(year))

    for day_file in sorted(os.listdir(year_dir)):
        matched = _DAY_FILE_RE.fullmatch(day_file)
        if matched is None:
            # Not a day page (editor backup, .DS_Store, ...): ignore it.
            continue
        day = int(matched[1])
        day_path = os.path.join(year_dir, day_file)
        with open(day_path, encoding="utf-8") as f:
            content = f.read()

        both_at = content.find(_BOTH_STARS_HEADING)
        first_at = content.find(_FIRST_STAR_HEADING)
        main_end = content.find("</main>")
        # Explicit check instead of assert so it survives "python -O".
        if not 0 <= both_at < first_at < main_end:
            raise ValueError(f"{day_path}: leaderboard markers missing or out of order")

        award_points = (year, day) not in unscored_days
        both_lines = content[both_at:first_at].split("\n")
        # The first-star section ends with a newline before "</main>", which
        # leaves one trailing empty item to drop.
        first_lines = content[first_at:main_end].split("\n")[:-1]
        _record_entries(both_lines, 2, day, award_points, points, ranks, users, day_path)
        _record_entries(first_lines, 1, day, award_points, points, ranks, users, day_path)

    # NOTE(review): only users with a points entry are exported, so someone
    # who placed solely on no-points days is left out of the year file even
    # though ranks were recorded for them - confirm this is intended.
    people = {
        user_id: {"points": total, "ranks": ranks[user_id]}
        for user_id, total in points.items()
    }

    path = os.path.join(out_dir, f"{year}.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(people, f)
    print(f"Written to '{path}'")
def main():
    """Parse every year folder under DATA_DIR and write "meta.json".

    Creates OUT_DIR if needed, calls parse_year() once per year in ascending
    numeric order while accumulating user profiles in one shared mapping,
    then writes "meta.json" containing the list of years, the current time
    in milliseconds since the epoch, NO_POINTS and the collected users.

    Only sub-folders of DATA_DIR whose name consists solely of ASCII digits
    are treated as years, so stray files do not abort the run.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    years = sorted(
        (
            entry
            for entry in os.listdir(DATA_DIR)
            if re.fullmatch(r"[0-9]+", entry)
            and os.path.isdir(os.path.join(DATA_DIR, entry))
        ),
        key=int,  # numeric order, not lexicographic
    )

    # Shared across years so each user ends up with a single profile entry.
    users = defaultdict(dict)
    for year in years:
        print("Parsing year", year)
        parse_year(int(year), users)

    path = os.path.join(OUT_DIR, "meta.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "years": [int(year) for year in years],
                "updated": int(time.time() * 1000),
                "no_points": NO_POINTS,
                "users": users,
            },
            f,
        )
    print(f"Written metadata to '{path}'")
# Run the full parse only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()