-
Notifications
You must be signed in to change notification settings - Fork 0
/
qud_grab.py
162 lines (148 loc) · 6.58 KB
/
qud_grab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from unittest.mock import NonCallableMagicMock
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import os, regex
from datetime import datetime
def dictify(ul):
    """Convert a (possibly nested) ``<ul>`` element into a nested dict.

    Every direct ``<li>`` child of ``ul`` becomes one entry. The key is the
    first non-blank string found inside the item; the value is the result of
    calling ``dictify`` on the first ``<ul>`` found inside that item, or
    ``None`` when the item has no sub-list.

    Items that contain no text at all are skipped instead of raising
    ``StopIteration``. Items sharing the same first string overwrite each
    other, because the result is a plain dict.

    Args:
        ul: An element exposing ``find_all``; the items it yields must expose
            ``stripped_strings`` and ``find`` (a BeautifulSoup tag in this
            file's usage).

    Returns:
        dict: item text mapped to either a nested dict or ``None``.
    """
    result = {}
    # recursive=False keeps nested items out of this level; they are picked
    # up by the recursive call on the sub-list instead.
    for li in ul.find_all("li", recursive=False):
        key = next(li.stripped_strings, None)
        if key is None:
            # Empty <li>: there is nothing to use as a label.
            continue
        sublist = li.find("ul")
        result[key] = dictify(sublist) if sublist is not None else None
    return result
def make_wiki_list(data, level=0):  # TODO this should parse out the html somehow
    """Render nested data as wiki bullet-list markup.

    A string becomes a single line: ``level`` asterisks, a space, the text
    and a newline. A set renders each member one level deeper. A dict
    renders each key one level deeper, immediately followed by the rendering
    of its value at that same deeper level (so the value's own entries end up
    one level below the key). Anything else (``None``, lists, ...) renders as
    an empty string.

    Non-breaking spaces are replaced by ordinary spaces in everything that
    is rendered through the set/dict path; a bare string passed directly is
    returned without that replacement.
    """
    if isinstance(data, str):
        return f"{'*' * level} {data}\n"

    depth = level + 1
    pieces = []
    if isinstance(data, set):
        pieces.extend(make_wiki_list(entry, depth) for entry in data)
    elif isinstance(data, dict):
        for label, nested in data.items():
            pieces.append(make_wiki_list(label, depth))
            pieces.append(make_wiki_list(nested, depth))
    # turn non-breaking space to space
    return "".join(pieces).replace(u'\xa0', ' ')
class PatchNoteGrabber:
    """Scrape devlog posts with a Selenium driver and write them as wiki markup.

    Output goes to ``qud_wiki_content.txt`` inside ``output_folder`` and,
    when ``separate_beta`` is true, beta notes go to ``qud_beta_content.txt``
    in the same folder. The date of the newest post handled by
    ``read_most_recent`` is persisted in ``last_date_file`` so that a later
    run can stop once it reaches posts it has already seen.

    The instance owns open files (and possibly a browser); call ``close()``
    when done, or use the instance as a context manager.
    """

    def __init__(self, output_folder='Outputs/qud_patch_notes', driver=None,
                 base_url='https://freeholdgames.itch.io/cavesofqud/devlog', date_format='%B %d, %Y',
                 separate_beta=True, last_date_file='last_date.txt'):
        """Open the output files and prepare the browser driver.

        Args:
            output_folder: Directory for the output files; created (including
                missing parents) when it does not exist.
            driver: Selenium driver to use. When ``None`` a new Firefox
                driver is started and is shut down again by ``close()``.
            base_url: Devlog index page opened by ``read_most_recent``.
            date_format: ``strptime``/``strftime`` format used for post
                header dates and for the stored last date.
            separate_beta: When true, beta notes are written to their own
                file; otherwise everything goes to the main file.
            last_date_file: Path of the file holding the date of the newest
                post processed so far.

        Raises:
            ValueError: If ``last_date_file`` exists but its content does
                not match ``date_format``.
        """
        self.base_url = base_url
        self.date_format = date_format
        self.last_date_file = last_date_file
        # Parse the stored date before acquiring anything, so a malformed
        # date file cannot leave open files or a browser window behind.
        self.last_date = self._load_last_date()

        self.output_folder = output_folder
        # makedirs: the default folder is nested, which plain mkdir cannot
        # create when the parent directory is missing.
        os.makedirs(self.output_folder, exist_ok=True)
        # Explicit UTF-8 so non-ASCII characters in the notes do not depend
        # on the platform's default encoding.
        self.output_file = open(os.path.join(self.output_folder, 'qud_wiki_content.txt'), 'w',
                                encoding='utf-8')
        self.beta_file = None
        try:
            if separate_beta:
                self.beta_file = open(os.path.join(self.output_folder, 'qud_beta_content.txt'), 'w',
                                      encoding='utf-8')
            # Remember whether the browser is ours to shut down completely.
            self._owns_driver = driver is None
            self.driver = webdriver.Firefox() if driver is None else driver
        except Exception:
            self._close_files()
            raise

    def __enter__(self):
        """Return the grabber itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the driver and output files when leaving a ``with`` block."""
        self.close()

    def _load_last_date(self):
        """Return the date stored in ``last_date_file``, or ``None`` if the file is missing or blank."""
        try:
            with open(self.last_date_file, 'r') as date_file:
                # strip(): a trailing newline would make strptime fail.
                stored = date_file.read().strip()
        except FileNotFoundError:
            return None
        if not stored:
            return None
        return datetime.strptime(stored, self.date_format)

    def _close_files(self):
        """Close whichever output files have been opened."""
        for handle in (getattr(self, 'output_file', None), getattr(self, 'beta_file', None)):
            if handle is not None:
                handle.close()

    def _write_output(self, output_strings):
        """Write the 'main' text, and the 'beta' text when a beta file exists."""
        if not output_strings:
            return
        self.output_file.write(output_strings['main'])
        if self.beta_file is not None:
            self.beta_file.write(output_strings['beta'])

    def output_selector(self, patch_type):
        """Return the output bucket for ``patch_type``.

        Without a separate beta file everything is routed to ``'main'``;
        otherwise ``patch_type`` is returned unchanged.
        """
        if self.beta_file is not None:
            return patch_type
        return 'main'

    def read_patch_notes(self, skip_headerless=True):
        """Convert the post currently open in the driver into wiki markup.

        Args:
            skip_headerless: When true, a post whose ``<h1>`` does not match
                ``<title> - <date>`` is skipped. When false, such a post is
                still converted with a placeholder date.

        Returns:
            tuple: ``(original_date, output_strings)``. ``original_date`` is
            the parsed header date, or ``None`` when it is absent or cannot
            be parsed. ``output_strings`` maps ``'main'`` and ``'beta'`` to
            the generated markup. A skipped post yields ``(None, None)``.
        """
        header_text = self.driver.find_element(by=By.XPATH, value='//section/h1').text
        notes_type = 'beta' if 'beta' in header_text else 'main'

        header_pattern = regex.compile(r'[^\n]+ - (?P<date>[^\n]+)')
        match = header_pattern.match(header_text)
        if match:
            date = match.group('date')
            try:
                original_date = datetime.strptime(date, self.date_format)
            except ValueError:
                print('Could not parse', date, 'as date')
                original_date = None
            print(date)
        elif skip_headerless:
            return None, None
        else:
            date = 'TODO FILL IN DATE'
            original_date = None

        post_body = self.driver.find_element(by=By.XPATH, value="//section[contains(@class, 'post_body')]")
        soup = BeautifulSoup(post_body.get_attribute('outerHTML'), features='lxml')
        section = soup.body.section

        output_strings = {
            'main': '',
            'beta': '',
        }
        # Paragraph lines starting with digits, dots or slashes are treated
        # as section headers (presumably version numbers or dates).
        # Compiled once here instead of once per line.
        section_pattern = regex.compile(r"[\d./]+( - '?beta'? branch)?")
        added_section = False
        for child in section.children:
            if child.name == 'p':
                for line in child.stripped_strings:
                    if not section_pattern.match(line):
                        continue
                    if 'beta' in line:
                        notes_type = 'beta'
                    elif notes_type == 'beta':
                        print("GOT UNEXPECTED NEW ITEM DURING BETA?")
                    elif output_strings['main'] != '':
                        # A further unlabelled header after main content
                        # already exists is filed under beta.
                        notes_type = 'beta'
                    if 'beta' not in line and notes_type == 'beta':
                        line += " - 'beta' branch"
                    target = self.output_selector(notes_type)
                    output_strings[target] += f'=== {line} ===\n'
                    output_strings[target] += f'[{self.driver.current_url} Released {date}.]\n'
                    added_section = True
            if child.name == 'ul':
                output_strings[self.output_selector(notes_type)] += make_wiki_list(dictify(child))

        if not added_section:
            # No section header was found in the body: head the whole post
            # with its date instead.
            if notes_type == 'beta' and 'beta' not in date:
                date += " (beta)"
            header = f'=== {date} ===\n[{self.driver.current_url} Original patch notes]\n'
            target = self.output_selector(notes_type)
            output_strings[target] = header + output_strings[target]
        return original_date, output_strings

    def read_most_recent(self, max=100, date_limit=True):
        """Walk the devlog index from the top and write out the newest posts.

        Args:
            max: Maximum number of posts to open. (The name shadows the
                builtin but is kept for backward compatibility.)
            date_limit: When true, stop at the first post whose date is not
                newer than the stored last date. Posts without a usable date
                never trigger the stop.

        After the loop, the first parsed post date encountered is stored in
        ``last_date_file``.
        """
        most_recent_date = None
        self.driver.get(self.base_url)  # could maybe be Steam
        for i in range(max):
            # Looked up again on every pass, as the page is left and
            # re-entered between iterations.
            update_links = self.driver.find_elements(
                by=By.CLASS_NAME, value='read_all_link')
            if i >= len(update_links):
                # Fewer posts on the page than ``max``: nothing left to read.
                break
            update_links[i].click()
            original_date, output_strings = self.read_patch_notes()
            if most_recent_date is None:
                most_recent_date = original_date
            print('Date:', original_date)
            is_new = (original_date is None or self.last_date is None
                      or original_date > self.last_date)
            if date_limit and not is_new:
                print('Reached last date')
                break
            self._write_output(output_strings)
            self.driver.back()
        if most_recent_date is not None:
            with open(self.last_date_file, 'w') as date_file:
                date_file.write(most_recent_date.strftime(self.date_format))

    def read_specified(self, url):
        """Open ``url`` and write its patch notes, even when the header has no date."""
        self.driver.get(url)
        original_date, output_strings = self.read_patch_notes(skip_headerless=False)
        self._write_output(output_strings)

    def close(self):
        """Release the browser and close the output files.

        A driver created by this object is quit entirely; a driver supplied
        by the caller only has its current window closed, as before. The
        output files are closed even if shutting down the driver fails.
        """
        try:
            if getattr(self, '_owns_driver', False):
                self.driver.quit()
            else:
                self.driver.close()
        finally:
            self._close_files()
if __name__ == '__main__':
    # Default run: read the newest devlog posts from the index page.
    # To convert one specific post instead, replace the read_most_recent()
    # call with read_specified(), e.g.:
    # grab.read_specified('https://freeholdgames.itch.io/cavesofqud/devlog/348340/the-deep-jungle-feature-arc-is-here')
    # grab.read_specified('https://freeholdgames.itch.io/cavesofqud/devlog/464757/the-moon-stair-is-here')
    grab = PatchNoteGrabber()
    try:
        grab.read_most_recent()
    finally:
        # Always release the browser, even when scraping fails part-way.
        grab.close()
    print('Done')