forked from runelite/runelite-wiki-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
api.py
64 lines (53 loc) · 1.63 KB
/
api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import json
import urllib.request
import urllib.parse
from typing import *
# When True, query_category returns the contents of an existing
# "<category_name>.cache.json" file instead of querying the wiki again.
use_cache: bool = True
# HTTP headers attached to every request made by get_wiki_api.
user_agent: Dict[str, str] = {"User-Agent": "Runelite Wiki Scraper/1.0 (+abex@runelite.net)"}
def get_wiki_api(args: Dict[str, str], continueKey: str) -> Iterator[Any]:
    """
    get_wiki_api yields each decoded json response of a wiki api query,
    re-issuing the request for as long as the response carries a
    "continue" object

    args holds the query string parameters. it is copied, never modified,
    and "format" is always forced to "json"

    continueKey is the name of the queried module's continuation parameter
    (e.g. "cmcontinue"). it is kept for backward compatibility: the whole
    "continue" object of the previous response is forwarded, so the key no
    longer has to be picked out of it
    """
    # Work on a private copy so the caller's dict can be reused safely.
    request = dict(args)
    request["format"] = "json"

    continuation: Dict[str, str] = {}
    while True:
        # Original request plus the most recent continuation values only;
        # values from older responses are replaced, not accumulated.
        params = {**request, **continuation}
        url = "https://oldschool.runescape.wiki/api.php?" + urllib.parse.urlencode(params)
        print("Grabbing " + url)
        with urllib.request.urlopen(urllib.request.Request(url, headers=user_agent)) as raw:
            js = json.load(raw)
        yield js

        if "continue" not in js:
            return
        continuation = js["continue"]
def query_category(category_name: str) -> Dict[str, str]:
    """
    query_category returns a dict of page title to page wikitext
    you can then use mwparserfromhell to parse the wikitext into
    an ast

    when use_cache is set and "<category_name>.cache.json" exists, its
    contents are returned without contacting the wiki. otherwise the
    category is fetched and the result is written to that file
    """
    cache_file_name = category_name + ".cache.json"
    if use_cache and os.path.isfile(cache_file_name):
        with open(cache_file_name, "r", encoding="utf-8") as fi:
            return json.load(fi)

    # Step 1: collect the id of every page in the category.
    pageids = [
        str(page["pageid"])
        for res in get_wiki_api(
            {
                "action": "query",
                "list": "categorymembers",
                "cmlimit": "500",
                "cmtitle": "Category:" + category_name,
            }, "cmcontinue")
        for page in res["query"]["categorymembers"]
    ]

    # Step 2: fetch the wikitext of those pages, 50 ids per request.
    pages = {}
    for i in range(0, len(pageids), 50):
        for res in get_wiki_api(
            {
                "action": "query",
                "prop": "revisions",
                "rvprop": "content",
                "pageids": "|".join(pageids[i:i + 50]),
            }, "rvcontinue"):
            for page in res["query"]["pages"].values():
                # An entry without a "revisions" list has no wikitext to
                # record in this response; skip it instead of raising
                # KeyError and losing everything fetched so far.
                revisions = page.get("revisions")
                if not revisions:
                    continue
                pages[page["title"]] = revisions[0]["*"]

    with open(cache_file_name, "w", encoding="utf-8") as fi:
        json.dump(pages, fi)
    return pages