-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
112 lines (100 loc) · 3.81 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import pickle
import time
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from pywikiapi import wikipedia
from wordcloud import WordCloud, STOPWORDS
def save_script(module):
    """
    Save the Lua source of one module into the local "module" directory.

    The file is named "<title>_<pageid>.lua"; every "/" in the title is
    replaced by "_" so that a subpage title does not become a nested path.
    The directory is created when it does not exist yet.

    :param module: A parse result returned from the Wikimedia API; its
        ``title``, ``pageid`` and ``wikitext`` attributes are read
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs("module", exist_ok=True)
    file_name = f"module/{module.title.replace('/', '_')}_{module.pageid}.lua"
    # Explicit UTF-8: wikitext may contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to be able to encode them
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(module.wikitext)
def save_metainfo(pages: List[Dict], pages_error: List[int]):
    """
    Pickle the crawl results into two files in the working directory
    :param pages: A list of page info, written to "pages.dat"
    :param pages_error: A list of pageid where errors occurred, written to "pages_error.dat"
    """
    targets = (
        ("pages.dat", pages),
        ("pages_error.dat", pages_error),
    )
    # Each file is fully written and closed before the next one is opened
    for file_name, payload in targets:
        with open(file_name, "wb") as out:
            pickle.dump(payload, out)
def get_info_from_files() -> List[Dict[str, int]]:
    """
    Build page info from the entries already present in the "module" directory
    :return: A list with one dict per directory entry: its file name under
        'title' and its size in bytes under 'size'
    """
    return [
        {'title': file_name, 'size': os.path.getsize(f"module/{file_name}")}
        for file_name in os.listdir("module")
    ]
def get_pages() -> Tuple[List[Dict], List[int]]:
    """
    Get all lua modules from the Wiki, exclude doc pages

    Pages of namespace 828 are listed. A page is skipped when its title
    contains "/doc", "testcase" or "Module:User:", or when another page with
    the same root title (the part before the first "/") was already handled,
    so submodules are not downloaded. Every downloaded module is also written
    to disk with save_script().

    :return: A list of page info (title, pageid, size in bytes) and a list of
        pageids where errors occurred
    """
    site = wikipedia('en')
    pages = []
    error_pages = []
    # A set keeps the "root already handled" test O(1) for every listed page
    seen_roots = set()
    # aplimit="max" asks the API for the largest batch it allows per request
    for r in site.query(list='allpages', apnamespace="828", aplimit="max"):
        # Iterates in the results
        for page in r.allpages:
            title = page.title
            # Skip documentation pages, test cases and user modules
            if "/doc" in title or "testcase" in title or "Module:User:" in title:
                continue
            root = title.split("/")[0]
            if root in seen_roots:
                continue
            # Recorded before downloading, so submodules stay skipped even
            # when this download fails
            seen_roots.add(root)
            try:
                # Get module lua content
                for module in site.iterate("parse", pageid=page.pageid, prop="wikitext"):
                    data = {
                        'title': module.title,
                        'pageid': module.pageid,
                        # Size in bytes, matching the histogram label and the
                        # file sizes reported by get_info_from_files()
                        'size': len(module.wikitext.encode("utf-8")),
                    }
                    pages.append(data)
                    print(f"{module.title} successfully added")
                    save_script(module)
                    # Wait 1 second between downloads
                    time.sleep(1)
            except Exception:
                # Exception (not a bare except) lets Ctrl-C stop the crawl.
                # Saves pages that have errors; page.title is used because
                # `module` may be unbound or stale when the request fails.
                error_pages.append(page.pageid)
                print(f"An error occurred while downloading the module: {page.title}")
    return pages, error_pages
def make_graphics(pages):
    """
    Save histogram and word cloud

    Writes "results/histogram.html" (distribution of the 'size' values) and
    "results/World_Cloud_module_name.png" (word cloud built from the 'title'
    values). The "results" directory is created when missing. Nothing is
    written when pages is empty.

    :param pages: A list of page info; each item needs 'title' and 'size'
    """
    # Neither a histogram nor a word cloud can be built from no data
    if len(pages) == 0:
        print("No pages to plot")
        return
    # The output directory is not guaranteed to exist on a fresh checkout
    os.makedirs("results", exist_ok=True)
    df = pd.DataFrame.from_dict(pages)
    stopwords = set(STOPWORDS)
    stopwords.update(["module", "Module", "ISO"])
    # `labels` is keyed by column name, so the x axis title goes under
    # 'size'; the count axis has no column and is titled through the layout
    fig = px.histogram(df, x='size', labels={'size': "lua module size (bytes)"})
    fig.update_layout(yaxis_title="Count Files")
    fig.write_html("results/histogram.html")
    words = WordCloud(background_color='white',
                      width=1024,
                      height=512,
                      stopwords=stopwords
                      ).generate(' '.join(df['title']))
    plt.imshow(words)
    plt.axis('off')
    plt.savefig('results/World_Cloud_module_name.png')
# Script entry point: reuse cached data when available, otherwise crawl
if __name__ == '__main__':
    if not os.path.isfile("pages.dat"):
        if os.path.isdir("module"):
            # No pickle cache, but modules were saved by an earlier run:
            # rebuild the page info from the files on disk
            pages = get_info_from_files()
        else:
            # Nothing cached at all: download from the wiki and store the result
            pages, pages_error = get_pages()
            save_metainfo(pages, pages_error)
    else:
        # NOTE: pages.dat is a pickle; only load a file written by this script
        with open("pages.dat", "rb") as file:
            pages = pickle.load(file)
    make_graphics(pages)