wwg.py
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Coded by Sam (alex@0xdeadcode.se)
# http://0xdeadcode.se
# Twitter: @_tmp0
import threading, Queue, urllib2, StringIO, re, sys, os, optparse, inspect, signal
reload(sys)
sys.setdefaultencoding("latin-1")
# Add Wikipedia links you do not wish the crawler to visit here
bad_urls = ['/wiki/Wikipedia:Tutorial/Registration', '/wiki/Wikipedia:Authority_control',
            '/wiki/Help:Contents', '/wiki/Wikipedia:External_links',
            '/wiki/Wikipedia:Contact_us', '/wiki/Wikipedia:Contact_us_-_Readers',
            '/wiki/Wikipedia:Contact_us_-_Press', '/wiki/Press/Contact/Chapters',
            '/wiki/Wikipedia:General_disclaimer', '/wiki/Wikipedia:Stub',
            '/wiki/Wikipedia:Subpages', '/wiki/Wikipedia',
            '/wiki/Wikipedia_talk:Do_not_use_subpages', '/wiki/Terms_of_Use']
# Add/Remove any word or character you wish to skip
banned = [':', ';', '\\', '/', '[', '{', ']', '}', '&', '*', '(', ')', '.', ',', '\'', '"', '#', '=', '”', '’',
          'Deltagarportalen', 'Wikipedia', 'Wikimedia', 'diskussionssidan', 'Utskriftsvänlig', 'engelskspråkiga',
          'användarvillkor', 'Grundprinciperna', 'ქართული', '日本語', 'oʻzbekcha', 'العربية', '⇒', '·',
          'ifwindowmw', '한국어', 'മലയാളം', 'việt', 'тыла', 'Перем', 'kreyòl', 'ไทย', '粵語', 'پنجابی', 'Български', 'کوردی',
          'தமிழ்', 'བོད་ཡིག', 'Հայերեն', 'Авар', 'ᨅᨔ', 'తెలుగు', 'avañeẽ', 'Српски', 'српскохрватски', 'မြန်မာဘာသာ',
          'ಕನ್ನಡ', 'Коми', 'Эрзянь', 'Чӑвашла', 'Беларуская', 'ଓଡ଼ିଆ', '⇐', 'documentwriteu003cdiv', 'Русский',
          'Македонски', 'Лакку', 'ܐܪܡܝܐ', 'فارسی', 'ትግርኛ', '客家語hak-kâ-ngî', 'ייִדיש', 'اردو', 'Ελληνικά', 'მარგალური',
          'ᨕᨘᨁᨗ', '贛語', 'Аҧсшәа', 'עברית', 'Українська', '中文', 'ગુજરાતી', 'тарашкевіца', '文言', 'Ирон', 'नेपाल']
firstlayerqueue = Queue.Queue()
secondlayerqueue = Queue.Queue()
wordqueue = Queue.Queue()
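# Crawl layout: worker threads pop pages from firstlayerqueue, push newly
# discovered article links onto secondlayerqueue and extracted words onto
# wordqueue; when the first layer runs dry, the main loop below refills it
# from the second layer, crawling breadth-first one layer at a time.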
class Crawl(threading.Thread):
    def __init__(self, firstlayerqueue, secondlayerqueue, wordqueue):
        threading.Thread.__init__(self)
        self.firstlayerqueue = firstlayerqueue
        self.secondlayerqueue = secondlayerqueue
        self.wordqueue = wordqueue
    def run(self):
        self.url = self.firstlayerqueue.get()
        #print 'IN THREAD: ' + self.url # uncomment to watch a lot of spam
        try:
            self.req = urllib2.Request(self.url, headers={'User-Agent': "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)"}) # :)
            self.con = urllib2.urlopen(self.req)
            self.data = self.con.read()
        except Exception:
            self.firstlayerqueue.task_done()
            return
        self.urls = self.getUrls(self.data)
        self.data = self.getWords(self.data)
        self.wordqueue.put(self.data)
        for url in self.urls:
            self.secondlayerqueue.put(url)
        self.firstlayerqueue.task_done()
    # please don't read this part :(
    def getWords(self, data):
        global banned, min, max
        self.rv = []
        self.skip = True
        for line in StringIO.StringIO(data):
            line = line.strip('\n').strip('\r').strip('\t')
            # Only harvest words between the start of the article body and the
            # References headline; check the raw line, since these markers are
            # stripped along with every other tag below
            if '<div id="content" class="mw-body" role="main">' in line:
                self.skip = False
            if not self.skip and '<span class="mw-headline" id="References">' in line:
                self.skip = True
            if self.skip:
                continue
            for word in re.sub('<.*?>', ' ', line).split(' '):
                if len(word) >= min and len(word) <= max:
                    for ban in banned:
                        word = word.replace(ban, '') # replace() already removes every occurrence
                    if word == '' or word == ' ' or len(word) < min:
                        continue
                    self.rv.append(word.lower())
        return list(set(self.rv))
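    # Rough illustration: with the default 6..30 length filter, a body line like
    #   <p>The <b>european</b> continent</p>
    # yields ['european', 'continent'] (order not guaranteed): tags stripped,
    # short words dropped, banned tokens removed, lowercased, deduplicated.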
    def getUrls(self, data):
        global bad_urls
        self.rv = []
        for line in StringIO.StringIO(data):
            for link in re.findall(r'<a href=".*?">.+</a>', line):
                try:
                    # Pull the /wiki/... path out of the anchor; pages under /w/
                    # (index.php etc.) never match this pattern
                    self.urlvalue = re.search('/wiki/.*?"', link).group(0)
                    self.urlvalue = str(URLVALUE) + self.urlvalue.strip('"')
                except AttributeError: # no /wiki/ path in this link
                    continue
                if self.urlvalue.endswith(('.jpg', '.svg', '.png', '.gif')):
                    continue
                if '/wiki/Wikipedia:' in self.urlvalue or '/wiki/Portal:' in self.urlvalue or '/wiki/Special:' in self.urlvalue or '%' in self.urlvalue or '/wiki/Template' in self.urlvalue:
                    continue
                if self.urlvalue in bad_urls: # honour the blacklist defined at the top
                    continue
                self.rv.append(self.urlvalue)
        return list(set(self.rv))
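    # Rough illustration: with a start URL on http://en.wikipedia.org, an anchor
    #   <a href="/wiki/Europe" title="Europe">Europe</a>
    # yields 'http://en.wikipedia.org/wiki/Europe'; images, meta pages and the
    # bad_urls blacklist are filtered out.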
def writeWords():
    global outputfile, words, wordqueue
    f = open(outputfile, 'a') # open once instead of re-opening per word
    while not wordqueue.empty():
        data = wordqueue.get()
        for line in data:
            try:
                line_encoded = line.encode('ISO-8859-1')
                #line_encoded = line.encode('UTF-8') # might want to uncomment
            except UnicodeError:
                continue
            f.write(line_encoded.lower() + '\n')
            words += 1
    f.close()
##################
def handler(signum, frame): # http://stackoverflow.com/questions/1112343/how-do-i-capture-sigint-in-python
    global words, outputfile
    if not wordqueue.empty():
        print '\nHold on cowboy, let me finish the running threads and dump the words into %s' % outputfile
        writeWords()
    print 'Done. Wrote %i words into %s' % (words, outputfile)
    quit()
signal.signal(signal.SIGINT, handler)
###################
filename = os.path.split(inspect.getfile(inspect.currentframe()))
parser = optparse.OptionParser('Usage: ' + filename[1] + ' <args>' + '\nWikipedia Wordlist Generator by @_tmp0\nURL must be formatted as follows (most subdomains should work): '
    'http://en.wikipedia.org/wiki/wikipage\n\nExample: python %s -u http://en.wikipedia.org/wiki/Europe -o wordlist.txt -t 5\nIf no minimum or maximum length is set the script will save words between 6 and 30 characters long'
    '\n\nctrl+c to break\n\nI suggest doing something like this to clean the wordlist from duplicates:'
    ' sort -u wordlist.txt >> n_wordlist.txt' % filename[1])
parser.add_option('-u', dest='starturl', type='string', help='Wikipedia URL to use as start for the crawler')
parser.add_option('-t', dest='nrthreads', type='int', help='Amount of threads')
parser.add_option('-o', dest='outputfile', type='string', help='File to write output to')
parser.add_option('-m', dest='min', type='int', help='Minimum length of words')
parser.add_option('-M', dest='max', type='int', help='Maximum length of words')
(options, args) = parser.parse_args()
nrthreads = options.nrthreads
starturl = options.starturl
outputfile = options.outputfile
min = options.min
max = options.max
if starturl is None or outputfile is None or nrthreads is None:
    parser.print_help() # print_help() writes the usage itself
    quit(0)
if min is None:
    print '[!] No minimum length supplied. Setting minimum length to 6'
    min = 6
if max is None:
    print '[!] No maximum length supplied. Setting maximum length to 30'
    max = 30
words = 0
URLVALUE = starturl.split('/wiki')[0]
bad_urls = [URLVALUE + bad_url for bad_url in bad_urls] # prefix the blacklist with the chosen domain
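# e.g. with -u http://en.wikipedia.org/wiki/Europe, '/wiki/Help:Contents'
# becomes 'http://en.wikipedia.org/wiki/Help:Contents'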
firstlayerqueue.put(starturl)
while 1: # generate first crawl content
    thread = Crawl(firstlayerqueue, secondlayerqueue, wordqueue)
    thread.daemon = True
    thread.start()
    if thread.isAlive():
        break
int_count = 0
while 1:
    if firstlayerqueue.empty():
        # refill the first layer with the links found during the last round
        while not secondlayerqueue.empty():
            firstlayerqueue.put(secondlayerqueue.get())
        writeWords()
        print '\nWrote %i words to %s. Queue empty, filling...' % (words, outputfile)
        words = 0
    if not firstlayerqueue.empty():
        threads = []
        for i in range(nrthreads):
            if not firstlayerqueue.empty():
                thread = Crawl(firstlayerqueue, secondlayerqueue, wordqueue)
                thread.daemon = True
                thread.start()
                threads.append(thread)
        for thread in threads: # join every worker, not just the last one started
            thread.join(5)
        int_count += 1
        if int_count == 2:
            print 'Joined %i threads. Queue size: %i' % (len(threads), firstlayerqueue.qsize())
            int_count = 0
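# Example run (URL and filenames are placeholders, as in the usage text above):
#   python wwg.py -u http://en.wikipedia.org/wiki/Europe -o wordlist.txt -t 5
#   sort -u wordlist.txt >> n_wordlist.txt   # de-duplicate afterwards, as suggested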