'''
GLOBAL TODO:
1) Add weight 5 to words in the title - the title still needs to be scraped.
2) Some queries cause an app error due to request timeouts; the Mongo document layout may not scale.
3) Fix the UI/UX.
4) Add a Heroku config variable (heroku config:add PHANTOMJS_DIR=./vendor/phantomjs/bin/phantomjs), read it from the environment where the PhantomJS path is used (as is already done for the port), and default it to the local PhantomJS path.
'''
import urllib2
from urlparse import urljoin
import pymongo
import re
import nn
import time
import os
ignorewords = set(['the', 'http','com','not','he', 'she', 'this', 'of', 'so', 'about', 'a', 'to', 'and','in','is', 'you', 'comments','it','points',':','hours','ago','days','months','years', 'point', 'reply','0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','deleted','comment','OP','op','repost','imgur'])
mynet = nn.searchnet()
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
#from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.common.exceptions import InvalidElementStateException, TimeoutException, NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def initialize_driver():
display=None
#display = Display(visible=0, size=(800, 600))
#display.start()
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
"(KHTML, like Gecko) Chrome/15.0.87"
)
# PhantomJS binary: vendored on Heroku at ./vendor/phantomjs/bin/phantomjs;
# locally it was at /Users/shaunswanson/Downloads/phantomjs-1.9.2-macosx/bin
driver = webdriver.PhantomJS(executable_path='./vendor/phantomjs/bin/phantomjs', port=0, desired_capabilities=dcap)
#profile = FirefoxProfile() # FOR TESTING
#profile.set_preference("dom.max_script_run_time", 600) # too short ???
#profile.set_preference("dom.max_chrome_script_run_time", 600) # too short ???
#profile.set_preference('permissions.default.image', 2) # disable images
#profile.set_preference('plugin.scan.plid.all', False) # disable plugin loading crap
#profile.set_preference('dom.disable_open_during_load', True) # disable popups
#profile.set_preference('browser.popups.showPopupBlocker', False)
#driver = webdriver.Firefox(profile)
#driver.set_window_size(1024, 768) # FOR TESTING
#driver.set_page_load_timeout(30)# FOR TESTING
return (driver, display)
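# Sketch for GLOBAL TODO item 4 (an assumption, not wired in yet): read the PhantomJS
# location from a Heroku config variable instead of hard-coding it, falling back to the
# vendored path used above, e.g.:
#   phantomjs_path = os.environ.get('PHANTOMJS_DIR', './vendor/phantomjs/bin/phantomjs')
#   driver = webdriver.PhantomJS(executable_path=phantomjs_path, port=0, desired_capabilities=dcap)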
class crawler:
# Initialize the crawler with the catbase database
def __init__(self):
connection_string = os.environ.get("MONGOLAB_URI", 'mongodb://localhost/catbase')
self.conn = pymongo.MongoClient(connection_string)
self.db = self.conn.heroku_app19457731
# print status of database
print "NUMBER OF EDGES IN NEURAL NETWORK: " + str(self.db.nn.count()) + '\n'
print "NUMBER OF WORDS IN DATABASE: " + str(self.db.words.count()) + '\n'
mywords = self.db.words.find()
urls = set()
for db_word in mywords:
#print "db_word['word']: " + str(db_word['word']) + '\n'
for db_url in db_word['picurls']:
urls.add(db_url['picurl'])
print "NUMBER OF PICTURES IN DATABASE: " + str(len(urls)) + '\n'
# Close the database
def __del__(self):
self.conn.close()
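# Shape of a document in the words collection, as built by addtoindex below
# (the URL and positions shown are only illustrative):
#   {'word': 'cat',
#    'picurls': [{'picurl': 'http://example.com/some-picture.jpg', 'locations': [3, 17]}]}
# i.e. one document per word, listing every picture URL the word was seen with and the
# word's positions within that picture's comment text.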
# Index an individual page
def addtoindex(self, content, picurl):
if self.isindexed(picurl):
print "already indexed!" + '\n'
return
# Get the individual words
text = content
#print "text: " + str(text) + '\n'
words = self.separatewords(text)
#print "words: " + str(words) + '\n'
# Link each word to the picurl on the page
if len(words) < 50: return
for i in range(len(words)):
#print "<-- i: " + str(i) + '\n'
#print "<- url: " + str(url) + '\n'
word = words[i]
word = word.lower()
#print "word: " + str(word) + '\n'
if word in ignorewords: continue
if word.find("http") != -1: continue
db_word = self.db.words.find_one({'word': word}) # assumes there's never a duplicate for a given word
#print "db_word: " + str(db_word) + '\n'
if db_word is not None:
j = 0
isFound = False
for db_url in db_word['picurls']:
if db_url['picurl'] == picurl:
#print "db_word['urls'][j]: " + str(db_word['urls'][j]) + '\n'
#print "db_word['urls'][j]['locations']: " + str(db_word['urls'][j]['locations']) + '\n'
templist = db_word['picurls'][j]['locations']
#print "templist: " + str(templist) + '\n'
if templist is not None:
templist.append(i)
else:
templist = [i]
db_word['picurls'][j]['locations'] = templist
self.db.words.save(db_word)
isFound = True
j += 1
if isFound == False:
db_word['picurls'] = db_word['picurls'] + [{'picurl': picurl,'locations': [i]}]
self.db.words.save(db_word)
#print "updated db_word: " + str(db_word) + '\n'
else:
wordJSON = {'word': word, 'picurls': [{'picurl': picurl, 'locations': [i]}]}
#print "created db_word: " + str(wordJSON) + '\n'
self.db.words.insert(wordJSON)
# Split the text into words on any run of non-alphanumeric characters
def separatewords(self, text):
splitter = re.compile('\\W+') # (TODO) improve this (possibly use a stemming algorithm to remove suffixes from words)
return [s.lower() for s in splitter.split(text) if s != '']
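# Example (illustrative): separatewords("Funny cat, 12 points!") -> ['funny', 'cat', '12', 'points']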
# Return True if this url is already indexed
def isindexed(self, url):
for db_word in self.db.words.find():
#print "each_word: " + str(db_word) + '\n'
#print "each_word['urls']: " + str(db_word['urls']) + '\n'
for db_url in db_word['picurls']:
#print "db_url: " + str(db_url) + '\n'
if db_url['picurl'] == url:
return True
return False
# Add a link between two pages
def addlinkref(self, urlFrom, urlTo, linkText):
pass
# Pull pages from the urls collection (used as a work queue), index each one,
# and push any gallery links found on the page back onto the queue
def crawl(self):
print "self.db.urls.count(): " + str(self.db.urls.count()) + '\n'
while self.db.urls.count() > 0:
db_page = self.db.urls.find_one()
page = db_page['url']
print "<-- crawling " + str(page) + '\n'
self.db.urls.remove({'url': page})
driver, display = initialize_driver()
try:
driver.get(page)
comment_content_elements = driver.find_elements_by_xpath('//div[contains(@id,"captions")]')
comment_content = None
if len(comment_content_elements) > 0:
comment_content = comment_content_elements[0].text.encode("utf-8", "ignore")
print "comment_content: " + str(comment_content) + '\n'
picurls = driver.find_elements_by_xpath('//div[contains(@class,"stipple-dottable-wrapper")]/img')
print "picurls: " + str(picurls) + '\n'
if len(picurls) < 1:
picurls = driver.find_elements_by_xpath('//div[contains(@id,"image")]/div/img')
if len(picurls) < 1:
picurls = driver.find_elements_by_xpath('//div[contains(@class,"image")]/div/div/a/img')
if len(picurls) < 1:
picurls = driver.find_elements_by_xpath('//div[contains(@class,"stipple-dottable-wrapper")]/a/img')
if len(picurls) > 0:
realpicurl = picurls[0].get_attribute('src')
if realpicurl.find("gif") != -1:
realpicurl = None
else:
realpicurl = None
print "realpicurl: " + str(realpicurl) + '\n'
links = driver.find_elements_by_xpath('//a[contains(@href,"gallery")]')
print "len(links): " + str(len(links)) + '\n'
# (TODO) update mongo collection serving as a queue for workers in parallel
for link in links:
reallink = link.get_attribute('href')
#print "reallink: " + str(reallink) + '\n'
if self.db.urls.find_one({'url': reallink}) is None:
self.db.urls.insert({'url': reallink})
except:
print "Could not open %s" % page
driver.close() # clean up the PhantomJS process before moving on to the next page
continue
#raise # FOR TESTING
if realpicurl is not None:
if comment_content is not None:
self.addtoindex(comment_content, realpicurl)
driver.close()
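# Usage sketch (an assumption, not part of the original file): crawl() reads from the urls
# collection, so it has to be seeded with at least one gallery page first, e.g.:
#   c = crawler()
#   seed = 'http://imgur.com/gallery'  # hypothetical seed URL
#   if c.db.urls.find_one({'url': seed}) is None:
#       c.db.urls.insert({'url': seed})
#   c.crawl()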
class searcher:
def __init__(self):
connection_string = os.environ.get("MONGOLAB_URI", 'mongodb://localhost/catbase')
self.conn = pymongo.MongoClient(connection_string)
self.db = self.conn.get_default_database()
def __del__(self):
self.conn.close()
def getunrankedmatches(self, q):
results = []
finalresults = set()
urls = set()
urls2 = set()
# Split the query into lowercase words so they match how addtoindex stores them
words = q.lower().split(' ')
print "[getunrankedmatches] words: " + str(words) + '\n'
isFirst = True
for word in words:
db_word = self.db.words.find_one({'word': word}) # assumes there's never a duplicate for a given word
if db_word is not None:
for db_url in db_word['picurls']:
if isFirst == True:
urls.add(db_url['picurl'])
else:
urls2.add(db_url['picurl'])
if isFirst == False:
urls = urls.intersection(urls2)
urls2 = set() # reset so each subsequent word is intersected against its own URL set
print "[getunrankedmatches] urls: " + str(urls) + '\n'
isFirst = False
finalresults = urls
print "[getunrankedmatches] list(finalresults): " + str(list(finalresults)) + '\n'
return list(finalresults)
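# Example (illustrative): for the query 'grumpy cat', this returns only the picture URLs
# that appear in both the 'grumpy' word document and the 'cat' word document.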
# Take a dictionary of urls and scores and return a new dictionary with the same urls, but with scores between 0 and 1
def normalizescores(self, scores, smallIsBetter = 0):
vsmall = 0.00001 # Avoid division by zero errors
if smallIsBetter:
minscore = min(scores.values())
return dict([(u, float(minscore)/max(vsmall, l)) for (u, l) in scores.items()]) # divide the minimum by each score, not by the constant 1
else:
if scores is not None and len(scores.values()) > 0:
maxscore = max(scores.values())
else:
maxscore = 0
if maxscore == 0: maxscore = vsmall
print "[normalizescores] normalizescores(scores): " + str(dict([(u, float(c)/maxscore) for (u,c) in scores.items()])) + '\n'
return dict([(u, float(c)/maxscore) for (u,c) in scores.items()])
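# Worked example (illustrative): with the default smallIsBetter=0 and scores
# {'a.jpg': 2.0, 'b.jpg': 4.0}, maxscore is 4.0 and the result is {'a.jpg': 0.5, 'b.jpg': 1.0};
# with smallIsBetter=1 the same input gives {'a.jpg': 1.0, 'b.jpg': 0.5}.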
def getscoredlist(self, words, results):
totalscores = dict([(result, 0.0) for result in results])
# (TODO) add more scores weighted with the neural network score
weights = [(0.35, self.frequencyscore(words, results)), (0.65, self.nnscore(words, results))]
print "[getscoredlist] weights: " + str(weights) + '\n'
for (weight, scores) in weights:
for url in totalscores:
totalscores[url] += weight*scores[url]
print "[getscoredlist] totalscores: " + str(totalscores) + '\n'
return totalscores
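# Worked example (illustrative): a URL with a frequencyscore of 0.8 and an nnscore of 0.4
# gets a total score of 0.35*0.8 + 0.65*0.4 = 0.28 + 0.26 = 0.54.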
def query(self, q):
results = self.getunrankedmatches(q)
print "[query] results" + str(results) + '\n'
words = q.lower().split(' ')
scores = self.getscoredlist(words, results)
rankedscores = sorted([(score, url) for (url, score) in scores.items()], reverse=1)
for (score, url) in rankedscores[0:10]:
print '%f\t%s' % (score, url)
print "[query] [r[1] for r in rankedscores[0:10]]: " + str([r[1] for r in rankedscores[0:10]]) + '\n'
return words, [url[1] for url in rankedscores[0:10]]
def frequencyscore(self, words, urls):
counts = dict([(url,0) for url in urls])
for word in words:
db_word = self.db.words.find_one({'word': word}) # assumes there's never a duplicate for a given word
if db_word is not None:
for db_url in db_word['picurls']:
if db_url['picurl'] in counts: # counts is keyed by the result urls, so this replaces the inner loop over urls
counts[db_url['picurl']] += len(db_url['locations'])
print "[frequencyscore] counts: " + str(counts) + '\n'
return self.normalizescores(counts)
def nnscore(self, words, urls):
nnres = mynet.getresult(words, urls)
print "[nnscore] urls: " + str(urls) + '\n'
scores = dict([(urls[i], nnres[i]) for i in range(len(urls))])
print "[nnscore] scores: " + str(scores) + '\n'
return self.normalizescores(scores)
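# Usage sketch (an assumption - the original file defines no entry point; this only
# illustrates how the searcher class above is called, and it needs a reachable MongoDB
# instance with an already-indexed words collection):
if __name__ == '__main__':
    s = searcher()
    words, picurls = s.query('cat')  # 'cat' is just an example query
    for picurl in picurls:
        print picurl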