# CrawlerManager.py
import threading
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from Crawler import Crawler


class CrawlerManager:
    """Runs a pool of Crawler instances across worker threads and collects their cookies."""
    websiteDir = "websites/"

    def __init__(self, websites, hops, numThreads=1):
        self.numThreads = min(len(websites), numThreads)
        self.hops = hops
        self.chromeOptions = Options()
        self.websites = websites
        self.crawlers = []
        self.threads = []
        self.drivers = []
        self.allCookies = dict()
        self.setup_options()
        self.init_drivers()
    # Sets up the Chrome options. Note that Chrome switches need a leading
    # "--"; without it they are not recognized as switches.
    def setup_options(self):
        print("Setting up chrome options...")
        self.chromeOptions.add_argument("--headless")
        gl_spoofed_user_agent = ("Mozilla/5.0 (X11; Linux x86_64) "
                                 "AppleWebKit/537.36 "
                                 "(KHTML, like Gecko) Chrome/70.0.3538.77 "
                                 "Safari/537.36")
        self.chromeOptions.add_argument("--user-agent=%s" % gl_spoofed_user_agent)
        self.chromeOptions.add_argument("--disable-extensions")
        self.chromeOptions.add_argument("--ignore-certificate-errors")
        self.chromeOptions.add_argument("--incognito")
        self.chromeOptions.add_argument("--disable-gpu")
        self.chromeOptions.add_argument("--disable-xss-auditor")
        # self.chromeOptions.add_argument("--disable-background-networking")
        self.chromeOptions.add_argument("--mute-audio")
        self.chromeOptions.add_argument("--disable-notifications")
        self.chromeOptions.add_argument("--allow-running-insecure-content")
        self.chromeOptions.add_argument("--log-level=3")
    # def setup_urls(self, files):
    #     print("Initializing urls...")
    #     for file in files:
    #         file_stream = open(self.websiteDir + file, "r")
    #         line = file_stream.readline().replace("\n", "")
    #         while line != "":
    #             self.urls.append(line)
    #             line = file_stream.readline().replace("\n", "")
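
    # A self-contained sketch of the same idea as the commented-out
    # setup_urls above (this helper is hypothetical and not part of the
    # original file): it reads one URL per line from each file and returns
    # them as a list, so the result can be passed straight to __init__.
    @staticmethod
    def load_urls(directory, files):
        urls = []
        for name in files:
            # The context manager closes the file even if reading fails.
            with open(directory + name, "r") as file_stream:
                for line in file_stream:
                    line = line.strip()
                    if line:  # Skip blank lines.
                        urls.append(line)
        return urls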
    def init_drivers(self):
        for _ in range(self.numThreads):
            self.drivers.append(webdriver.Chrome(options=self.chromeOptions))
    def init_crawlers(self):
        print("Initializing first wave of crawlers...")
        for x in range(self.numThreads):
            if len(self.websites) == 0:
                print("No more websites left")
                break
            self.crawlers.append(Crawler(self.drivers[x], self.websites.pop(), hops=self.hops))
            self.threads.append(threading.Thread(target=self.crawlers[x].start_crawl))
        # Start the initial wave, staggering startups so page loads do not
        # overload the CPU all at once.
        for thread in self.threads:
            thread.start()
            time.sleep(0.25)
    def start(self):
        self.init_crawlers()
        while self.websites:
            # Polling could be optimized with callbacks and per-crawler ids.
            self.deploy_crawlers()
            time.sleep(0.5)
        self.left_over_threads()
        print("All threads finished")
    # Checks which threads have died so new ones can be started.
    def deploy_crawlers(self):
        for x in range(self.numThreads):
            if not self.websites:  # Stop once no websites are left to visit.
                break
            if not self.threads[x].is_alive():  # Recycle the finished slot.
                finished_crawler = self.crawlers[x]
                self.extract_cookies(finished_crawler)
                # Create a new driver; quit() (unlike close()) also ends the
                # old chromedriver session instead of leaking it.
                self.drivers[x].quit()
                self.drivers[x] = webdriver.Chrome(options=self.chromeOptions)
                # New crawler.
                self.crawlers[x] = Crawler(self.drivers[x], self.websites.pop(), hops=self.hops)
                self.threads[x] = threading.Thread(target=self.crawlers[x].start_crawl)
                self.threads[x].start()
                time.sleep(0.25)
    # Waits for the threads still running once all websites have been handed out.
    def left_over_threads(self):
        while True:
            for x in range(self.numThreads):
                if self.threads[x] and not self.threads[x].is_alive():
                    finished_crawler = self.crawlers[x]
                    if finished_crawler.websiteUrl not in self.allCookies:
                        self.extract_cookies(finished_crawler)
                    self.drivers[x].quit()  # Release the driver for this slot.
                    self.threads[x] = None  # Mark the slot as done.
            # Stop checking once every thread slot has been cleared.
            if all(thread is None for thread in self.threads):
                break
            time.sleep(1)
    def extract_cookies(self, crawler):
        # Stores the cookies gathered by a finished crawler, keyed by site.
        self.allCookies[crawler.websiteUrl] = {
            'frontpage': crawler.frontpage_cookies,
            'hopped': crawler.hopCookies if self.hops else [],
        }
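

# A minimal usage sketch (not part of the original module). It assumes the
# Crawler class from Crawler.py exposes start_crawl, websiteUrl,
# frontpage_cookies, and hopCookies as used above, that chromedriver is on
# PATH, and that the collected cookies are JSON-serializable; the site list
# and output filename below are illustrative only.
if __name__ == "__main__":
    import json

    websites = ["https://example.com", "https://example.org"]
    manager = CrawlerManager(websites, hops=2, numThreads=2)
    manager.start()
    # Persist everything collected in manager.allCookies.
    with open("cookies.json", "w") as out_file:
        json.dump(manager.allCookies, out_file, indent=2, default=str)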