main.py
import argparse
import json
import time
from typing import Dict, List

import numpy as np

from CrawlerManager import CrawlerManager
from post_processing import post_processing
from statistics import fetch_statistics  # local statistics.py, shadows the stdlib module


def load_websites(path: str) -> List[str]:
    # One website per line; ndmin=1 keeps a single-line file from collapsing to a 0-d array.
    return list(np.loadtxt(path, delimiter='\n', dtype='str', ndmin=1))


def write_cookies(path: str, cookies: Dict, ext: str):
    # Name the output after the input file, e.g. data/sites.txt -> out/sites<ext>.
    out_path = 'out/' + path.split('/')[-1].split('.')[0] + ext
    with open(out_path, 'w') as f:
        json.dump(cookies, f, indent=2)


def start_crawling(num_of_threads, num_of_hops, path):
    websites = load_websites(path)
    print(websites)
    crawler_manager = CrawlerManager(websites, num_of_hops, numThreads=num_of_threads)
    crawler_manager.start()
    # Dump the raw cookies, then the post-processed cookies, then print statistics.
    all_cookies = crawler_manager.allCookies
    write_cookies(path, all_cookies, ".json")
    processed = post_processing(all_cookies)
    write_cookies(path, processed, "-out.json")
    fetch_statistics(processed)


if __name__ == "__main__":
    start = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument("input_path", help="input file path")
    parser.add_argument("-t", "--threads", type=int, default=10, help="number of threads to use")
    parser.add_argument("-n", "--num_hops", type=int, default=20, help="number of single-depth hops on a website")
    args = parser.parse_args()
    start_crawling(args.threads, args.num_hops, args.input_path)
    elapsed = time.time() - start
    print(f'Ran in {elapsed:.3f} seconds')
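For reference, a usage sketch: assuming a newline-delimited site list in websites.txt (hypothetical filename) and an existing out/ directory (write_cookies opens files there without creating the directory), the crawl could be launched as:

    python main.py websites.txt -t 10 -n 20

With those assumptions, the raw cookies would land in out/websites.json and the post-processed cookies in out/websites-out.json, with fetch_statistics reporting on the latter.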