gen_seeds.py
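"""Generate seed URLs for the isiXhosa crawler.

Queries Google Custom Search with randomly chosen isiXhosa words, keeps
result links that a language-identification service classifies as isiXhosa,
and writes the collected URLs to seeds2.txt.
"""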
import csv
import random
import socket

import requests

from crawler.spiders.isixhosa import blocked, identify
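# google_secrets.txt is expected to hold the Custom Search engine ID on its
# first line and the API key on its second, matching the unpacking below.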
with open('google_secrets.txt') as f:
    lines = f.readlines()
    engine_id, key = lines[0].strip(), lines[1].strip()
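# Collect candidate query words from the third column of words.csv,
# deduplicating with a set before shuffling the search order.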
words = set()
with open('words.csv') as csv_file:
    rdr = csv.reader(csv_file, delimiter=',', quotechar='"')
    for row in rdr:
        words.add(row[2])
words = list(words)
random.shuffle(words)
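# Open a connection for identify() to use. Port 7770 is assumed to be a
# locally running language-identification service.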
sock = socket.socket()
sock.connect((socket.gethostname(), 7770))
urls = set()
# The free Custom Search tier allows only 100 queries per day, so the loop
# may have to stop early once the quota or the word list runs out.
try:
    for _ in range(100):
        query = words.pop()
        url = f"https://www.googleapis.com/customsearch/v1?key={key}&cx={engine_id}&q={query}"
        results = requests.get(url, timeout=30).json()
        if "items" not in results:
            print(f"No results for {query}")
            continue
        print(f"{results['searchInformation']['totalResults']} results for {query}")
        for result in results["items"]:
            link = result["link"]
            if "snippet" not in result:
                continue
            is_isixhosa = identify(sock, result["snippet"])["language"] == "isiXhosa"
            crawlable_document = not link.endswith((".pdf", ".xlsx", ".docx"))
            if is_isixhosa and not blocked(link) and crawlable_document:
                urls.add(link)
        print(f"{len(urls)} in total")
except Exception as e:
    print(f"Stopping early: {e}")
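# Write out whatever was collected, even if the loop stopped early.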
with open("seeds2.txt", "w") as f:
    f.write("\n".join(urls))