-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFiles.py
179 lines (150 loc) · 5.42 KB
/
Files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import os
import csv
import asyncio
import random
import re
import string
import logging
from user_agent import generate_user_agent
from aiohttp import ClientSession
from bs4 import BeautifulSoup
# Queue of words still waiting to be searched.
Words = []
# Words that must not be searched (again).
IgnoreWords = []
# File URLs that have already been recorded.
IgnoreLinks = []
# Words fully handled during this run; compared against MaxWords.
_Processed = []
# File extensions used in the ``filetype:`` search operator.
Formats = ["sql", "txt"]
# Maps each state file on disk to the in-memory list it populates.
_Key = {
    "WordList.txt": Words,
    "IgnoreWords.txt": IgnoreWords,
}
MaxWords = 50

logging.basicConfig(level=logging.INFO)
Log = logging.getLogger("INDEXER: ")

# Only present on Windows builds of asyncio.
if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

for key in _Key:
    if os.path.exists(key):
        with open(key, "r") as f:
            # A trailing newline or an empty file would otherwise add "" to
            # the list, and an empty word cannot be searched or indexed.
            _Key[key].extend(
                entry for entry in f.read().split("\n") if entry.strip()
            )
        if key == "WordList.txt":
            # The pending queue now lives in memory only.
            os.remove(key)

# Number of words requested from the random-word API at start-up.
RandomWordsCount = 35
def judge_content(text) -> bool:
    """Return ``True`` when *text* contains none of the blocked terms.

    The blocked terms are written reversed in the source and flipped back
    before a case-insensitive substring check against *text*.
    """
    lowered = text.lower()
    for reversed_term in ("xes", "nrop", "soedivx"):
        if reversed_term[::-1] in lowered:
            return False
    return True
async def get_random_words_from_api():
    """Append ``RandomWordsCount`` words from the random-word API to ``Words``.

    Words are requested one at a time. A word that is already listed in
    ``IgnoreWords`` is discarded and another one is requested in its place.
    """
    endpoint = random.choice(["https://random-word-api.herokuapp.com/word"])

    async with ClientSession() as session:

        async def next_word():
            # The code uses element 0 of the decoded JSON body as the word.
            async with session.get(endpoint) as response:
                payload = await response.json()
            return payload[0]

        for _ in range(RandomWordsCount):
            candidate = await next_word()
            while candidate in IgnoreWords:
                candidate = await next_word()
            Words.append(candidate)
class EndProgram(Exception):
    """Signal that the run should stop; raised once the word limit is hit."""
def save_files(content, path=""):
    """Append the collected rows to one CSV file per file type.

    Args:
        content: Mapping of file type (e.g. ``"txt"``) to a list of rows,
            where each row is a sequence of CSV fields.
        path: Prefix prepended verbatim to each file name; include a
            trailing separator to target a directory.

    Each file type ``ft`` is written to ``{path}{FT}_files.csv`` in append
    mode, so repeated calls accumulate rows.
    """
    for ft, rows in content.items():
        # newline="" lets the csv module control line endings (otherwise
        # Windows gets blank lines between rows); UTF-8 keeps non-ASCII
        # fields from failing under a legacy locale encoding.
        with open(
            f"{path}{ft.upper()}_files.csv", "a", newline="", encoding="utf-8"
        ) as f:
            csv.writer(f).writerows(rows)
async def fetch_files(word):
    """Search Google for files matching *word* and store the hits as CSV.

    For every extension in ``Formats`` the result pages of the query
    ``<word> filetype:<ext>`` are scraped. Each accepted hit (title and
    target URL both pass ``judge_content`` and the URL is not yet in
    ``IgnoreLinks``) is collected, and purely ASCII-alphabetic title words
    longer than three characters are queued in ``Words`` for later runs.
    Collected hits are written via ``save_files`` into
    ``<First letter>/<Capitalised word>/``.

    Args:
        word: The search term. Empty or already ignored terms are dropped
            from ``Words`` and skipped.

    Raises:
        EndProgram: if ``MaxWords`` words have already been processed.
    """
    # ">=" because several fetches run concurrently and can step past the
    # exact limit between two checks.
    if len(_Processed) >= MaxWords:
        raise EndProgram
    if not word or word in IgnoreWords:
        # Remove it from the queue; the caller loops while Words is
        # non-empty and would otherwise retry this entry forever.
        while word in Words:
            Words.remove(word)
        return

    Log.info(f"---> Fetching {word}!")
    IgnoreWords.append(word)
    content = {}
    folder = word[0].upper()

    result_class = re.compile("egMi0")
    title_class = re.compile("vvjwJb")
    # "?" must be escaped, otherwise it only makes the "l" optional.
    redirect_href = re.compile(r"/url\?")
    page_size = 10

    async with ClientSession() as ses:

        async def get_page(url_):
            """Download one result page and return its result ``div`` tags."""
            async with ses.get(
                url_,
                headers={"User-Agent": generate_user_agent()},
            ) as res:
                ct = await res.read()
            soup = BeautifulSoup(ct, "html.parser", from_encoding="utf-8")
            return soup.find_all("div", result_class)

        def handle_result(res, filetype):
            """Record a single search hit and harvest new words from its title."""
            name = res.find("div", title_class).text
            href = res.find("a", href=redirect_href)["href"]
            # The real target is carried in the redirect's "url=" parameter.
            fileurl = href.split("url=")[1].split("&")[0].strip()
            if not (judge_content(fileurl) and judge_content(name)):
                return
            if fileurl in IgnoreLinks:
                return
            content.setdefault(filetype, []).append([name, fileurl])
            IgnoreLinks.append(fileurl)
            Log.info(f"--> GOT ---> FROM WORD --> {word}---> {fileurl}")
            for word_ in name.split():
                only_letters = all(c in string.ascii_letters for c in word_)
                if (
                    only_letters
                    and len(word_) > 3
                    and word_ not in IgnoreWords
                    and word_ not in Words
                ):
                    Log.info(f"---> Got new word --> {word_}")
                    Words.append(word_)

        async def get_content_from_format(filetype):
            """Walk through all result pages for one file type."""
            url = f"https://google.com/search?q={word}+filetype:{filetype}"
            page = await get_page(url)
            start = 0
            while page:
                for res in page:
                    try:
                        handle_result(res, filetype)
                    except Exception as eR:
                        # Best effort: a malformed hit must not abort the
                        # remaining results.
                        Log.error(eR)
                # Only a full page suggests there is a next one; stopping
                # here keeps a short page from being re-processed forever.
                if len(page) != page_size:
                    break
                start += page_size
                page = await get_page(url + f"&start={start}")

        await asyncio.gather(*(get_content_from_format(ft) for ft in Formats))

    _Processed.append(word)
    while word in Words:
        Words.remove(word)
    if content:
        target = f"{folder}/{folder + word[1:]}"
        os.makedirs(target, exist_ok=True)
        save_files(content, path=f"{target}/")
async def main():
    """Seed the word queue, then work through it three words at a time.

    After each batch of concurrent ``fetch_files`` calls the loop pauses for
    two seconds before looking at the queue again.
    """
    Log.info("> Starting UP!")
    await get_random_words_from_api()
    while Words:
        batch = [fetch_files(item) for item in Words[:3]]
        await asyncio.gather(*batch)
        await asyncio.sleep(2)
# Run the indexer; Ctrl-C and the word-limit signal are normal ways to stop.
try:
    asyncio.run(main())
except (KeyboardInterrupt, EndProgram):
    pass
except Exception as er:
    # Still fall through so the word lists below get saved.
    Log.error(er)

# Persist both word lists for the next run.
for filename, entries in _Key.items():
    if entries:
        with open(filename, "w") as f:
            f.write("\n".join(sorted(entries)))
    elif os.path.exists(filename):
        # Nothing to keep. Decide this before opening the file: deleting a
        # file that is still open for writing fails on Windows.
        os.remove(filename)