# active.py
import os
import subprocess
import sys
from multiprocessing import Pool

import pandas as pd
from tqdm import tqdm
def download_content(url):
    """Mirror one level of ``https://<url>`` into the working directory via wget.

    The crawl is restricted to the given domain, skips common binary/media
    file types, and rewrites links for local browsing.

    Args:
        url: Bare host name (no scheme), e.g. ``"example.com"``.

    Returns:
        1 if wget was launched and exited with status 0, otherwise 0.
        0 is also returned for blank or non-string seeds (for example NaN
        values coming from an empty CSV cell) and when wget cannot be
        started at all.
    """
    # Blank cells and NaN floats cannot be crawled; report them as failures
    # instead of asking wget to fetch "https://nan".
    if not isinstance(url, str) or not url.strip():
        return 0
    url = url.strip()

    rejected_types = (
        "*.apk,*.zip,*.exe,*.ico,*.gif,*.svg,*.jpg,*.jpeg,*.png,"
        "*.mp3,*.mp4,*.pdf,*.tgz,*.flv,*.avi,*.mpeg,*.iso"
    )
    # An argument list (no shell) keeps every option a separate word -- the
    # old concatenated string glued --restrict-file-names onto the reject
    # list -- and prevents shell metacharacters in the seed from executing.
    command = [
        "wget",
        "--recursive",
        "--level", "1",
        "--no-clobber",
        "--page-requisites",
        "--html-extension",
        "--convert-links",
        "--reject", rejected_types,
        "--restrict-file-names=windows",
        "--domains", url,
        "--no-parent",
        "https://{}".format(url),
    ]

    try:
        exit_status = subprocess.call(command)
    except OSError:
        # wget is not installed or could not be executed.
        return 0
    # wget signals any failure through a non-zero exit status.
    return 1 if exit_status == 0 else 0
# Index of this machine; selects which slice of the seed list it crawls.
VM = int(sys.argv[1])
# Number of worker processes in the pool.
THREAD = int(sys.argv[2])
# Seed hosts, taken from the "apex" column of the CSV.
SEED_LIST = pd.read_csv(
    'stp-5-wb-non-inset-200feed.csv').apex.tolist()

if __name__ == '__main__':
    # Each VM index owns one contiguous slice of this many seeds.
    batch_size = 15000
    start = batch_size * VM
    batch = SEED_LIST[start:start + batch_size]

    with Pool(THREAD) as pool:
        # imap yields a plain iterator with no length, so tqdm needs the
        # total passed explicitly to show percentage and ETA.
        results = list(
            tqdm(pool.imap(download_content, batch), total=len(batch)))

    if all(results):
        print("success")
    else:
        print("some errors has occurred")
# Usage:   python3 active.py <vm_index: 0|1|2|3> <pool_size>
# Example: python3 active.py 3 128