get_negative_pictures.py
"""
dumpimages.py
Downloads all the images on the supplied URL,
saves to the current directory
Usage:
python3 get_negative_pictures.py http://www.mv.helsinki.fi/home/mokaukon/public_html/ [output]
"""
import os
import sys
from urllib.parse import urljoin
from urllib.request import Request, urlopen, urlretrieve

from bs4 import BeautifulSoup as bs
def main(url, out_folder="./"):
    """Download every image found at 'url' into 'out_folder'."""
    # Some servers reject urllib's default User-Agent, so send a browser-like one.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs(urlopen(req), 'html.parser')
    images = soup.find_all('img')
    print(str(len(images)) + " images found.")
    print('Downloading images to ' + out_folder)
    # Make sure the output folder exists before saving into it.
    os.makedirs(out_folder, exist_ok=True)
    # Collect the image links from the 'src' attributes.
    image_links = [img.get('src') for img in images]
    for link in image_links:
        try:
            filename = link.split('/')[-1]
            if link.lower().startswith('http'):
                image_url = link
            else:
                # Resolve relative links against the page URL.
                image_url = urljoin(url, link)
            print(image_url)
            urlretrieve(image_url, os.path.join(out_folder, filename))
        except Exception:
            print('skipping', link)
def _usage():
    print("usage: python3 get_negative_pictures.py http://example.com [outpath]")


if __name__ == "__main__":
    out_folder = "./"
    url = sys.argv[-1]
    # If the last argument is not a URL, treat it as the output folder.
    if not url.lower().startswith("http"):
        out_folder = sys.argv[-1]
        url = sys.argv[-2]
        if not url.lower().startswith("http"):
            _usage()
            sys.exit(-1)
    main(url, out_folder)
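
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): main() can also be
# imported and called from another module. The URL below is the one from the
# usage string; the "./negatives" folder name is only an assumed example.
#
#     from get_negative_pictures import main
#     main("http://www.mv.helsinki.fi/home/mokaukon/public_html/", "./negatives")
# ---------------------------------------------------------------------------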