-
Notifications
You must be signed in to change notification settings - Fork 60
/
storeImages.py
112 lines (105 loc) · 4.85 KB
/
storeImages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import pytesseract
from PIL import Image
import requests
import pickle
from io import BytesIO
import os
import random
class StoreImages:
def __init__(self,bookpath,proxyflag,resolution,tesserPath):
self.pageResolution = resolution
self.proxyFlag = proxyflag
self.bookPath = bookpath
self.tesserPath = tesserPath
self.imagePath = os.path.join(self.bookPath,'Images')
self.pagesFetched = {}
self.PageLinkDict = {}
if not os.path.exists(os.path.join(self.bookPath,'data','pagesFetched.pkl')):
path = os.path.join(bookpath, 'data', "pageLinkDict.pkl")
with open(path, 'rb') as ofile:
self.PageLinkDict = pickle.load(ofile)
else:
path = os.path.join(bookpath, 'data', "pagesFetched.pkl")
with open(path, 'rb') as ofile:
self.pagesFetched = pickle.load(ofile)
path = os.path.join(bookpath, 'data', "pageLinkDict.pkl")
with open(path, 'rb') as ofile:
allPages = pickle.load(ofile)
for page in allPages.keys():
if page not in self.pagesFetched.keys():
self.PageLinkDict[page] = allPages[page]
if not os.path.isdir(self.imagePath):
os.mkdir(self.imagePath)
with open('proxies.txt','r') as ofile:
self.plist = ofile.readlines()
def getProxy(self):
prox = random.choice(self.plist)
return prox.strip()
def resethead(self):
try:
req = requests.get("https://google.com")
self.head = {
'Host': 'books.google.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.00',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'close',
'Cookie': "__Secure-ENID=" + str(req.cookies['__Secure-ENID']),
}
except:
pass
def pageEmpty(self,image):
im = Image.open(BytesIO(image))
width, height = im.size
im = im.resize((int(width / 5), int(height / 5)))
gray = im.convert('L')
bw = gray.point(lambda x: 0 if x < 250 else 255, '1')
try:
text = pytesseract.image_to_string(bw)
except:
pytesseract.pytesseract.tesseract_cmd = self.tesserPath
text = pytesseract.image_to_string(bw)
return text.replace('\n', " ") == 'image not available'
def getImages(self,retries):
self.resethead()
for pageData in self.PageLinkDict.keys():
try:
link = self.PageLinkDict[pageData]['src']
if not link:
continue
pageNumber = self.PageLinkDict[pageData]['order']
checkIfPageFetched = retries
while checkIfPageFetched > 0:
proxyFailed = False
if checkIfPageFetched != retries and self.proxyFlag :
proxy = self.getProxy()
proxyDict = {
"http": 'http://' + proxy,
"https": 'https://' + proxy,
}
print(f'Using {proxy} for the image of page {pageNumber}')
proxyFailed = False
try:
pageImage = requests.get(link + '&w=' + str(self.pageResolution), headers=self.head, proxies=proxyDict, verify=False)
except:
print('Could not connect with this proxy')
proxyFailed = True
else:
pageImage = requests.get(link + '&w=' + str(self.pageResolution), headers=self.head, verify=False)
if self.pageEmpty(pageImage.content) or proxyFailed:
self.resethead()
checkIfPageFetched -= 1
else:
checkIfPageFetched = -1
print(f'Fetched image for page {pageNumber}')
self.pagesFetched[pageData]=self.PageLinkDict[pageData]
im = Image.open(BytesIO(pageImage.content))
im.save(os.path.join(self.imagePath,str(pageNumber)+".png"))
else:
if(checkIfPageFetched==0):
print("Could not fetch the image for page " + str(pageNumber))
except Exception as e:
print(e)
with open(os.path.join(self.bookPath,'data','pagesFetched.pkl'),'wb+') as ofile:
pickle.dump(self.pagesFetched,ofile)