#!/usr/bin/env python
#encoding:utf-8
#author:dbr/Ben
#project:tvdb_api
#repository:http://github.com/dbr/tvdb_api
#license:unlicense (http://unlicense.org/)
"""
urllib2 caching handler
Modified from http://code.activestate.com/recipes/491261/
"""
__author__ = "dbr/Ben"
__version__ = "1.9"
import os
import time
import errno
import urllib.request
from email import policy
import io
import pickle
from hashlib import md5
from threading import RLock
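
# Module-level lock: all cache file access below is serialised through it
# via the locked_function decorator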
cache_lock = RLock()

def locked_function(origfunc):
    """Decorator to execute function under lock"""
    def wrapped(*args, **kwargs):
        with cache_lock:
            return origfunc(*args, **kwargs)
    return wrapped

def calculate_cache_path(cache_location, url):
    """Returns the header and body cache file paths for a URL"""
    # md5 requires bytes under Python 3, so encode the URL first
    thumb = md5(url.encode("utf-8")).hexdigest()
    header = os.path.join(cache_location, thumb + ".headers")
    body = os.path.join(cache_location, thumb + ".body")
    return header, body

def check_cache_time(path, max_age):
    """Checks if a file has been created/modified in the last [max_age]
    seconds. False means the file is too old (or doesn't exist), True means
    it is up-to-date and valid"""
    if not os.path.isfile(path):
        return False
    cache_modified_time = os.stat(path).st_mtime
    time_now = time.time()
    # Stale if the file was last modified more than max_age seconds ago
    return cache_modified_time >= time_now - max_age

@locked_function
def exists_in_cache(cache_location, url, max_age):
    """Returns True if both header AND body cache files exist and are
    up-to-date"""
    hpath, bpath = calculate_cache_path(cache_location, url)
    if os.path.exists(hpath) and os.path.exists(bpath):
        return (
            check_cache_time(hpath, max_age)
            and check_cache_time(bpath, max_age)
        )
    else:
        # File does not exist
        return False

@locked_function
def store_in_cache(cache_location, url, response):
    """Tries to store response in cache. Returns True if the write failed
    with an IOError, False on success."""
    hpath, bpath = calculate_cache_path(cache_location, url)
    try:
        # Headers are pickled (response.info() is an email.message.Message)
        with open(hpath, "wb") as outf:
            pickle.dump(response.info(), outf)
        # The body is stored as text, so decode the raw bytes first
        outfdata = response.read()
        if not isinstance(outfdata, str):
            outfdata = str(outfdata, "utf-8")
        with open(bpath, "w") as outf:
            outf.write(outfdata)
    except IOError:
        return True
    else:
        return False

@locked_function
def delete_from_cache(cache_location, url):
    """Deletes a cached response. Returns True if deletion failed with an
    IOError, False on success."""
    hpath, bpath = calculate_cache_path(cache_location, url)
    try:
        if os.path.exists(hpath):
            os.remove(hpath)
        if os.path.exists(bpath):
            os.remove(bpath)
    except IOError:
        return True
    else:
        return False

class CacheHandler(urllib.request.BaseHandler):
    """Stores responses in a persistent on-disk cache.
    If a subsequent GET request is made for the same URL, the stored
    response is returned, saving time, resources and bandwidth
    """
    @locked_function
    def __init__(self, cache_location, max_age=21600):
        """cache_location is the cache directory; max_age is the maximum
        age of a cached response, in seconds"""
        self.max_age = max_age
        self.cache_location = cache_location
        if not os.path.exists(self.cache_location):
            try:
                os.mkdir(self.cache_location)
            except OSError as e:
                if e.errno == errno.EEXIST and os.path.isdir(self.cache_location):
                    # File exists, and it's a directory,
                    # another process beat us to creating this dir, that's OK.
                    pass
                else:
                    # Our target dir is already a file, or different error,
                    # relay the error!
                    raise
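
    # Handler protocol note: urllib.request calls default_open before any
    # network activity; returning None passes the request to the next
    # handler (normally HTTPHandler), and the resulting response is then
    # routed back through http_response below.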
    def default_open(self, request):
        """Handles GET requests; if the response is cached, returns it
        """
        if request.get_method() != "GET":
            return None  # let the next handler try to handle the request
        if exists_in_cache(
            self.cache_location, request.get_full_url(), self.max_age
        ):
            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header=True
            )
        else:
            return None

    def http_response(self, request, response):
        """Gets an HTTP response; if it was a GET request and the status code
        starts with 2 (200 OK etc) it caches it and returns a CachedResponse
        """
        if (request.get_method() == "GET"
            and str(response.code).startswith("2")
        ):
            if 'x-local-cache' not in response.info():
                # Response is not cached
                set_cache_header = store_in_cache(
                    self.cache_location,
                    request.get_full_url(),
                    response
                )
            else:
                set_cache_header = True
            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header=set_cache_header
            )
        else:
            return response

class CachedResponse(io.StringIO):
    """An urllib.request response-like object for cached responses.
    To determine if a response is cached or coming directly from
    the network, check the x-local-cache header rather than the object type.
    """
    @locked_function
    def __init__(self, cache_location, url, set_cache_header=True):
        self.cache_location = cache_location
        hpath, bpath = calculate_cache_path(cache_location, url)
        # The body was stored as text, so read it back in text mode
        with open(bpath, "r") as bodyf:
            io.StringIO.__init__(self, bodyf.read())
        self.url = url
        self.code = 200
        self.msg = "OK"
        # Headers were pickled by store_in_cache; load them back
        with open(hpath, "rb") as headerf:
            self.headers = pickle.load(headerf)
        if set_cache_header:
            self.headers.policy = policy.default
            # Mark the response as served from the local cache. The header
            # name must match the 'x-local-cache' check in http_response,
            # so it must not include a trailing colon.
            self.headers["x-local-cache"] = bpath
    def info(self):
        """Returns headers
        """
        return self.headers
    def geturl(self):
        """Returns original URL
        """
        return self.url
    @locked_function
    def recache(self):
        """Re-fetches the URL and overwrites the cached copy"""
        new_response = urllib.request.urlopen(self.url)
        store_in_cache(
            self.cache_location,
            new_response.url,
            new_response
        )
        CachedResponse.__init__(self, self.cache_location, self.url, True)
    @locked_function
    def delete_cache(self):
        """Removes this response's cached header and body files"""
        delete_from_cache(
            self.cache_location,
            self.url
        )
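
# Cache-management sketch (hypothetical URL, illustrative only):
#
#   opener = urllib.request.build_opener(CacheHandler("/tmp/tvdb_cache"))
#   response = opener.open("http://example.com")
#   response.recache()       # force a fresh fetch and re-store it
#   response.delete_cache()  # drop the cached files entirely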

if __name__ == "__main__":
    def main():
        """Quick test/example of CacheHandler"""
        opener = urllib.request.build_opener(CacheHandler("/tmp/"))
        response = opener.open("http://google.com")
        response.recache()

        # Test usage in threads
        from threading import Thread

        class CacheThreadTest(Thread):
            lastdata = None
            def run(self):
                req = opener.open("http://google.com")
                newdata = req.read()
                if self.lastdata is None:
                    self.lastdata = newdata
                assert self.lastdata == newdata, "Data was not consistent, uhoh"
                req.recache()

        threads = [CacheThreadTest() for x in range(50)]
        print("Starting threads")
        for t in threads:
            t.start()
        print("..done")

        print("Joining threads")
        for t in threads:
            t.join()
        print("..done")
    main()