diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index 127ad10bb..7502eb4dd 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -76,21 +76,27 @@ def create_access_source(self, filename): def find_access_rule(self, url, ts=None, urlkey=None): params = {'url': url, 'urlkey': urlkey} - print("Getting acl_iter...') acl_iter, errs = self.aggregator(params) if errs: print(errs) key = params['key'].decode('utf-8') - print("Iterating acl_iter...') + tld = key.split(',')[0] + for acl in acl_iter: + # skip empty/invalid lines if 'urlkey' not in acl: continue if key.startswith(acl['urlkey']): return acl + # if acl key already less than first tld, + # no match can be found + if acl['urlkey'] < tld: + break + return self.default_rule def __call__(self, res): @@ -102,21 +108,24 @@ def wrap_iter(self, cdx_iter): last_url = None for cdx in cdx_iter: - print("Looking at",cdx) url = cdx.get('url') - print(url) # if no url, possible idx or other object, don't apply any checks and pass through if not url: yield cdx continue - rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey')) - print(rule) + # TODO: optimization until date range support is included + if url == last_url: + rule = last_rule + else: + rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey')) + access = rule.get('access', 'exclude') - print(access) if access == 'exclude': continue - print("Yielding...") cdx['access'] = access yield cdx + + last_rule = rule + last_url = url diff --git a/sample_archive/access/pywb.aclj b/sample_archive/access/pywb.aclj index b88959147..4808fb45d 100644 --- a/sample_archive/access/pywb.aclj +++ b/sample_archive/access/pywb.aclj @@ -1,5 +1,5 @@ org,iana)/about - {"access": "block"} org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"} org,iana)/_css - {"access": "exclude"} -org,example)/?example=1 - {"access": "block"} org,iana)/ - {"access": "exclude"} +org,example)/?example=1 - {"access": "block"}