From 1addf5d5b4801a36ab98b800e0ef9a20a848ad92 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 14 Feb 2019 18:55:33 -0800
Subject: [PATCH] acl optimization: fixes ukwa/ukwa-pywb#39 - don't parse json on every aclj line until key prefix matches, resulting in speed boost! - convert aclj to dict (via cdxobject) only when match is found (disable aggregator source tracking)

---
 pywb/warcserver/access_checker.py    | 21 ++++++++++++---------
 pywb/warcserver/index/indexsource.py | 15 ++++++++-------
 tests/test_acl_manager.py            |  4 ++--
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py
index 7502eb4dd..c648e4f95 100644
--- a/pywb/warcserver/access_checker.py
+++ b/pywb/warcserver/access_checker.py
@@ -15,8 +15,9 @@ class FileAccessIndexSource(FileIndexSource):
     def rev_cmp(a, b):
         return (a < b) - (a > b)
 
-    def _get_gen(self, fh, params):
-        return search(fh, params['key'], prev_size=1, compare_func=self.rev_cmp)
+    def _do_iter(self, fh, params):
+        for line in search(fh, params['key'], prev_size=1, compare_func=self.rev_cmp):
+            yield line
 
 
 # ============================================================================
@@ -75,26 +76,28 @@ def create_access_source(self, filename):
             raise Exception('Invalid Access Source: ' + filename)
 
     def find_access_rule(self, url, ts=None, urlkey=None):
-        params = {'url': url, 'urlkey': urlkey}
+        params = {'url': url, 'urlkey': urlkey, 'nosource': 'true'}
         acl_iter, errs = self.aggregator(params)
         if errs:
             print(errs)
 
-        key = params['key'].decode('utf-8')
+        key = params['key']
 
-        tld = key.split(',')[0]
+        tld = key.split(b',')[0]
 
         for acl in acl_iter:
             # skip empty/invalid lines
-            if 'urlkey' not in acl:
+            if not acl:
                 continue
 
-            if key.startswith(acl['urlkey']):
-                return acl
+            acl_key = acl.split(b' ')[0]
+
+            if key.startswith(acl_key):
+                return CDXObject(acl)
 
             # if acl key already less than first tld,
             # no match can be found
-            if acl['urlkey'] < tld:
+            if acl_key < tld:
                 break
 
         return self.default_rule
diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py
index d30acccaa..b39a11d20 100644
--- a/pywb/warcserver/index/indexsource.py
+++ b/pywb/warcserver/index/indexsource.py
@@ -65,20 +65,21 @@ def _do_open(self, filename):
         except IOError:
             raise NotFoundException(filename)
 
-    def _get_gen(self, fh, params):
-        return iter_range(fh, params['key'], params['end_key'])
-
     def load_index(self, params):
         filename = res_template(self.filename_template, params)
 
         fh = self._do_open(filename)
 
-        def do_load(fh):
+        def do_iter():
             with fh:
-                for line in self._get_gen(fh, params):
-                    yield CDXObject(line)
+                for obj in self._do_iter(fh, params):
+                    yield obj
+
+        return do_iter()
 
-        return do_load(fh)
+    def _do_iter(self, fh, params):
+        for line in iter_range(fh, params['key'], params['end_key']):
+            yield CDXObject(line)
 
     def __repr__(self):
         return '{0}(file://{1})'.format(self.__class__.__name__,
diff --git a/tests/test_acl_manager.py b/tests/test_acl_manager.py
index dc54b76f0..945c4beca 100644
--- a/tests/test_acl_manager.py
+++ b/tests/test_acl_manager.py
@@ -69,9 +69,9 @@ def test_acl_match(self, capsys):
         assert out == """\
 Matched rule:
 
-    com,example, - {"access": "exclude", "url": "com,example,", "source": "%s", "source-coll": "%s"}
+    com,example, - {"access": "exclude", "url": "com,example,"}
 
-""" % (self.acl_filename, self.acl_filename)
+"""
 
     def test_remove_acl(self):
         wb_manager(['acl', 'remove', self.acl_filename, 'com,example,'])