Skip to content
This repository has been archived by the owner on Aug 2, 2024. It is now read-only.

Commit

Permalink
acl optimization: fixes ukwa/ukwa-pywb#39
Browse files Browse the repository at this point in the history
- don't parse JSON on every aclj line; defer parsing until the key prefix matches, resulting in a speed boost!
- convert the aclj line to a dict (via CDXObject) only when a match is found (this disables aggregator source tracking)
  • Loading branch information
ikreymer committed Feb 15, 2019
1 parent 623f0da commit 1addf5d
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 18 deletions.
21 changes: 12 additions & 9 deletions pywb/warcserver/access_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ class FileAccessIndexSource(FileIndexSource):
def rev_cmp(a, b):
return (a < b) - (a > b)

def _get_gen(self, fh, params):
return search(fh, params['key'], prev_size=1, compare_func=self.rev_cmp)
def _do_iter(self, fh, params):
for line in search(fh, params['key'], prev_size=1, compare_func=self.rev_cmp):
yield line


# ============================================================================
Expand Down Expand Up @@ -75,26 +76,28 @@ def create_access_source(self, filename):
raise Exception('Invalid Access Source: ' + filename)

def find_access_rule(self, url, ts=None, urlkey=None):
params = {'url': url, 'urlkey': urlkey}
params = {'url': url, 'urlkey': urlkey, 'nosource': 'true'}
acl_iter, errs = self.aggregator(params)
if errs:
print(errs)

key = params['key'].decode('utf-8')
key = params['key']

tld = key.split(',')[0]
tld = key.split(b',')[0]

for acl in acl_iter:
# skip empty/invalid lines
if 'urlkey' not in acl:
if not acl:
continue

if key.startswith(acl['urlkey']):
return acl
acl_key = acl.split(b' ')[0]

if key.startswith(acl_key):
return CDXObject(acl)

# if acl key already less than first tld,
# no match can be found
if acl['urlkey'] < tld:
if acl_key < tld:
break

return self.default_rule
Expand Down
15 changes: 8 additions & 7 deletions pywb/warcserver/index/indexsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,20 +65,21 @@ def _do_open(self, filename):
except IOError:
raise NotFoundException(filename)

def _get_gen(self, fh, params):
return iter_range(fh, params['key'], params['end_key'])

def load_index(self, params):
filename = res_template(self.filename_template, params)

fh = self._do_open(filename)

def do_load(fh):
def do_iter():
with fh:
for line in self._get_gen(fh, params):
yield CDXObject(line)
for obj in self._do_iter(fh, params):
yield obj

return do_iter()

return do_load(fh)
def _do_iter(self, fh, params):
for line in iter_range(fh, params['key'], params['end_key']):
yield CDXObject(line)

def __repr__(self):
return '{0}(file://{1})'.format(self.__class__.__name__,
Expand Down
4 changes: 2 additions & 2 deletions tests/test_acl_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ def test_acl_match(self, capsys):
assert out == """\
Matched rule:
com,example, - {"access": "exclude", "url": "com,example,", "source": "%s", "source-coll": "%s"}
com,example, - {"access": "exclude", "url": "com,example,"}
""" % (self.acl_filename, self.acl_filename)
"""

def test_remove_acl(self):
wb_manager(['acl', 'remove', self.acl_filename, 'com,example,'])
Expand Down

0 comments on commit 1addf5d

Please sign in to comment.