From 18f01065efa72679f387fd47a3401be127a95a2f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 5 May 2021 19:37:24 -0700 Subject: [PATCH 1/5] embargo: add support for per-collection date range embargo with embargo options of 'before', 'after', 'newer' and 'older' 'before' and 'after' accept a timestamp 'newer' and 'older' options configured with a dictionary consisting of any combo of 'years', 'months', 'days' add basic test for each embargo option --- pywb/warcserver/access_checker.py | 97 ++++++++++++++++++++++++++++--- pywb/warcserver/warcserver.py | 6 +- requirements.txt | 1 + tests/config_test_access.yaml | 29 +++++++++ tests/test_embargo.py | 40 +++++++++++++ 5 files changed, 163 insertions(+), 10 deletions(-) create mode 100644 tests/test_embargo.py diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index bcf2a4e01..85735f98c 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -6,6 +6,9 @@ from pywb.utils.binsearch import search from pywb.utils.merge import merge +from warcio.timeutils import timestamp_to_datetime +from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta import os @@ -84,11 +87,12 @@ class AccessChecker(object): # another '#' (U+0023 > U+0020) EXACT_SUFFIX_SEARCH_B = b'####' # type: bytes - def __init__(self, access_source, default_access='allow'): + def __init__(self, access_source, default_access='allow', embargo=None): """Initialize a new AccessChecker :param str|list[str]|AccessRulesAggregator access_source: An access source :param str default_access: The default access action (allow) + :param dict embargo: A dict specifying optional embargo setting """ if isinstance(access_source, str): self.aggregator = self.create_access_aggregator([access_source]) @@ -103,6 +107,72 @@ def __init__(self, access_source, default_access='allow'): self.default_rule['access'] = default_access self.default_rule['default'] = 'true' + self.embargo = 
self.parse_embargo(embargo) + + def parse_embargo(self, embargo): + if not embargo: + return None + + value = embargo.get('before') + if value: + embargo['before'] = timestamp_to_datetime(str(value)) + + value = embargo.get('after') + if value: + embargo['after'] = timestamp_to_datetime(str(value)) + + value = embargo.get('older') + if value: + delta = relativedelta( + years=value.get('years', 0), + months=value.get('months', 0), + weeks=value.get('weeks', 0), + days=value.get('days', 0)) + + embargo['older'] = delta + + value = embargo.get('newer') + if value: + delta = relativedelta( + years=value.get('years', 0), + months=value.get('months', 0), + weeks=value.get('weeks', 0), + days=value.get('days', 0)) + + embargo['newer'] = delta + + return embargo + + def check_embargo(self, url, ts): + if not self.embargo: + return None + + dt = timestamp_to_datetime(ts) + access = self.embargo.get('access', 'exclude') + + # embargo before + before = self.embargo.get('before') + if before: + print(dt, before) + return access if dt < before else None + + # embargo after + after = self.embargo.get('after') + if after: + return access if dt > after else None + + # embargo if newer than + newer = self.embargo.get('newer') + if newer: + actual = datetime.utcnow() - newer + return access if actual < dt else None + + # embargo if older than + older = self.embargo.get('older') + if older: + actual = datetime.utcnow() - older + return access if actual > dt else None + def create_access_aggregator(self, source_files): """Creates a new AccessRulesAggregator using the supplied list of access control file names @@ -209,25 +279,36 @@ def wrap_iter(self, cdx_iter): """ last_rule = None last_url = None + rule = None for cdx in cdx_iter: url = cdx.get('url') + timestamp = cdx.get('timestamp') + # if no url, possible idx or other object, don't apply any checks and pass through if not url: yield cdx continue - # TODO: optimization until date range support is included - if url == last_url: - 
rule = last_rule - else: - rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey'), - cdx.get('source-coll')) + # first, check embargo and use that setting if any + access = self.check_embargo(url, timestamp) + if not access and self.aggregator: + # TODO: optimization until date range support is included + if url == last_url: + rule = last_rule + else: + rule = self.find_access_rule(url, timestamp, + cdx.get('urlkey'), + cdx.get('source-coll')) + + access = rule.get('access', 'exclude') - access = rule.get('access', 'exclude') if access == 'exclude': continue + if not access: + access = self.default_rule['access'] + cdx['access'] = access yield cdx diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index a67286716..e4abc7fe3 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -210,6 +210,7 @@ def load_coll(self, name, coll_config): archive_paths = None acl_paths = None default_access = self.default_access + embargo = None elif isinstance(coll_config, dict): index = coll_config.get('index') if not index: @@ -217,6 +218,7 @@ def load_coll(self, name, coll_config): archive_paths = coll_config.get('archive_paths') acl_paths = coll_config.get('acl_paths') default_access = coll_config.get('default_access', self.default_access) + embargo = coll_config.get('embargo') else: raise Exception('collection config must be string or dict') @@ -245,8 +247,8 @@ def load_coll(self, name, coll_config): # ACCESS CONFIG access_checker = None - if acl_paths: - access_checker = AccessChecker(acl_paths, default_access) + if acl_paths or embargo: + access_checker = AccessChecker(acl_paths, default_access, embargo) return DefaultResourceHandler(agg, archive_paths, rules_file=self.rules_file, diff --git a/requirements.txt b/requirements.txt index 0c3e0af2d..7f3d3401e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ wsgiprox>=1.5.1 fakeredis<1.0 tldextract babel +python-dateutil diff --git 
a/tests/config_test_access.yaml b/tests/config_test_access.yaml index 49c4220c3..6e6d7a14a 100644 --- a/tests/config_test_access.yaml +++ b/tests/config_test_access.yaml @@ -24,4 +24,33 @@ collections: default_access: block + pywb-embargo-before: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + embargo: + before: '2014012700' + + pywb-embargo-after: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + embargo: + after: '2014012700' + + pywb-embargo-older: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + embargo: + older: + years: 1 + months: 6 + + pywb-embargo-newer: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + embargo: + newer: + years: 1 + months: 6 + + diff --git a/tests/test_embargo.py b/tests/test_embargo.py new file mode 100644 index 000000000..4bbf79875 --- /dev/null +++ b/tests/test_embargo.py @@ -0,0 +1,40 @@ +from .base_config_test import BaseConfigTest, fmod + +import webtest +import os + +from six.moves.urllib.parse import urlencode + + +# ============================================================================ +class TestEmbargoApp(BaseConfigTest): + @classmethod + def setup_class(cls): + super(TestEmbargoApp, cls).setup_class('config_test_access.yaml') + + def test_embargo_before(self): + resp = self.testapp.get('/pywb-embargo-before/20140126201054mp_/http://www.iana.org/domains/reserved', status=404) + + resp = self.testapp.get('/pywb-embargo-before/20140127mp_/http://example.com/', status=200) + assert resp.headers['Content-Location'] == 'http://localhost:80/pywb-embargo-before/20140127171251mp_/http://example.com' + + def test_embargo_after(self): + resp = self.testapp.get('/pywb-embargo-after/20140126201054mp_/http://www.iana.org/domains/reserved', status=200) + + resp = self.testapp.get('/pywb-embargo-after/20140127mp_/http://example.com/', status=200) + assert resp.headers['Content-Location'] == 
'http://localhost:80/pywb-embargo-after/20130729195151mp_/http://test@example.com/' + + def test_embargo_older(self): + resp = self.testapp.get('/pywb-embargo-older/20140126201054mp_/http://www.iana.org/domains/reserved', status=404) + + resp = self.testapp.get('/pywb-embargo-older/20140127mp_/http://example.com/', status=404) + + def test_embargo_newer(self): + resp = self.testapp.get('/pywb-embargo-newer/20140126201054mp_/http://www.iana.org/domains/reserved', status=200) + + resp = self.testapp.get('/pywb-embargo-newer/20140127mp_/http://example.com/', status=200) + assert resp.headers['Content-Location'] == 'http://localhost:80/pywb-embargo-newer/20140127171251mp_/http://example.com' + + + + From 554b95692e237be50c88a006cc430b59150ffae0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 12 May 2021 14:45:38 -0700 Subject: [PATCH 2/5] acl/embargo work: - support acl access value 'allow_ignore_embargo' for overriding embargo - support 'user' in acl setting, matched with value of 'X-Pywb-ACL-User' header - support passing through 'X-Pywb-ACL-User' setting to warcserver - aclmanager: support -u/--user param for adding, removing and matching rules - tests: add test for 'allow_ignore_embargo', user-specific acl rule matching --- pywb/apps/rewriterapp.py | 2 ++ pywb/manager/aclmanager.py | 23 +++++++----- pywb/rewrite/rewriteinputreq.py | 6 ++++ pywb/warcserver/access_checker.py | 54 +++++++++++++++++++++------- pywb/warcserver/handlers.py | 4 ++- sample_archive/access/pywb.aclj | 5 +++ tests/config_test_access.yaml | 11 ++++++ tests/test_acl.py | 7 ++++ tests/test_acl_manager.py | 58 +++++++++++++++++++++++++++++++ tests/test_embargo.py | 16 +++++++++ 10 files changed, 164 insertions(+), 22 deletions(-) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 66b456072..9df559cd9 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -704,6 +704,8 @@ def _do_req(self, inputreq, wb_url, kwargs, skip_record): headers = 
{'Content-Length': str(len(req_data)), 'Content-Type': 'application/request'} + headers.update(inputreq.warcserver_headers) + if skip_record: headers['Recorder-Skip'] = '1' diff --git a/pywb/manager/aclmanager.py b/pywb/manager/aclmanager.py index d4ce218bb..3305b9124 100644 --- a/pywb/manager/aclmanager.py +++ b/pywb/manager/aclmanager.py @@ -12,7 +12,7 @@ class ACLManager(CollectionsManager): SURT_RX = re.compile('([^:.]+[,)])+') - VALID_ACCESS = ('allow', 'block', 'exclude') + VALID_ACCESS = ('allow', 'block', 'exclude', 'allow_ignore_embargo') DEFAULT_FILE = 'access-rules.aclj' @@ -167,9 +167,9 @@ def add_rule(self, r): :param argparse.Namespace r: The argparse namespace representing the rule to be added :rtype: None """ - return self._add_rule(r.url, r.access, r.exact_match) + return self._add_rule(r.url, r.access, r.exact_match, r.user) - def _add_rule(self, url, access, exact_match=False): + def _add_rule(self, url, access, exact_match=False, user=None): """Adds an rule to the acl file :param str url: The URL for the rule @@ -185,12 +185,14 @@ def _add_rule(self, url, access, exact_match=False): acl['timestamp'] = '-' acl['access'] = access acl['url'] = url + if user: + acl['user'] = user i = 0 replace = False for rule in self.rules: - if acl['urlkey'] == rule['urlkey'] and acl['timestamp'] == rule['timestamp']: + if acl['urlkey'] == rule['urlkey'] and acl['timestamp'] == rule['timestamp'] and acl.get('user') == rule.get('user'): replace = True break @@ -255,7 +257,7 @@ def remove_rule(self, r): i = 0 urlkey = self.to_key(r.url, r.exact_match) for rule in self.rules: - if urlkey == rule['urlkey']: + if urlkey == rule['urlkey'] and r.user == rule.get('user'): acl = self.rules.pop(i) print('Removed Rule:') self.print_rule(acl) @@ -285,7 +287,7 @@ def find_match(self, r): :rtype: None """ access_checker = AccessChecker(self.acl_file, '') - rule = access_checker.find_access_rule(r.url) + rule = access_checker.find_access_rule(r.url, acl_user=r.user) 
print('Matched rule:') print('') @@ -344,15 +346,18 @@ def command(name, *args, **kwargs): else: op.add_argument(arg) + if kwargs.get('user_opt'): + op.add_argument('-u', '--user') + if kwargs.get('exact_opt'): op.add_argument('-e', '--exact-match', action='store_true', default=False) op.set_defaults(acl_func=kwargs['func']) - command('add', 'coll_name', 'url', 'access', func=cls.add_rule, exact_opt=True) - command('remove', 'coll_name', 'url', func=cls.remove_rule, exact_opt=True) + command('add', 'coll_name', 'url', 'access', func=cls.add_rule, exact_opt=True, user_opt=True) + command('remove', 'coll_name', 'url', func=cls.remove_rule, exact_opt=True, user_opt=True) command('list', 'coll_name', func=cls.list_rules) command('validate', 'coll_name', func=cls.validate_save) - command('match', 'coll_name', 'url', 'default_access', func=cls.find_match) + command('match', 'coll_name', 'url', 'default_access', func=cls.find_match, user_opt=True) command('importtxt', 'coll_name', 'filename', 'access', func=cls.add_excludes) diff --git a/pywb/rewrite/rewriteinputreq.py b/pywb/rewrite/rewriteinputreq.py index 11d12d92f..6eab1ce08 100644 --- a/pywb/rewrite/rewriteinputreq.py +++ b/pywb/rewrite/rewriteinputreq.py @@ -26,6 +26,7 @@ def __init__(self, env, urlkey, url, rewriter): self.url = url self.rewriter = rewriter self.extra_cookie = None + self.warcserver_headers = {} is_proxy = ('wsgiprox.proxy_host' in env) @@ -82,6 +83,11 @@ def get_req_headers(self): elif name in ('HTTP_IF_MODIFIED_SINCE', 'HTTP_IF_UNMODIFIED_SINCE'): continue + elif name == 'HTTP_X_PYWB_ACL_USER': + name = name[5:].title().replace('_', '-') + self.warcserver_headers[name] = value + continue + elif name == 'HTTP_X_FORWARDED_PROTO': name = 'X-Forwarded-Proto' if self.splits: diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index 85735f98c..183bd05b4 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -209,13 +209,15 @@ def 
create_access_source(self, filename): else: raise Exception('Invalid Access Source: ' + filename) - def find_access_rule(self, url, ts=None, urlkey=None, collection=None): + def find_access_rule(self, url, ts=None, urlkey=None, collection=None, acl_user=None): """Attempts to find the access control rule for the supplied URL otherwise returns the default rule :param str url: The URL for the rule to be found :param str|None ts: A timestamp (not used) :param str|None urlkey: The access control url key + :param str|None collection: The collection, if any + :param str|None acl_user: The access control user, if any :return: The access control rule for the supplied URL if one exists otherwise the default rule :rtype: CDXObject @@ -237,6 +239,9 @@ def find_access_rule(self, url, ts=None, urlkey=None, collection=None): tld = key.split(b',')[0] + last_obj = None + last_key = None + for acl in acl_iter: # skip empty/invalid lines @@ -244,41 +249,56 @@ def find_access_rule(self, url, ts=None, urlkey=None, collection=None): continue acl_key = acl.split(b' ')[0] + acl_obj = None + + if acl_key != last_key and last_obj: + return last_obj if key_exact == acl_key: - return CDXObject(acl) + acl_obj = CDXObject(acl) if key.startswith(acl_key): - return CDXObject(acl) + acl_obj = CDXObject(acl) + + if acl_obj: + user = acl_obj.get('user') + if user == acl_user: + return acl_obj + elif not user: + last_key = acl_key + last_obj = acl_obj # if acl key already less than first tld, # no match can be found if acl_key < tld: break - return self.default_rule + return last_obj if last_obj else self.default_rule - def __call__(self, res): + def __call__(self, res, acl_user): """Wraps the cdx iter in the supplied tuple returning a the wrapped cdx iter and the other members of the supplied tuple in same order :param tuple res: The result tuple + :param str acl_user: The user associated with this request (optional) :return: An tuple """ cdx_iter, errs = res - return self.wrap_iter(cdx_iter), errs 
+ return self.wrap_iter(cdx_iter, acl_user), errs - def wrap_iter(self, cdx_iter): + def wrap_iter(self, cdx_iter, acl_user): """Wraps the supplied cdx iter and yields cdx objects that contain the access control results for the cdx object being yielded :param cdx_iter: The cdx object iterator to be wrapped + :param str acl_user: The user associated with this request (optional) :return: The wrapped cdx object iterator """ last_rule = None last_url = None + last_user = None rule = None for cdx in cdx_iter: @@ -290,27 +310,37 @@ def wrap_iter(self, cdx_iter): yield cdx continue - # first, check embargo and use that setting if any - access = self.check_embargo(url, timestamp) - if not access and self.aggregator: + access = None + if self.aggregator: # TODO: optimization until date range support is included - if url == last_url: + if url == last_url and acl_user == last_user: rule = last_rule else: rule = self.find_access_rule(url, timestamp, cdx.get('urlkey'), - cdx.get('source-coll')) + cdx.get('source-coll'), + acl_user) access = rule.get('access', 'exclude') + print('ACCESS', access, rule) + if access != 'allow_ignore_embargo' and access != 'exclude': + embargo_access = self.check_embargo(url, timestamp) + if embargo_access and embargo_access != 'allow': + access = embargo_access + if access == 'exclude': continue if not access: access = self.default_rule['access'] + if access == 'allow_ignore_embargo': + access = 'allow' + cdx['access'] = access yield cdx last_rule = rule last_url = url + last_user = acl_user diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index 70a2ffc3e..0abd5466c 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -66,8 +66,10 @@ def _load_index_source(self, params): cdx_iter = self.fuzzy(self.index_source, params) + acl_user = params['_input_req'].env.get("HTTP_X_PYWB_ACL_USER") + if self.access_checker: - cdx_iter = self.access_checker(cdx_iter) + cdx_iter = self.access_checker(cdx_iter, 
acl_user) return cdx_iter diff --git a/sample_archive/access/pywb.aclj b/sample_archive/access/pywb.aclj index 84b7e417e..44382df37 100644 --- a/sample_archive/access/pywb.aclj +++ b/sample_archive/access/pywb.aclj @@ -1,7 +1,12 @@ org,iana)/exact/match/first/line/aclj### - {"access": "allow", "url": "https://www.iana.org/exact/match/first/line/aclj/"} org,iana)/about - {"access": "block"} +org,iana)/about - {"access": "allow", "user": "staff"} org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"} org,iana)/_css - {"access": "exclude"} org,iana)/### - {"access": "allow"} org,iana)/ - {"access": "exclude"} org,example)/?example=1 - {"access": "block"} +com,example)/?example=2 - {"access": "allow_ignore_embargo"} +com,example)/?example=1 - {"access": "allow_ignore_embargo", "user": "staff2"} +com,example)/?example=1 - {"access": "allow", "user": "staff"} +com,example)/ - {"access": "allow"} diff --git a/tests/config_test_access.yaml b/tests/config_test_access.yaml index 6e6d7a14a..8fb352f7c 100644 --- a/tests/config_test_access.yaml +++ b/tests/config_test_access.yaml @@ -52,5 +52,16 @@ collections: years: 1 months: 6 + pywb-embargo-acl: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + embargo: + older: + years: 1 + + acl_paths: + - ./sample_archive/access/pywb.aclj + + diff --git a/tests/test_acl.py b/tests/test_acl.py index 2554d2e51..5ed532d18 100644 --- a/tests/test_acl.py +++ b/tests/test_acl.py @@ -40,6 +40,13 @@ def test_blocked_url(self): assert 'Access Blocked' in resp.text + def test_allow_via_acl_header(self): + resp = self.query('http://www.iana.org/about/') + + assert len(resp.text.splitlines()) == 1 + + resp = self.testapp.get('/pywb/mp_/http://www.iana.org/about/', headers={"X-Pywb-Acl-User": "staff"}, status=200) + def test_allowed_more_specific(self): resp = self.query('http://www.iana.org/_css/2013.1/fonts/opensans-semibold.ttf') diff --git a/tests/test_acl_manager.py b/tests/test_acl_manager.py index 
16f2239dc..4e732be1c 100644 --- a/tests/test_acl_manager.py +++ b/tests/test_acl_manager.py @@ -40,6 +40,16 @@ def test_acl_add_surt(self): assert fh.read() == """\ com,example, - {"access": "exclude", "url": "com,example,"} com,example)/ - {"access": "allow", "url": "http://example.com/"} +""" + + def test_acl_add_with_user(self): + wb_manager(['acl', 'add', self.acl_filename, 'http://example.com/', 'block', '-u', 'public']) + + with open(self.acl_filename, 'rt') as fh: + assert fh.read() == """\ +com,example, - {"access": "exclude", "url": "com,example,"} +com,example)/ - {"access": "block", "url": "http://example.com/", "user": "public"} +com,example)/ - {"access": "allow", "url": "http://example.com/"} """ def test_acl_list(self, capsys): @@ -51,6 +61,7 @@ def test_acl_list(self, capsys): Rules for %s from %s: com,example, - {"access": "exclude", "url": "com,example,"} +com,example)/ - {"access": "block", "url": "http://example.com/", "user": "public"} com,example)/ - {"access": "allow", "url": "http://example.com/"} """ % (self.acl_filename, self.acl_filename) @@ -71,16 +82,63 @@ def test_acl_match(self, capsys): com,example, - {"access": "exclude", "url": "com,example,"} +""" + + def test_acl_match_user(self, capsys): + wb_manager(['acl', 'match', self.acl_filename, 'http://example.com/foo', '-u', 'public']) + + out, err = capsys.readouterr() + + assert out == """\ +Matched rule: + + com,example)/ - {"access": "block", "url": "http://example.com/", "user": "public"} + +""" + + def test_acl_match_unknown_user(self, capsys): + wb_manager(['acl', 'match', self.acl_filename, 'http://example.com/foo', '-u', 'data']) + + out, err = capsys.readouterr() + + assert out == """\ +Matched rule: + + com,example)/ - {"access": "allow", "url": "http://example.com/"} + +""" + + def test_acl_match_default_user(self, capsys): + wb_manager(['acl', 'match', self.acl_filename, 'http://example.com/foo']) + + out, err = capsys.readouterr() + + assert out == """\ +Matched rule: + + 
com,example)/ - {"access": "allow", "url": "http://example.com/"} + """ def test_remove_acl(self): wb_manager(['acl', 'remove', self.acl_filename, 'com,example,']) + with open(self.acl_filename, 'rt') as fh: + assert fh.read() == """\ +com,example)/ - {"access": "block", "url": "http://example.com/", "user": "public"} +com,example)/ - {"access": "allow", "url": "http://example.com/"} +""" + + def test_remove_acl_user(self): + wb_manager(['acl', 'remove', self.acl_filename, 'com,example)/', '-u', 'public']) + with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\ com,example)/ - {"access": "allow", "url": "http://example.com/"} """ + + def test_acl_add_exact(self): wb_manager(['acl', 'add', '--exact-match', self.acl_filename, 'example.com', 'block']) diff --git a/tests/test_embargo.py b/tests/test_embargo.py index 4bbf79875..4c1ab21e2 100644 --- a/tests/test_embargo.py +++ b/tests/test_embargo.py @@ -35,6 +35,22 @@ def test_embargo_newer(self): resp = self.testapp.get('/pywb-embargo-newer/20140127mp_/http://example.com/', status=200) assert resp.headers['Content-Location'] == 'http://localhost:80/pywb-embargo-newer/20140127171251mp_/http://example.com' + def test_embargo_ignore_acl(self): + # embargoed + resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/', status=404) + + # ignore embargo + resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=2', status=200) + + + def test_embargo_ignore_acl_with_header_only(self): + # ignore embargo with custom header only + headers = {"X-Pywb-ACL-User": "staff2"} + resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=200, headers=headers) + + resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=404) + + From 1fc43004b9e99c968aef067ee537736d95b13c55 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 May 2021 22:58:34 -0700 Subject: [PATCH 3/5] docs: add 
docs for new embargo system! --- docs/manual/access-control.rst | 142 ++++++++++++++++++++++++++++-- docs/manual/cdxserver_api.rst | 2 +- pywb/version.py | 2 +- pywb/warcserver/access_checker.py | 1 - 4 files changed, 135 insertions(+), 12 deletions(-) diff --git a/docs/manual/access-control.rst b/docs/manual/access-control.rst index 79e922a98..1da8d54b8 100644 --- a/docs/manual/access-control.rst +++ b/docs/manual/access-control.rst @@ -1,15 +1,84 @@ .. _access-control: -Access Control System ---------------------- +Embargo and Access Control +-------------------------- + +The embargo system allows for date-based rules to block access to captures based on their capture dates. + +The access controls system provides additional URL-based rules to allow, block or exclude access to specific URL prefixes or exact URLs. + +The embargo and access control rules are configured per collection. + +Embargo Settings +================ + +The embargo system allows restricting access to all URLs within a collection based on the timestamp of each URL. +Access to these resources is 'embargoed' until the date range is adjusted or the time interval passes. + +The embargo can be used to disallow access to captures based on following criteria: +- Captures before an exact date +- Captures after an exact date +- Captures newer than a time interval +- Captures older than a time interval + +Embargo Before/After Exact Date +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To block access to all captures before or after a specific date, use the ``before`` or ``after`` embargo blocks +with a specific timestamp. + +For example, the following blocks access to all URLs captured before 2020-12-26 in the collection ``embargo-before``:: + + embargo-before: + index_paths: ... + archive_paths: ... + embargo: + before: '20201226' + + +The following blocks access to all URLs captured on or after 2020-12-26 in collection ``embargo-after``:: + + embargo-after: + index_paths: ... + archive_paths: ... 
+ embargo: + after: '20201226' + +Embargo By Time Interval +^^^^^^^^^^^^^^^^^^^^^^^^ + +The embargo can also be set for a relative time interval, consisting of years, months, weeks and/or days. + + +For example, the following blocks access to all URLs newer than 1 year:: + + embargo-newer: + ... + embargo: + newer: + years: 1 + + + +The following blocks access to all URLs older than 1 year, 2 months, 3 weeks and 4 days:: + + embargo-older: + ... + embargo: + older: + years: 1 + months: 2 + weeks: 3 + days: 4 + + +Any combination of years, months, weeks and days can be used (as long as at least one is provided) for the ``newer`` or ``older`` embargo settings. -The access controls system allows for a flexible configuration of rules to allow, -block or exclude access to individual urls by longest-prefix match. Access Control Files (.aclj) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Access controls are set in one or more access control JSON files (.aclj), sorted in reverse alphabetical order. +URL-based access controls are set in one or more access control JSON files (.aclj), sorted in reverse alphabetical order. To determine the best match, a binary search is used (similar to CDXJ) lookup and then the best match is found forward. An .aclj file may look as follows:: @@ -22,6 +91,8 @@ An .aclj file may look as follows:: Each JSON entry contains an ``access`` field and the original ``url`` field that was used to convert to the SURT (if any). +The JSON entry may also contain a ``user`` field, as explained below. 
+ The prefix consists of a SURT key and a ``-`` (currently reserved for a timestamp/date range field to be added later) Given these rules, a user would: @@ -30,19 +101,53 @@ Given these rules, a user would: * would receive a 404 not found error when viewing ``http://httpbin.org/anything`` (exclude) -Access Types: allow, block, exclude -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Access Types: allow, block, exclude, allow_ignore_embargo +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The available access types are as follows: - ``exclude`` - when matched, results are excluded from the index, as if they do not exist. User will receive a 404. - ``block`` - when matched, results are not excluded from the index, marked with ``access: block``, but access to the actual is blocked. User will see a 451 -- ``allow`` - full access to the index and the resource. +- ``allow`` - full access to the index and the resource, but may be overridden by embargo +- ``allow_ignore_embargo`` - full access to the index and resource, overriding any embargo settings The difference between ``exclude`` and ``block`` is that when blocked, the user can be notified that access is blocked, while with exclude, no trace of the resource is presented to the user. -The use of ``allow`` is useful to provide access to more specific resources within a broader block/exclude rule. +The use of ``allow`` is useful to provide access to more specific resources within a broader block/exclude rule, while ``allow_ignore_embargo`` +can be used to override any embargo settings. + +If both are present, the embargo restrictions are checked first and take precedence, unless the ``allow_ignore_embargo`` option is used +to override the embargo. + + +User-Based Access Controls +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The access control rules can further be customized by specifying different permissions for different 'users'. 
Since pywb does not have a user system, +a special header, ``X-Pywb-ACL-User`` can be used to indicate a specific user. + +This setting is designed to allow a more privileged user to access additional settings or override an embargo. + +For example, the following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access for the ``staff`` user:: + + com,example)/restricted - {"access": "allow", "user": "staff"} + com,example)/restricted - {"access": "block"} + + +Combined with the embargo settings, this can also be used to override the embargo for internal organizational users, while keeping the embargo for general access:: + + com,example)/restricted - {"access": "allow_ignore_embargo", "user": "staff"} + com,example)/restricted - {"access": "allow"} + +To make this work, pywb must be running behind an Apache or Nginx system that is configured to set ``X-Pywb-ACL-User: staff`` based on certain settings. + +For example, this header may be set based on IP range, or based on password authentication. + +Further examples of how to set this header will be provided in the deployments section. + +**Note: DON'T use custom user-based rules without configuring proper authentication on the Apache or Nginx frontend to set or remove this header, otherwise the ``X-Pywb-ACL-User`` can easily be faked** + Access Error Messages ^^^^^^^^^^^^^^^^^^^^^ @@ -73,6 +178,11 @@ The URL supplied can be a URL or a SURT prefix. If a SURT is supplied, it is use wb-manager acl add com, allow +A specific user for user-based rules can also be specified, for example to add ``allow_ignore_embargo`` for user ``staff`` only, run:: + + wb-manager acl add http://httpbin.org/anything/something allow_ignore_embargo -u staff + + By default, access control rules apply to a prefix of a given URL or SURT. To have the rule apply only to the exact match, use:: @@ -136,6 +246,20 @@ set merge-sorted to find the best match (very similar to the CDXJ index lookup). 
Note: It might make sense to separate ``allows.aclj`` and ``blocks.aclj`` into individual files for organizational reasons, but there is no specific need to keep more than one access control files. +Finally, ACLJ and embargo settings combined for the same collection might look as follows:: + + collections: + test: + ... + embargo: + newer: + days: 366 + + acl_paths: + - ./path/to/allows.aclj + - ./path/to/blocks.aclj + + Default Access ^^^^^^^^^^^^^^ diff --git a/docs/manual/cdxserver_api.rst b/docs/manual/cdxserver_api.rst index 54ea7332f..66b5108ef 100644 --- a/docs/manual/cdxserver_api.rst +++ b/docs/manual/cdxserver_api.rst @@ -182,7 +182,7 @@ the following modifiers: ``fields`` -^^^^^^ +^^^^^^^^^^ The ``fields`` param can be used to specify which fields to include in the output. The standard available fields are usually: ``urlkey``, diff --git a/pywb/version.py b/pywb/version.py index ff40f4b4a..844b041d4 100644 --- a/pywb/version.py +++ b/pywb/version.py @@ -1,4 +1,4 @@ -__version__ = '2.6.0.dev0' +__version__ = '2.6.0b0' if __name__ == '__main__': print(__version__) diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index 183bd05b4..46cd7acd9 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -323,7 +323,6 @@ def wrap_iter(self, cdx_iter, acl_user): access = rule.get('access', 'exclude') - print('ACCESS', access, rule) if access != 'allow_ignore_embargo' and access != 'exclude': embargo_access = self.check_embargo(url, timestamp) if embargo_access and embargo_access != 'allow': From cae2c2406ca833e2664c0d9769b59ccb420a08cd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 May 2021 15:26:01 -0700 Subject: [PATCH 4/5] docs: add info on how to configure ACL header with short examples to usage page. 
sample-deploy: add examples of configuring X-pywb-ACL-user header based on IP for nginx and apache sample deployments --- docs/manual/access-control.rst | 4 +++- docs/manual/usage.rst | 44 ++++++++++++++++++++++++++++++++++ sample-deploy/pywb-apache.conf | 9 +++++++ sample-deploy/pywb-nginx.conf | 17 +++++++++++++ 4 files changed, 73 insertions(+), 1 deletion(-) diff --git a/docs/manual/access-control.rst b/docs/manual/access-control.rst index 1da8d54b8..e287e42b3 100644 --- a/docs/manual/access-control.rst +++ b/docs/manual/access-control.rst @@ -146,7 +146,9 @@ For example, this header may be set based on IP range, or based on password auth Further examples of how to set this header will be provided in the deployments section. -**Note: DON'T use custom user-based rules without configuring proper authentication on the Apache or Nginx frontend to set or remove this header, otherwise the ``X-Pywb-ACL-User`` can easily be faked** +**Note: Do not use the user-based rules without configuring proper authentication on an Apache or Nginx frontend to set or remove this header, otherwise the 'X-Pywb-ACL-User' can easily be faked.** + +See the :ref:`config-acl-header` section in Usage for examples on how to configure this header. Access Error Messages diff --git a/docs/manual/usage.rst b/docs/manual/usage.rst index 7ee73df2c..647917b67 100644 --- a/docs/manual/usage.rst +++ b/docs/manual/usage.rst @@ -293,6 +293,50 @@ Then, in your config, simply include: The configuration assumes uwsgi is started with ``uwsgi uwsgi.ini`` +.. _config-acl-header: + +Configuring Access Control Header +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :ref:`access-control` system allows users to be granted different access settings based on the value of an ACL header, ``X-pywb-ACL-user``. + +The header can be set by Nginx or Apache to grant custom access priviliges based on the IP address, auth settings, or a combination of rules. 
+ +For example, to set the value of the header to ``staff`` if the IP of the request is from designated local IP ranges (127.0.0.1, 192.168.1.0/24), the following can be added to the configs. + +For Nginx:: + + geo $acl_user { + # ensure user is set to empty by default + default ""; + + # optional: add IP ranges to allow privileged access + 127.0.0.1 "staff"; + 192.168.0.0/24 "staff"; + } + + ... + location /wayback/ { + ... + uwsgi_param HTTP_X_PYWB_ACL_USER $acl_user; + } + + +For Apache:: + + <If "-R '192.168.1.0/24' || -R '127.0.0.1'"> + RequestHeader set X-Pywb-ACL-User staff + </If> + # ensure header is cleared if no match + <Else> + RequestHeader set X-Pywb-ACL-User "" + </Else> + +} + + + + Running on Subdirectory Path ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/sample-deploy/pywb-apache.conf b/sample-deploy/pywb-apache.conf index 1fdfab51c..96c5ff5b2 100644 --- a/sample-deploy/pywb-apache.conf +++ b/sample-deploy/pywb-apache.conf @@ -14,4 +14,13 @@ # required: proxy pass to pywb ProxyPass /wayback uwsgi://pywb:8081/ + # optional: set custom header based on IP ranges + <If "-R '192.168.1.0/24' || -R '127.0.0.1'"> + RequestHeader set X-Pywb-ACL-User staff + </If> + # ensure header is cleared if no match + <Else> + RequestHeader set X-Pywb-ACL-User "" + </Else> + diff --git a/sample-deploy/pywb-nginx.conf b/sample-deploy/pywb-nginx.conf index dd22ea698..23c553379 100644 --- a/sample-deploy/pywb-nginx.conf +++ b/sample-deploy/pywb-nginx.conf @@ -1,5 +1,18 @@ # nginx config for running under /wayback/ prefix + +# set acl_user, defaulting to empty (any public user) +geo $acl_user { + # ensure user is set to empty by default + default ""; + + # optional: add IP ranges to allow privileged access + 127.0.0.1 "staff"; + 192.168.0.0/24 "staff"; +} + + + server { listen 80; @@ -14,8 +27,12 @@ server { uwsgi_pass pywb:8081; + include uwsgi_params; uwsgi_param UWSGI_SCHEME $scheme; + + # pass acl_user (which should be empty by default) + uwsgi_param HTTP_X_PYWB_ACL_USER $acl_user; } } From 234ee075230b74710dc60669c1d87b15dc351cdf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 May 2021
15:33:11 -0700 Subject: [PATCH 5/5] docs: fix access control page header, text tweaks --- docs/manual/access-control.rst | 3 +++ docs/manual/usage.rst | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/manual/access-control.rst b/docs/manual/access-control.rst index e287e42b3..a9fd89b8a 100644 --- a/docs/manual/access-control.rst +++ b/docs/manual/access-control.rst @@ -75,6 +75,9 @@ The following blocks access to all URLs older than 1 year, 2 months, 3 weeks and Any combination of years, months, weeks and days can be used (as long as at least one is provided) for the ``newer`` or ``older`` embargo settings. +Access Control Settings +======================= + Access Control Files (.aclj) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/manual/usage.rst b/docs/manual/usage.rst index 647917b67..d322ad20d 100644 --- a/docs/manual/usage.rst +++ b/docs/manual/usage.rst @@ -300,9 +300,9 @@ Configuring Access Control Header The :ref:`access-control` system allows users to be granted different access settings based on the value of an ACL header, ``X-pywb-ACL-user``. -The header can be set by Nginx or Apache to grant custom access priviliges based on the IP address, auth settings, or a combination of rules. +The header can be set via Nginx or Apache to grant custom access privileges based on IP address, password, or another combination of rules. -For example, to set the value of the header to ``staff`` if the IP of the request is from designated local IP ranges (127.0.0.1, 192.168.1.0/24), the following can be added to the configs. +For example, to set the value of the header to ``staff`` if the IP of the request is from designated local IP ranges (127.0.0.1, 192.168.1.0/24), the following settings can be added to the configs: For Nginx::