From 1985b5d0d7363a23f54ce3a72407c73d559c633f Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 28 Nov 2022 12:48:04 +0100 Subject: [PATCH] Use ckan.datapusher.callback_url_base when downloading data The `ckan.datapusher.callback_url_base` config option can be used when the host where DataPusher is running cannot access the public CKAN site URL. It defines an alternative internal host that will be used in the API calls to CKAN that the DataPusher performs. But this was not applied when getting the actual data from an uploaded resource to CKAN, which has a CKAN URL. This change replaces the host in the resource URL if the resource is an upload and a value for `ckan.datapusher.callback_url_base` was provided. --- datapusher/jobs.py | 16 ++++++- tests/test_acceptance.py | 91 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/datapusher/jobs.py b/datapusher/jobs.py index 76828b7..8504a90 100644 --- a/datapusher/jobs.py +++ b/datapusher/jobs.py @@ -294,6 +294,13 @@ def get_resource(resource_id, ckan_url, api_key): return r.json()['result'] +def get_data_response(url, **kwargs): + + response = requests.get(url, **kwargs) + + return response + + def validate_input(input): # Especially validate metdata which is provided by the user if 'metadata' not in input: @@ -356,6 +363,13 @@ def push_to_datastore(task_id, input, dry_run=False): 'Only http, https, and ftp resources may be fetched.' 
) + # if it's a local upload, check if we need to use an internal host instead of + # the public one on the resource url + if resource.get('url_type') == 'upload': + + if data.get('ckan_url') and url[:url.index('/dataset')] != data['ckan_url'].rstrip('/'): + url = data['ckan_url'].rstrip('/') + url[url.index('/dataset'):] + # fetch the resource data logger.info('Fetching from: {0}'.format(url)) headers = {} @@ -368,7 +382,7 @@ def push_to_datastore(task_id, input, dry_run=False): 'verify': SSL_VERIFY, 'stream': True} if USE_PROXY: kwargs['proxies'] = {'http': DOWNLOAD_PROXY, 'https': DOWNLOAD_PROXY} - response = requests.get(url, **kwargs) + response = get_data_response(url, **kwargs) response.raise_for_status() cl = response.headers.get('content-length') diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 117d0ee..55b516d 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -16,6 +16,10 @@ import httpretty import pytest +try: + import unittest.mock as mock +except ImportError: + import mock import datapusher.main as main import datapusher.jobs as jobs @@ -546,3 +550,90 @@ def test_do_not_push_when_same_hash(self): # res should be None because we didn't get to the part that # returns something assert not res, res + + @httpretty.activate + def test_download_resource_with_callbach_url_base_on_uploads(self): + """ + Rather than re-registering all URLs, we'll consider the www.ckan.org host + the internal host set via ckan.datapusher.callback_url_base, and ckan.example.org + the public host used in the resource URLs + """ + + source_url, res_url = self.register_urls( + source_url='http://ckan.example.org/dataset/some_dataset/resource/{}/download'.format(self.resource_id) + ) + httpretty.register_uri( + httpretty.POST, res_url, + body=json.dumps({ + 'success': True, + 'result': { + 'id': '32h4345k34h5l345', + 'name': 'short name', + 'url': source_url, + 'format': 'csv', + 'url_type': 'upload', + } + }), + 
content_type='application/json') + + data = { + 'api_key': self.api_key, + 'job_type': 'push_to_datastore', + 'metadata': { + 'ckan_url': 'http://%s/' % self.host, + 'resource_id': self.resource_id + } + } + + with mock.patch('datapusher.jobs.get_data_response') as m: + + m.side_effect = RuntimeError() + try: + res = jobs.push_to_datastore('fake_id', data, True) + except RuntimeError: + + assert m.called + download_url = m.call_args[0][0] + assert download_url[:download_url.index('/dataset')] == 'http://www.ckan.org' + + @httpretty.activate + def test_download_resource_with_callbach_url_base_on_external(self): + """ + Rather than re-registering all URLs, we'll consider the www.ckan.org host + the internal host set via ckan.datapusher.callback_url_base, and ckan.example.org + the public host used in the resource URLs + """ + + source_url, res_url = self.register_urls() + httpretty.register_uri( + httpretty.POST, res_url, + body=json.dumps({ + 'success': True, + 'result': { + 'id': '32h4345k34h5l345', + 'name': 'short name', + 'url': source_url, + 'format': 'csv', + } + }), + content_type='application/json') + + data = { + 'api_key': self.api_key, + 'job_type': 'push_to_datastore', + 'metadata': { + 'ckan_url': 'http://%s/' % self.host, + 'resource_id': self.resource_id + } + } + + with mock.patch('datapusher.jobs.get_data_response') as m: + + m.side_effect = RuntimeError() + try: + res = jobs.push_to_datastore('fake_id', data, True) + except RuntimeError: + + assert m.called + download_url = m.call_args[0][0] + assert download_url.startswith('http://www.source.org')