Skip to content

Commit

Permalink
Use ckan.datapusher.callback_url_base when downloading data
Browse files Browse the repository at this point in the history
The `ckan.datapusher.callback_url_base` config option can be used when
the host where DataPusher is running can not access the public CKAN
site URL. It defines an alternative internal host that will be used in
the API calls to CKAN that the DataPusher performs.
However, this was not applied when downloading the actual data of a
resource uploaded to CKAN, whose URL points at the public CKAN site. This
change replaces the host in the resource URL if the resource is an upload
and a value for `ckan.datapusher.callback_url_base` was provided.
  • Loading branch information
amercader committed Nov 28, 2022
1 parent 5dbd4e7 commit 1985b5d
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 1 deletion.
16 changes: 15 additions & 1 deletion datapusher/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,13 @@ def get_resource(resource_id, ckan_url, api_key):
return r.json()['result']


def get_data_response(url, **kwargs):
    """Issue a GET request for ``url`` and return the response object.

    All keyword arguments are forwarded unchanged to ``requests.get``.
    The call is kept in its own module-level function so that the
    download step can be patched independently (the acceptance tests
    patch ``datapusher.jobs.get_data_response``).
    """
    return requests.get(url, **kwargs)


def validate_input(input):
# Especially validate metadata which is provided by the user
if 'metadata' not in input:
Expand Down Expand Up @@ -356,6 +363,13 @@ def push_to_datastore(task_id, input, dry_run=False):
'Only http, https, and ftp resources may be fetched.'
)

# if it's a local upload, check if we need to use an internal host instead of
# the public one on the resource url
if resource.get('url_type') == 'upload':

if data.get('ckan_url') and url[:url.index('/dataset')] != data['ckan_url'].rstrip('/'):
url = data['ckan_url'].rstrip('/') + url[url.index('/dataset'):]

# fetch the resource data
logger.info('Fetching from: {0}'.format(url))
headers = {}
Expand All @@ -368,7 +382,7 @@ def push_to_datastore(task_id, input, dry_run=False):
'verify': SSL_VERIFY, 'stream': True}
if USE_PROXY:
kwargs['proxies'] = {'http': DOWNLOAD_PROXY, 'https': DOWNLOAD_PROXY}
response = requests.get(url, **kwargs)
response = get_data_response(url, **kwargs)
response.raise_for_status()

cl = response.headers.get('content-length')
Expand Down
91 changes: 91 additions & 0 deletions tests/test_acceptance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@

import httpretty
import pytest
try:
import unittest.mock as mock
except ImportError:
import mock

import datapusher.main as main
import datapusher.jobs as jobs
Expand Down Expand Up @@ -546,3 +550,90 @@ def test_do_not_push_when_same_hash(self):
# res should be None because we didn't get to the part that
# returns something
assert not res, res

@httpretty.activate
def test_download_resource_with_callbach_url_base_on_uploads(self):
    """
    Uploaded resources must be downloaded from the internal CKAN host.

    Rather than re-registering all URLs, we'll consider the www.ckan.org host
    the internal host set via ckan.datapusher.callback_url_base, and
    ckan.example.org the public host used in the resource URLs.
    """

    # The resource URL points at the public host and is flagged as an upload
    source_url, res_url = self.register_urls(
        source_url='http://ckan.example.org/dataset/some_dataset/resource/{}/download'.format(self.resource_id)
    )
    httpretty.register_uri(
        httpretty.POST, res_url,
        body=json.dumps({
            'success': True,
            'result': {
                'id': '32h4345k34h5l345',
                'name': 'short name',
                'url': source_url,
                'format': 'csv',
                'url_type': 'upload',
            }
        }),
        content_type='application/json')

    data = {
        'api_key': self.api_key,
        'job_type': 'push_to_datastore',
        'metadata': {
            'ckan_url': 'http://%s/' % self.host,
            'resource_id': self.resource_id
        }
    }

    with mock.patch('datapusher.jobs.get_data_response') as m:

        # Abort the job as soon as the download is attempted: only the URL
        # handed to the download function matters here.
        m.side_effect = RuntimeError()

        # pytest.raises makes the test fail if the download function is
        # never reached, instead of silently skipping the assertions below.
        with pytest.raises(RuntimeError):
            jobs.push_to_datastore('fake_id', data, True)

        assert m.called
        download_url = m.call_args[0][0]
        # The public host must have been swapped for the internal one
        assert download_url[:download_url.index('/dataset')] == 'http://www.ckan.org'

@httpretty.activate
def test_download_resource_with_callbach_url_base_on_external(self):
    """
    Resources that are not uploads must keep their original download URL.

    Rather than re-registering all URLs, we'll consider the www.ckan.org host
    the internal host set via ckan.datapusher.callback_url_base. The
    resource registered here has no 'url_type', so its URL must be left
    untouched.
    """

    source_url, res_url = self.register_urls()
    httpretty.register_uri(
        httpretty.POST, res_url,
        body=json.dumps({
            'success': True,
            'result': {
                'id': '32h4345k34h5l345',
                'name': 'short name',
                'url': source_url,
                'format': 'csv',
            }
        }),
        content_type='application/json')

    data = {
        'api_key': self.api_key,
        'job_type': 'push_to_datastore',
        'metadata': {
            'ckan_url': 'http://%s/' % self.host,
            'resource_id': self.resource_id
        }
    }

    with mock.patch('datapusher.jobs.get_data_response') as m:

        # Abort the job as soon as the download is attempted: only the URL
        # handed to the download function matters here.
        m.side_effect = RuntimeError()

        # pytest.raises makes the test fail if the download function is
        # never reached, instead of silently skipping the assertions below.
        with pytest.raises(RuntimeError):
            jobs.push_to_datastore('fake_id', data, True)

        assert m.called
        download_url = m.call_args[0][0]
        # The external host must not have been rewritten
        assert download_url.startswith('http://www.source.org')

0 comments on commit 1985b5d

Please sign in to comment.