diff --git a/.travis.yml b/.travis.yml index 03297afa..5f109b13 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,22 +1,44 @@ -language: python - -# the new trusty images of Travis cause build errors with psycopg2, see https://github.com/travis-ci/travis-ci/issues/8897 dist: trusty -group: deprecated-2017Q4 -python: - - "2.7" -env: - # - CKANVERSION=master - - CKANVERSION=2.8 - - CKANVERSION=2.7 +os: linux +language: python + install: - bash bin/travis-build.bash services: - - postgresql - redis -addons: - postgresql: "9.3" -script: sh bin/travis-run.sh -after_success: coveralls -sudo: required + - postgresql + +script: bash bin/travis-run.bash +before_install: + - pip install codecov +after_success: + - codecov + +jobs: + include: + - stage: Flake8 + python: 2.7 + env: FLAKE8=True + install: + - pip install flake8==3.5.0 + - pip install pycodestyle==2.3.0 + script: + - flake8 --version + # stop the build if there are Python syntax errors or undefined names + - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --exclude ckan + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + # - flake8 . --count --max-line-length=127 --statistics --exclude ckan --exit-zero + - stage: Tests + python: "2.7" + env: CKANVERSION=master + - python: "3.6" + env: CKANVERSION=master + - python: "2.7" + env: CKANVERSION=2.8 + - python: "2.7" + env: CKANVERSION=2.7 + +cache: + directories: + - $HOME/.cache/pip diff --git a/bin/travis-run.sh b/bin/travis-run.bash similarity index 58% rename from bin/travis-run.sh rename to bin/travis-run.bash index 880eb676..7a4e5aca 100644 --- a/bin/travis-run.sh +++ b/bin/travis-run.bash @@ -5,14 +5,7 @@ flake8 --version # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --exclude ckan,ckanext-xloader -nosetests --ckan \ - --nologcapture \ - --with-pylons=subdir/test.ini \ - --with-coverage \ - --cover-package=ckanext.xloader \ - --cover-inclusive \ - --cover-erase \ - --cover-tests +pytest --ckan-ini subdir/test.ini --cov=ckanext.xloader ckanext/xloader/tests # strict linting flake8 . --count --max-complexity=27 --max-line-length=127 --statistics --exclude ckan,ckanext-xloader diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py index 62cf20a5..f5f6609d 100644 --- a/ckanext/xloader/action.py +++ b/ckanext/xloader/action.py @@ -1,20 +1,23 @@ # encoding: utf-8 +from __future__ import absolute_import +from builtins import str import logging import json import datetime from dateutil.parser import parse as parse_date +import ckan.model as model import ckan.lib.navl.dictization_functions import ckan.logic as logic import ckan.plugins as p from ckan.logic import side_effect_free import ckanext.xloader.schema -import interfaces as xloader_interfaces -import jobs -import db +from . import interfaces as xloader_interfaces +from . import jobs +from . 
import db try: enqueue_job = p.toolkit.enqueue_job except AttributeError: @@ -108,7 +111,7 @@ def xloader_submit(context, data_dict): if existing_task.get('state') == 'pending': import re # here because it takes a moment to load queued_res_ids = [ - re.search(r"'resource_id': u'([^']+)'", + re.search(r"'resource_id': u?'([^']+)'", job.description).groups()[0] for job in get_queue().get_jobs() if 'xloader_to_datastore' in str(job) # filter out test_job etc @@ -143,6 +146,10 @@ def xloader_submit(context, data_dict): context['ignore_auth'] = True context['user'] = '' # benign - needed for ckan 2.5 + + model = context['model'] + original_session = model.Session + model.Session = model.meta.create_local_session() p.toolkit.get_action('task_status_update')(context, task) data = { @@ -168,6 +175,7 @@ def xloader_submit(context, data_dict): job = _enqueue(jobs.xloader_data_into_datastore, [data], timeout=timeout) except Exception: log.exception('Unable to enqueued xloader res_id=%s', res_id) + model.Session = original_session return False log.debug('Enqueued xloader job=%s res_id=%s', job.id, res_id) @@ -177,6 +185,7 @@ def xloader_submit(context, data_dict): task['state'] = 'pending' task['last_updated'] = str(datetime.datetime.utcnow()), p.toolkit.get_action('task_status_update')(context, task) + model.Session = original_session return True diff --git a/ckanext/xloader/cli.py b/ckanext/xloader/cli.py index 54fb43a0..b5a347b1 100644 --- a/ckanext/xloader/cli.py +++ b/ckanext/xloader/cli.py @@ -1,3 +1,4 @@ +from __future__ import print_function import sys import logging @@ -62,7 +63,7 @@ def __init__(self, name): def command(self): if not self.args: - print self.usage + print(self.usage) sys.exit(1) if self.args[0] == 'submit': if len(self.args) < 2: @@ -115,7 +116,7 @@ def _confirm_or_abort(self): ) answer = cli.query_yes_no(question, default=None) if not answer == 'yes': - print "Aborting..." 
+ print("Aborting...") sys.exit(0) def _submit_all_existing(self): @@ -281,7 +282,7 @@ def command(self): def _migrate_all(self): session = model.Session resource_count = session.query(model.Resource).filter_by(state='active').count() - print "Updating {} resource(s)".format(resource_count) + print("Updating {} resource(s)".format(resource_count)) resources_done = 0 for resource in session.query(model.Resource).filter_by(state='active'): resources_done += 1 @@ -289,15 +290,15 @@ def _migrate_all(self): prefix='[{}/{}]: '.format(resources_done, resource_count)) if resources_done % 100 == 0: - print "[{}/{}] done".format(resources_done, resource_count) - print "[{}/{}] done".format(resources_done, resource_count) + print("[{}/{}] done".format(resources_done, resource_count)) + print("[{}/{}] done".format(resources_done, resource_count)) def _migrate_resource(self, resource_id, prefix=''): data_dict = h.datastore_dictionary(resource_id) def print_status(status): if self.options.verbose: - print "{}{}: {}".format(prefix, resource_id, status) + print("{}{}: {}".format(prefix, resource_id, status)) if not data_dict: print_status("not found") @@ -333,9 +334,9 @@ def print_status(status): 'fields': fields }) print_status("updated") - except Exception, e: + except Exception as e: self.error_occured = True - print "{}: failed, {}".format(resource_id, e) + print("{}: failed, {}".format(resource_id, e)) def _handle_command_status(self): if self.error_occured: diff --git a/ckanext/xloader/controllers.py b/ckanext/xloader/controllers.py index e7395c38..a1ab3c3f 100644 --- a/ckanext/xloader/controllers.py +++ b/ckanext/xloader/controllers.py @@ -1,50 +1,7 @@ import ckan.plugins as p - -_ = p.toolkit._ +import ckanext.xloader.utils as utils class ResourceDataController(p.toolkit.BaseController): - def resource_data(self, id, resource_id): - - if p.toolkit.request.method == 'POST': - try: - p.toolkit.c.pkg_dict = \ - p.toolkit.get_action('xloader_submit')( - None, {'resource_id': resource_id} - ) - except p.toolkit.ValidationError: - pass - - p.toolkit.redirect_to( - controller='ckanext.xloader.controllers:ResourceDataController', - action='resource_data', - id=id, - resource_id=resource_id - ) - - try: - p.toolkit.c.pkg_dict = p.toolkit.get_action('package_show')( - None, {'id': id} - ) - p.toolkit.c.resource = p.toolkit.get_action('resource_show')( - None, {'id': resource_id} - ) - except (p.toolkit.ObjectNotFound, p.toolkit.NotAuthorized): - p.toolkit.abort(404, _('Resource not found')) - - try: - xloader_status = p.toolkit.get_action('xloader_status')( - None, {'resource_id': resource_id} - ) - except p.toolkit.ObjectNotFound: - xloader_status = {} - except p.toolkit.NotAuthorized: - p.toolkit.abort(403, _('Not authorized to see this page')) - - return p.toolkit.render('xloader/resource_data.html', - extra_vars={ - 'status': xloader_status, - 'resource': p.toolkit.c.resource, - 'pkg_dict': p.toolkit.c.pkg_dict, - }) + return utils.resource_data(id, resource_id) diff --git a/ckanext/xloader/db.py b/ckanext/xloader/db.py index 8d7742f5..6b033b0b 100644 --- a/ckanext/xloader/db.py +++ b/ckanext/xloader/db.py @@ -8,6 +8,7 @@ import datetime import json +import six import sqlalchemy @@ -108,7 +109,7 @@ def get_job(job_id): # Avoid SQLAlchemy "Unicode type received non-unicode bind param value" # warnings. 
if job_id: - job_id = unicode(job_id) + job_id = six.text_type(job_id) result = ENGINE.execute( JOBS_TABLE.select().where(JOBS_TABLE.c.job_id == job_id)).first() @@ -118,7 +119,7 @@ def get_job(job_id): # Turn the result into a dictionary representation of the job. result_dict = {} - for field in result.keys(): + for field in list(result.keys()): value = getattr(result, field) if value is None: result_dict[field] = value @@ -127,7 +128,7 @@ def get_job(job_id): elif isinstance(value, datetime.datetime): result_dict[field] = value.isoformat() else: - result_dict[field] = unicode(value) + result_dict[field] = six.text_type(value) result_dict['metadata'] = _get_metadata(job_id) result_dict['logs'] = _get_logs(job_id) @@ -178,14 +179,14 @@ def add_pending_job(job_id, job_type, api_key, # Turn strings into unicode to stop SQLAlchemy # "Unicode type received non-unicode bind param value" warnings. if job_id: - job_id = unicode(job_id) + job_id = six.text_type(job_id) if job_type: - job_type = unicode(job_type) + job_type = six.text_type(job_type) if result_url: - result_url = unicode(result_url) + result_url = six.text_type(result_url) if api_key: - api_key = unicode(api_key) - data = unicode(data) + api_key = six.text_type(api_key) + data = six.text_type(data) if not metadata: metadata = {} @@ -205,16 +206,16 @@ def add_pending_job(job_id, job_type, api_key, # Insert any (key, value) metadata pairs that the job has into the # metadata table. inserts = [] - for key, value in metadata.items(): + for key, value in list(metadata.items()): type_ = 'string' - if not isinstance(value, basestring): + if not isinstance(value, six.string_types): value = json.dumps(value) type_ = 'json' # Turn strings into unicode to stop SQLAlchemy # "Unicode type received non-unicode bind param value" warnings. - key = unicode(key) - value = unicode(value) + key = six.text_type(key) + value = six.text_type(value) inserts.append( {"job_id": job_id, @@ -261,12 +262,12 @@ def _validate_error(error): """ if error is None: return None - elif isinstance(error, basestring): + elif isinstance(error, six.string_types): return {"message": error} else: try: message = error["message"] - if isinstance(message, basestring): + if isinstance(message, six.string_types): return error else: raise InvalidErrorObjectError( @@ -291,19 +292,19 @@ def _update_job(job_id, job_dict): # Avoid SQLAlchemy "Unicode type received non-unicode bind param value" # warnings. if job_id: - job_id = unicode(job_id) + job_id = six.text_type(job_id) if "error" in job_dict: job_dict["error"] = _validate_error(job_dict["error"]) job_dict["error"] = json.dumps(job_dict["error"]) # Avoid SQLAlchemy "Unicode type received non-unicode bind param value" # warnings. - job_dict["error"] = unicode(job_dict["error"]) + job_dict["error"] = six.text_type(job_dict["error"]) # Avoid SQLAlchemy "Unicode type received non-unicode bind param value" # warnings. if "data" in job_dict: - job_dict["data"] = unicode(job_dict["data"]) + job_dict["data"] = six.text_type(job_dict["data"]) ENGINE.execute( JOBS_TABLE.update() @@ -448,7 +449,7 @@ def _get_metadata(job_id): """Return any metadata for the given job_id from the metadata table.""" # Avoid SQLAlchemy "Unicode type received non-unicode bind param value" # warnings. 
- job_id = unicode(job_id) + job_id = six.text_type(job_id) results = ENGINE.execute( METADATA_TABLE.select().where( @@ -466,7 +467,7 @@ def _get_logs(job_id): """Return any logs for the given job_id from the logs table.""" # Avoid SQLAlchemy "Unicode type received non-unicode bind param value" # warnings. - job_id = unicode(job_id) + job_id = six.text_type(job_id) results = ENGINE.execute( LOGS_TABLE.select().where(LOGS_TABLE.c.job_id == job_id)).fetchall() diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py index e380d361..e7d30ac1 100644 --- a/ckanext/xloader/jobs.py +++ b/ckanext/xloader/jobs.py @@ -1,28 +1,32 @@ +from __future__ import division +from __future__ import absolute_import import math import logging import hashlib import time import tempfile import json -import urlparse import datetime import traceback import sys +import six +from six.moves.urllib.parse import urlsplit import requests from rq import get_current_job import sqlalchemy as sa -from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound +import ckan.model as model +from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, c try: from ckan.plugins.toolkit import config except ImportError: from pylons import config import ckan.lib.search as search -import loader -import db -from job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError +from . import loader +from . import db +from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError if config.get('ckanext.xloader.ssl_verify') in ['False', 'FALSE', '0', False, 0]: SSL_VERIFY = False @@ -84,7 +88,7 @@ def xloader_data_into_datastore(input): errored = True except Exception as e: db.mark_job_as_errored( - job_id, traceback.format_tb(sys.exc_traceback)[-1] + repr(e)) + job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e)) job_dict['status'] = 'error' job_dict['error'] = str(e) log = logging.getLogger(__name__) @@ -135,13 +139,12 @@ def xloader_data_into_datastore_(input, job_dict): ckan_url = data['ckan_url'] resource_id = data['resource_id'] api_key = input.get('api_key') - try: - resource, dataset = get_resource_and_dataset(resource_id) + resource, dataset = get_resource_and_dataset(resource_id, api_key) except (JobError, ObjectNotFound) as e: # try again in 5 seconds just in case CKAN is slow at adding resource time.sleep(5) - resource, dataset = get_resource_and_dataset(resource_id) + resource, dataset = get_resource_and_dataset(resource_id, api_key) resource_ckan_url = '/dataset/{}/resource/{}' \ .format(dataset['name'], resource['id']) logger.info('Express Load starting: {}'.format(resource_ckan_url)) @@ -244,7 +247,7 @@ def _download_resource_data(resource, data, api_key, logger): ''' # check scheme url = resource.get('url') - scheme = urlparse.urlsplit(url).scheme + scheme = urlsplit(url).scheme if scheme not in ('http', 'https', 'ftp'): raise JobError( 'Only http, https, and ftp resources may be fetched.' 
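Note on the pattern above: six.text_type is unicode on Python 2 and str on Python 3, so SQLAlchemy's Unicode-typed columns always receive text rather than bytes, and six.moves.urllib.parse papers over the urlparse/urllib.parse rename. A minimal, self-contained sketch of both idioms (function names here are illustrative, not from the PR):

    import six
    from six.moves.urllib.parse import urlsplit

    def coerce_text(value):
        # unicode(value) on Python 2, str(value) on Python 3; falsy values
        # pass through, mirroring the `if job_id:` guards in db.py.
        return six.text_type(value) if value else value

    def is_fetchable(url):
        # The same scheme whitelist _download_resource_data enforces.
        return urlsplit(url).scheme in ('http', 'https', 'ftp')

    assert coerce_text(42) == u'42'
    assert is_fetchable('https://example.com/data.csv')
    assert not is_fetchable('file:///etc/passwd')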
@@ -482,12 +485,18 @@ def update_resource(resource, patch_only=False): get_action(action)(context, resource) -def get_resource_and_dataset(resource_id): +def get_resource_and_dataset(resource_id, api_key): """ Gets available information about the resource and its dataset from CKAN """ - res_dict = get_action('resource_show')(None, {'id': resource_id}) - pkg_dict = get_action('package_show')(None, {'id': res_dict['package_id']}) + user = model.Session.query(model.User).filter_by( + apikey=api_key).first() + if user is not None: + context = {'user': user.name} + else: + context = None + res_dict = get_action('resource_show')(context, {'id': resource_id}) + pkg_dict = get_action('package_show')(context, {'id': res_dict['package_id']}) return res_dict, pkg_dict @@ -495,7 +504,7 @@ def get_url(action, ckan_url): """ Get url for ckan action """ - if not urlparse.urlsplit(ckan_url).scheme: + if not urlsplit(ckan_url).scheme: ckan_url = 'http://' + ckan_url.lstrip('/') ckan_url = ckan_url.rstrip('/') return '{ckan_url}/api/3/action/{action}'.format( @@ -552,10 +561,10 @@ def emit(self, record): try: # Turn strings into unicode to stop SQLAlchemy # "Unicode type received non-unicode bind param value" warnings. - message = unicode(record.getMessage()) - level = unicode(record.levelname) - module = unicode(record.module) - funcName = unicode(record.funcName) + message = six.text_type(record.getMessage()) + level = six.text_type(record.levelname) + module = six.text_type(record.module) + funcName = six.text_type(record.funcName) conn.execute(db.LOGS_TABLE.insert().values( job_id=self.task_id, @@ -584,5 +593,5 @@ def printable_file_size(size_bytes): size_name = ('bytes', 'KB', 'MB', 'GB', 'TB') i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) - s = round(size_bytes / p, 1) + s = round(float(size_bytes) / p, 1) return "%s %s" % (s, size_name[i]) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 8cbd304e..e70730a6 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -1,16 +1,20 @@ 'Load a CSV into postgres' +from __future__ import absolute_import +from builtins import zip +from builtins import str import os import os.path import tempfile import itertools import csv +import six import psycopg2 import messytables from unidecode import unidecode import ckan.plugins as p -from job_exceptions import LoaderError, FileCouldNotBeLoadedError +from .job_exceptions import LoaderError, FileCouldNotBeLoadedError import ckan.plugins.toolkit as tk try: from ckan.plugins.toolkit import config @@ -61,17 +65,19 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): if not table_set.tables: raise LoaderError('Could not detect tabular data in this file') row_set = table_set.tables.pop() - header_offset, headers = messytables.headers_guess(row_set.sample) - + try: + header_offset, headers = messytables.headers_guess(row_set.sample) + except messytables.ReadError as e: + raise LoaderError('Messytables error: {}'.format(e)) # Some headers might have been converted from strings to floats and such. 
headers = encode_headers(headers) # Guess the delimiter used in the file - with open(csv_filepath, 'r') as f: + with open(csv_filepath, 'rb') as f: header_line = f.readline() try: sniffer = csv.Sniffer() - delimiter = sniffer.sniff(header_line).delimiter + delimiter = sniffer.sniff(six.ensure_text(header_line)).delimiter except csv.Error: logger.warning('Could not determine delimiter from file, use default ","') delimiter = ',' @@ -351,7 +357,7 @@ def row_iterator(): h['info'] = existing_info[h['id']] # create columns with types user requested type_override = existing_info[h['id']].get('type_override') - if type_override in _TYPE_MAPPING.values(): + if type_override in list(_TYPE_MAPPING.values()): h['type'] = type_override logger.info('Determined headers and types: {headers}'.format( diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index a595eba0..7efa8a17 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -43,10 +43,25 @@ class xloaderPlugin(plugins.SingletonPlugin): plugins.implements(plugins.IResourceUrlChange) plugins.implements(plugins.IActions) plugins.implements(plugins.IAuthFunctions) - plugins.implements(plugins.IRoutes, inherit=True) plugins.implements(plugins.ITemplateHelpers) plugins.implements(plugins.IResourceController, inherit=True) + if toolkit.check_ckan_version('2.9'): + plugins.implements(plugins.IBlueprint) + # IBlueprint + def get_blueprint(self): + from ckanext.xloader.views import get_blueprints + return get_blueprints() + else: + plugins.implements(plugins.IRoutes, inherit=True) + # IRoutes + def before_map(self, m): + m.connect( + 'xloader.resource_data', '/dataset/{id}/resource_data/{resource_id}', + controller='ckanext.xloader.controllers:ResourceDataController', + action='resource_data', ckan_icon='cloud-upload') + return m + # IResourceController def before_show(self, resource_dict): @@ -112,13 +127,13 @@ def notify(self, entity, operation=None): 'would be circular.'.format(r=entity)) return - # try: - # task = p.toolkit.get_action('task_status_show')( - # context, { - # 'entity_id': entity.id, - # 'task_type': 'datapusher', - # 'key': 'datapusher'} - # ) + try: + task = p.toolkit.get_action('task_status_show')( + context, { + 'entity_id': entity.id, + 'task_type': 'xloader', + 'key': 'xloader'} + ) # if task.get('state') == 'pending': # # There already is a pending DataPusher submission, # # skip this one ... 
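The IBlueprint/IRoutes split earlier in plugin.py relies on plugins.implements() and def being ordinary statements that can run inside an if at class-body level, so the interface is chosen once, at import time, based on the CKAN version. A stripped-down sketch of the pattern (plugin and blueprint names are placeholders):

    import ckan.plugins as plugins
    import ckan.plugins.toolkit as toolkit

    class ExamplePlugin(plugins.SingletonPlugin):
        if toolkit.check_ckan_version('2.9'):
            # CKAN >= 2.9 serves requests through Flask.
            plugins.implements(plugins.IBlueprint)

            def get_blueprint(self):
                from flask import Blueprint
                return [Blueprint('example', __name__)]
        else:
            # Older, Pylons-based releases still use route maps.
            plugins.implements(plugins.IRoutes, inherit=True)

            def before_map(self, route_map):
                return route_map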
@@ -126,8 +141,8 @@ def notify(self, entity, operation=None): # 'Skipping DataPusher submission for ' # 'resource {0}'.format(entity.id)) # return - # except p.toolkit.ObjectNotFound: - # pass + except p.toolkit.ObjectNotFound: + pass try: log.debug('Submitting resource {0} to be xloadered' @@ -159,15 +174,6 @@ def get_auth_functions(self): 'xloader_status': auth.xloader_status, } - # IRoutes - - def before_map(self, m): - m.connect( - 'resource_data_xloader', '/dataset/{id}/resource_data/{resource_id}', - controller='ckanext.xloader.controllers:ResourceDataController', - action='resource_data', ckan_icon='cloud-upload') - return m - # ITemplateHelpers def get_helpers(self): diff --git a/ckanext/xloader/schema.py b/ckanext/xloader/schema.py index ad1029b1..773fae5a 100644 --- a/ckanext/xloader/schema.py +++ b/ckanext/xloader/schema.py @@ -16,7 +16,7 @@ def xloader_submit_schema(): schema = { - 'resource_id': [not_missing, not_empty, unicode], + 'resource_id': [not_missing, not_empty, str], 'id': [ignore_missing], 'set_url_type': [ignore_missing, boolean_validator], 'ignore_hash': [ignore_missing, boolean_validator], diff --git a/ckanext/xloader/templates-bs2/package/resource_edit_base.html b/ckanext/xloader/templates-bs2/package/resource_edit_base.html index 73b6f776..34403521 100644 --- a/ckanext/xloader/templates-bs2/package/resource_edit_base.html +++ b/ckanext/xloader/templates-bs2/package/resource_edit_base.html @@ -1,6 +1,6 @@ {% ckan_extends %} {% block inner_primary_nav %} - {{ super() }} - {{ h.build_nav_icon('resource_data_xloader', _('DataStore'), id=pkg.name, resource_id=res.id) }} + {{ super() }} + {{ h.build_nav_icon('xloader.resource_data', _('DataStore'), id=pkg.name, resource_id=res.id) }} {% endblock %} diff --git a/ckanext/xloader/templates-bs2/xloader/resource_data.html b/ckanext/xloader/templates-bs2/xloader/resource_data.html index 698ea85e..ace37859 100644 --- a/ckanext/xloader/templates-bs2/xloader/resource_data.html +++ b/ckanext/xloader/templates-bs2/xloader/resource_data.html @@ -25,7 +25,7 @@ {{ _('Error:') }} {{ status.task_info.error }} {% elif status.task_info.error is mapping %} {{ _('Error:') }} {{ status.task_info.error.message }} - {% for error_key, error_value in status.task_info.error.iteritems() %} + {% for error_key, error_value in status.task_info.error.items() %} {% if error_key != "message" and error_value %}
          {{ error_key }}:
@@ -73,7 +73,7 @@
        {{ _('Upload Log') }}
      {% endfor %}
        {{ h.time_ago_from_timestamp(item.timestamp) }}
-       {{ _('Details') }}
+       {{ _('Details') }}
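The templates now reverse the Flask-style endpoint name 'xloader.resource_data'. The views module that registers it is not included in this diff; assuming it reuses the same utils.resource_data helper the slimmed-down controller delegates to, get_blueprints() plausibly looks like this sketch:

    from flask import Blueprint

    import ckanext.xloader.utils as utils

    xloader = Blueprint('xloader', __name__)

    def resource_data(id, resource_id):
        return utils.resource_data(id, resource_id)

    # The endpoint defaults to the view name, giving 'xloader.resource_data'.
    xloader.add_url_rule(
        '/dataset/<id>/resource_data/<resource_id>',
        view_func=resource_data, methods=('GET', 'POST'))

    def get_blueprints():
        return [xloader]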
diff --git a/ckanext/xloader/templates/package/resource_edit_base.html b/ckanext/xloader/templates/package/resource_edit_base.html index 73b6f776..34403521 100644 --- a/ckanext/xloader/templates/package/resource_edit_base.html +++ b/ckanext/xloader/templates/package/resource_edit_base.html @@ -1,6 +1,6 @@ {% ckan_extends %} {% block inner_primary_nav %} - {{ super() }} - {{ h.build_nav_icon('resource_data_xloader', _('DataStore'), id=pkg.name, resource_id=res.id) }} + {{ super() }} + {{ h.build_nav_icon('xloader.resource_data', _('DataStore'), id=pkg.name, resource_id=res.id) }} {% endblock %} diff --git a/ckanext/xloader/templates/xloader/resource_data.html b/ckanext/xloader/templates/xloader/resource_data.html index 55a71be4..54698b51 100644 --- a/ckanext/xloader/templates/xloader/resource_data.html +++ b/ckanext/xloader/templates/xloader/resource_data.html @@ -4,7 +4,7 @@ {% block primary_content_inner %} - {% set action = h.url_for(controller='ckanext.xloader.controllers:ResourceDataController', action='resource_data', id=pkg.name, resource_id=res.id) %} + {% set action = h.url_for('xloader.resource_data', id=pkg.name, resource_id=res.id) %} {% set show_table = true %}
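The action URL above is built the same way: 'xloader.resource_data' is a Flask 'blueprint.view' endpoint name, where the removed form named a Pylons controller class and action. For comparison, a hypothetical helper reversing the same endpoint from Python, assuming the blueprint sketched earlier is loaded:

    from ckan.plugins import toolkit

    def data_page_url(pkg_name, res_id):
        # Flask-style endpoint name replaces the removed
        # controller=... / action=... pair.
        return toolkit.h.url_for(
            'xloader.resource_data', id=pkg_name, resource_id=res_id)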
@@ -25,7 +25,7 @@ {{ _('Error:') }} {{ status.task_info.error }} {% elif status.task_info.error is mapping %} {{ _('Error:') }} {{ status.task_info.error.message }} - {% for error_key, error_value in status.task_info.error.iteritems() %} + {% for error_key, error_value in status.task_info.error.items() %} {% if error_key != "message" and error_value %}
          {{ error_key }}:
@@ -73,7 +73,7 @@
        {{ _('Upload Log') }}
      {% endfor %}
        {{ h.time_ago_from_timestamp(item.timestamp) }}
-       {{ _('Details') }}
+       {{ _('Details') }}
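The iteritems-to-items change in both resource_data.html templates is the standard dict migration: iteritems() no longer exists on Python 3 dicts, while items() works on both versions (a list on 2, a view on 3), and Jinja2 dispatches the call straight to the underlying dict. For example:

    error = {'message': 'load failed', 'line': 7}

    # Works on Python 2 and 3; error.iteritems() would raise
    # AttributeError under Python 3.
    for error_key, error_value in error.items():
        if error_key != 'message' and error_value:
            print('{}: {}'.format(error_key, error_value))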
diff --git a/ckanext/xloader/tests/ckan_setup.py b/ckanext/xloader/tests/ckan_setup.py new file mode 100644 index 00000000..a953c22f --- /dev/null +++ b/ckanext/xloader/tests/ckan_setup.py @@ -0,0 +1,38 @@ +try: + from ckan.tests.pytest_ckan.ckan_setup import * +except ImportError: + from ckan.config.middleware import make_app + from ckan.common import config + + import pkg_resources + from paste.deploy import loadapp + import sys + import os + + import pylons + from pylons.i18n.translation import _get_translator + + def pytest_addoption(parser): + """Allow using custom config file during tests. + """ + parser.addoption(u"--ckan-ini", action=u"store") + + def pytest_sessionstart(session): + """Initialize CKAN environment. + """ + global pylonsapp + path = os.getcwd() + sys.path.insert(0, path) + pkg_resources.working_set.add_entry(path) + pylonsapp = loadapp( + "config:" + session.config.option.ckan_ini, relative_to=path, + ) + + # Initialize a translator for tests that utilize i18n + translator = _get_translator(pylons.config.get("lang")) + pylons.translator._push_object(translator) + + class FakeResponse: + headers = {} # because render wants to delete Pragma + + pylons.response._push_object(FakeResponse) diff --git a/ckanext/xloader/tests/fixtures.py b/ckanext/xloader/tests/fixtures.py new file mode 100644 index 00000000..b7d7b105 --- /dev/null +++ b/ckanext/xloader/tests/fixtures.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- +import sqlalchemy +import sqlalchemy.orm as orm +import os + +from ckan.tests import helpers +from ckanext.datastore.tests import helpers as datastore_helpers +from ckanext.xloader.loader import get_write_engine + +__location__ = os.path.realpath( + os.path.join(os.getcwd(), os.path.dirname(__file__)) +) + +try: + from ckan.tests.pytest_ckan.fixtures import * + +except ImportError: + import pytest + + import ckan.tests.helpers as test_helpers + import ckan.plugins + import ckan.lib.search as search + + from ckan.common import config + + @pytest.fixture + def ckan_config(request, monkeypatch): + """Allows to override the configuration object used by tests + + Takes into account config patches introduced by the ``ckan_config`` + mark. + + If you just want to set one or more configuration options for the + scope of a test (or a test class), use the ``ckan_config`` mark:: + + @pytest.mark.ckan_config('ckan.auth.create_unowned_dataset', True) + def test_auth_create_unowned_dataset(): + + # ... + + To use the custom config inside a test, apply the + ``ckan_config`` mark to it and inject the ``ckan_config`` fixture: + + .. literalinclude:: /../ckan/tests/pytest_ckan/test_fixtures.py + :start-after: # START-CONFIG-OVERRIDE + :end-before: # END-CONFIG-OVERRIDE + + If the change only needs to be applied locally, use the + ``monkeypatch`` fixture + + .. literalinclude:: /../ckan/tests/test_common.py + :start-after: # START-CONFIG-OVERRIDE + :end-before: # END-CONFIG-OVERRIDE + + """ + _original = config.copy() + for mark in request.node.iter_markers(u"ckan_config"): + monkeypatch.setitem(config, *mark.args) + yield config + config.clear() + config.update(_original) + + @pytest.fixture + def make_app(ckan_config): + """Factory for client app instances. + + Unless you need to create app instances lazily for some reason, + use the ``app`` fixture instead. 
+ """ + return test_helpers._get_test_app + + @pytest.fixture + def app(make_app): + """Returns a client app instance to use in functional tests + + To use it, just add the ``app`` parameter to your test function signature:: + + def test_dataset_search(self, app): + + url = h.url_for('dataset.search') + + response = app.get(url) + + + """ + return make_app() + + @pytest.fixture(scope=u"session") + def reset_db(): + """Callable for resetting the database to the initial state. + + If possible use the ``clean_db`` fixture instead. + + """ + return test_helpers.reset_db + + @pytest.fixture(scope=u"session") + def reset_index(): + """Callable for cleaning search index. + + If possible use the ``clean_index`` fixture instead. + """ + return search.clear_all + + @pytest.fixture + def clean_db(reset_db): + """Resets the database to the initial state. + + This can be used either for all tests in a class:: + + @pytest.mark.usefixtures("clean_db") + class TestExample(object): + + def test_example(self): + + or for a single test:: + + class TestExample(object): + + @pytest.mark.usefixtures("clean_db") + def test_example(self): + + """ + reset_db() + + @pytest.fixture + def clean_index(reset_index): + """Clear search index before starting the test. + """ + reset_index() + + @pytest.fixture + def with_plugins(ckan_config): + """Load all plugins specified by the ``ckan.plugins`` config option + at the beginning of the test. When the test ends (even it fails), it will + unload all the plugins in the reverse order. + + .. literalinclude:: /../ckan/tests/test_factories.py + :start-after: # START-CONFIG-OVERRIDE + :end-before: # END-CONFIG-OVERRIDE + + """ + plugins = ckan_config["ckan.plugins"].split() + for plugin in plugins: + if not ckan.plugins.plugin_loaded(plugin): + ckan.plugins.load(plugin) + yield + for plugin in reversed(plugins): + if ckan.plugins.plugin_loaded(plugin): + ckan.plugins.unload(plugin) + + @pytest.fixture + def test_request_context(app): + """Provide function for creating Flask request context. 
+ """ + return app.flask_app.test_request_context + + @pytest.fixture + def with_request_context(test_request_context): + """Execute test inside requests context + """ + with test_request_context(): + yield + + +def reset_datastore_db(): + engine = get_write_engine() + Session = orm.scoped_session(orm.sessionmaker(bind=engine)) + datastore_helpers.clear_db(Session) + + +def add_full_text_trigger_function(): + engine = get_write_engine() + Session = orm.scoped_session(orm.sessionmaker(bind=engine)) + c = Session.connection() + with open( + os.path.join(__location__, "..", "..", "..", "full_text_function.sql"), + "r", + ) as full_text_sql: + c.execute(sqlalchemy.text(full_text_sql.read())) + Session.commit() + Session.remove() + + +@pytest.fixture() +def full_reset(reset_db): + reset_db() + reset_datastore_db() + add_full_text_trigger_function() diff --git a/ckanext/xloader/tests/test_action.py b/ckanext/xloader/tests/test_action.py index 53064c72..00219446 100644 --- a/ckanext/xloader/tests/test_action.py +++ b/ckanext/xloader/tests/test_action.py @@ -1,92 +1,82 @@ -from nose.tools import eq_ +import pytest import mock import ckan.plugins as p -try: - from ckan.tests import helpers, factories -except ImportError: - # older ckans - from ckan.new_tests import helpers, factories +from ckan.tests import helpers, factories -class TestAction(): - - @classmethod - def setup_class(cls): - if not p.plugin_loaded('datastore'): - p.load('datastore') - if not p.plugin_loaded('xloader'): - p.load('xloader') - - helpers.reset_db() - - @classmethod - def teardown_class(cls): - - p.unload('xloader') - p.unload('datastore') - - helpers.reset_db() - +@pytest.mark.usefixtures("clean_db", "with_plugins") +@pytest.mark.ckan_config("ckan.plugins", "datastore xloader") +class TestAction(object): def test_submit(self): # checks that xloader_submit enqueues the resource (to be xloadered) user = factories.User() # normally creating a resource causes xloader_submit to be called, # but we avoid that by setting an invalid format - res = factories.Resource(user=user, format='aaa') + res = factories.Resource(user=user, format="aaa") # mock the enqueue - with mock.patch('ckanext.xloader.action.enqueue_job', - return_value=mock.MagicMock(id=123)) as enqueue_mock: + with mock.patch( + "ckanext.xloader.action.enqueue_job", + return_value=mock.MagicMock(id=123), + ) as enqueue_mock: helpers.call_action( - 'xloader_submit', context=dict(user=user['name']), - resource_id=res['id']) - eq_(1, enqueue_mock.call_count) + "xloader_submit", + context=dict(user=user["name"]), + resource_id=res["id"], + ) + assert 1 == enqueue_mock.call_count def test_duplicated_submits(self): def submit(res, user): return helpers.call_action( - 'xloader_submit', context=dict(user=user['name']), - resource_id=res['id']) + "xloader_submit", + context=dict(user=user["name"]), + resource_id=res["id"], + ) user = factories.User() - with mock.patch('ckanext.xloader.action.enqueue_job', - return_value=mock.MagicMock(id=123)) as enqueue_mock: + with mock.patch( + "ckanext.xloader.action.enqueue_job", + return_value=mock.MagicMock(id=123), + ) as enqueue_mock: enqueue_mock.reset_mock() # creating the resource causes it to be queued - res = factories.Resource(user=user, format='csv') - eq_(1, enqueue_mock.call_count) - + res = factories.Resource(user=user, format="csv") + assert 1 == enqueue_mock.call_count # a second request to queue it will be stopped, because of the # existing task for this resource - shown by task_status_show submit(res, user) - eq_(1, 
enqueue_mock.call_count) + assert 1 == enqueue_mock.call_count def test_xloader_hook(self): # Check the task_status is stored correctly after a xloader job. user = factories.User() - res = factories.Resource(user=user, format='csv') + res = factories.Resource(user=user, format="csv") task_status = helpers.call_action( - 'task_status_update', context={}, - entity_id=res['id'], - entity_type='resource', - task_type='xloader', - key='xloader', - value='{}', - error='{}', - state='pending', + "task_status_update", + context={}, + entity_id=res["id"], + entity_type="resource", + task_type="xloader", + key="xloader", + value="{}", + error="{}", + state="pending", ) helpers.call_action( - 'xloader_hook', context=dict(user=user['name']), - metadata={'resource_id': res['id']}, - status='complete', - ) + "xloader_hook", + context=dict(user=user["name"]), + metadata={"resource_id": res["id"]}, + status="complete", + ) task_status = helpers.call_action( - 'task_status_show', context={}, - entity_id=res['id'], - task_type='xloader', - key='xloader', + "task_status_show", + context={}, + entity_id=res["id"], + task_type="xloader", + key="xloader", ) - eq_(task_status['state'], 'complete') + assert task_status["state"] == "complete" diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index 52d238a1..cf2ae40f 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -1,20 +1,21 @@ +from __future__ import absolute_import import os import json import random import datetime import time -try: - from collections import OrderedDict # from python 2.7 -except ImportError: - from sqlalchemy.util import OrderedDict +import six +from collections import OrderedDict # from python 2.7 +import pytest -from nose.tools import eq_, make_decorator, assert_in +from nose.tools import make_decorator import mock import responses from sqlalchemy import MetaData, Table from sqlalchemy.sql import select import ckan.plugins as p + try: config = p.toolkit.config except AttributeError: @@ -23,47 +24,51 @@ from ckanext.xloader import jobs from ckanext.xloader import db as jobs_db from ckanext.xloader.loader import get_write_engine -import util -try: - from ckan.tests import helpers, factories -except ImportError: - # older ckans - from ckan.new_tests import helpers, factories -SOURCE_URL = 'http://www.example.com/static/file' +from ckan.tests import helpers, factories + +SOURCE_URL = "http://www.example.com/static/file" def mock_actions(func): - ''' + """ Decorator that mocks actions used by these tests Based on ckan.test.helpers.mock_action - ''' + """ + def wrapper(*args, **kwargs): # Mock CKAN's resource_show API from ckan.logic import get_action as original_get_action def side_effect(called_action_name): - if called_action_name == 'resource_show': + if called_action_name == "resource_show": + def mock_resource_show(context, data_dict): return { - 'id': data_dict['id'], - 'name': 'short name', - 'url': SOURCE_URL, - 'format': '', - 'package_id': 'test-pkg', + "id": data_dict["id"], + "name": "short name", + "url": SOURCE_URL, + "format": "", + "package_id": "test-pkg", } + return mock_resource_show - elif called_action_name == 'package_show': + elif called_action_name == "package_show": + def mock_package_show(context, data_dict): return { - 'id': data_dict['id'], - 'name': 'pkg-name', + "id": data_dict["id"], + "name": "pkg-name", } + return mock_package_show else: return original_get_action(called_action_name) + try: - with 
mock.patch('ckanext.xloader.jobs.get_action') as mock_get_action: + with mock.patch( + "ckanext.xloader.jobs.get_action" + ) as mock_get_action: mock_get_action.side_effect = side_effect return_value = func(*args, **kwargs) @@ -76,30 +81,29 @@ def mock_package_show(context, data_dict): return make_decorator(func)(wrapper) -class TestxloaderDataIntoDatastore(util.PluginsMixin): - _load_plugins = ['datastore'] +@pytest.mark.skip +@pytest.mark.usefixtures("full_reset", "with_plugins") +@pytest.mark.ckan_config("ckan.plugins", "datastore xloader") +class TestxloaderDataIntoDatastore(object): - @classmethod - def setup_class(cls): - super(TestxloaderDataIntoDatastore, cls).setup_class() - cls.host = 'www.ckan.org' - cls.api_key = 'my-fake-key' - cls.resource_id = 'foo-bar-42' - factories.Resource(id=cls.resource_id) + @pytest.fixture(autouse=True) + def setup_class(self): + self.host = "www.ckan.org" + self.api_key = "my-fake-key" + self.resource_id = "foo-bar-42" + res = factories.Resource(id=self.resource_id) jobs_db.init(config, echo=False) # drop test table - engine, conn = cls.get_datastore_engine_and_connection() - conn.execute('DROP TABLE IF EXISTS "{}"'.format(cls.resource_id)) - - @classmethod - def teardown_class(cls): - super(TestxloaderDataIntoDatastore, cls).teardown_class() - if '_datastore' in dir(cls): - connection = cls._datastore[1] + engine, conn = self.get_datastore_engine_and_connection() + conn.execute('DROP TABLE IF EXISTS "{}"'.format(self.resource_id)) + yield + if "_datastore" in dir(self): + connection = self._datastore[1] connection.close() - def register_urls(self, filename='simple.csv', - content_type='application/csv'): + def register_urls( + self, filename="simple.csv", content_type="application/csv" + ): """Mock some test URLs with responses. Mocks some URLs related to a data file and a CKAN resource that @@ -110,37 +114,50 @@ def register_urls(self, filename='simple.csv', resource_show URL for the resource that contains the data file """ - responses.add_passthru(config['solr_url']) + responses.add_passthru(config["solr_url"]) # A URL that just returns a static file - responses.add(responses.GET, SOURCE_URL, - body=get_sample_file(filename), - content_type=content_type) + responses.add( + responses.GET, + SOURCE_URL, + body=get_sample_file(filename), + content_type=content_type, + ) # A URL that mocks the response that CKAN's resource_update API would # give after successfully updating a resource. resource_update_url = ( - 'http://www.ckan.org/api/3/action/resource_update') - responses.add(responses.POST, resource_update_url, - body=json.dumps({'success': True}), - content_type='application/json') + "http://www.ckan.org/api/3/action/resource_update" + ) + responses.add( + responses.POST, + resource_update_url, + body=json.dumps({"success": True}), + content_type="application/json", + ) # A URL that mock's the response that CKAN's datastore plugin's # datastore_delete API would give after successfully deleting a # resource from the datastore. 
- datastore_del_url = 'http://www.ckan.org/api/3/action/datastore_delete' - responses.add(responses.POST, datastore_del_url, - body=json.dumps({'success': True}), - content_type='application/json') + datastore_del_url = "http://www.ckan.org/api/3/action/datastore_delete" + responses.add( + responses.POST, + datastore_del_url, + body=json.dumps({"success": True}), + content_type="application/json", + ) - self.callback_url = 'http://www.ckan.org/api/3/action/xloader_hook' - responses.add(responses.POST, self.callback_url, - body=json.dumps({'success': True}), - content_type='application/json') + self.callback_url = "http://www.ckan.org/api/3/action/xloader_hook" + responses.add( + responses.POST, + self.callback_url, + body=json.dumps({"success": True}), + content_type="application/json", + ) @classmethod def get_datastore_engine_and_connection(cls): - if '_datastore' not in dir(cls): + if "_datastore" not in dir(cls): engine = get_write_engine() conn = engine.connect() cls._datastore = (engine, conn) @@ -149,24 +166,29 @@ def get_datastore_engine_and_connection(cls): def get_datastore_table(self): engine, conn = self.get_datastore_engine_and_connection() meta = MetaData(bind=engine, reflect=True) - table = Table(self.resource_id, meta, - autoload=True, autoload_with=engine) + table = Table( + self.resource_id, meta, autoload=True, autoload_with=engine + ) s = select([table]) with conn.begin(): result = conn.execute(s) return dict( num_rows=result.rowcount, - headers=result.keys(), - header_dict=OrderedDict([(c.key, str(c.type)) - for c in table.columns]), + headers=list(result.keys()), + header_dict=OrderedDict( + [(c.key, six.text_type(c.type)) for c in table.columns] + ), rows=result.fetchall(), ) def get_load_logs(self, task_id): conn = jobs_db.ENGINE.connect() logs = jobs_db.LOGS_TABLE - result = conn.execute(select([logs.c.level, logs.c.message]) - .where(logs.c.job_id == task_id)) + result = conn.execute( + select([logs.c.level, logs.c.message]).where( + logs.c.job_id == task_id + ) + ) return Logs(result.fetchall()) def get_time_of_last_analyze(self): @@ -177,11 +199,14 @@ def get_time_of_last_analyze(self): time.sleep(1) engine, conn = self.get_datastore_engine_and_connection() result = conn.execute( - ''' + """ SELECT last_analyze, last_autoanalyze FROM pg_stat_user_tables WHERE relname='{}'; - '''.format(self.resource_id)) + """.format( + self.resource_id + ) + ) last_analyze_datetimes = result.fetchall()[0] return max([x for x in last_analyze_datetimes if x] or [None]) @@ -189,382 +214,457 @@ def get_time_of_last_analyze(self): @responses.activate def test_simple_csv(self): # Test not only the load and xloader_hook is called at the end - self.register_urls(filename='simple.csv') + self.register_urls(filename="simple.csv") data = { - 'api_key': self.api_key, - 'job_type': 'xloader_to_datastore', - 'result_url': self.callback_url, - 'metadata': { - 'ckan_url': 'http://%s/' % self.host, - 'resource_id': self.resource_id - } + "api_key": self.api_key, + "job_type": "xloader_to_datastore", + "result_url": self.callback_url, + "metadata": { + "ckan_url": "http://%s/" % self.host, + "resource_id": self.resource_id, + }, } - job_id = 'test{}'.format(random.randint(0, 1e5)) + job_id = "test{}".format(random.randint(0, 1e5)) - with mock.patch('ckanext.xloader.jobs.set_resource_metadata') \ - as mocked_set_resource_metadata: + with mock.patch( + "ckanext.xloader.jobs.set_resource_metadata" + ) as mocked_set_resource_metadata: # in tests we call jobs directly, rather than use rq, so mock # 
get_current_job() - with mock.patch('ckanext.xloader.jobs.get_current_job', - return_value=mock.Mock(id=job_id)): + with mock.patch( + "ckanext.xloader.jobs.get_current_job", + return_value=mock.Mock(id=job_id), + ): result = jobs.xloader_data_into_datastore(data) - assert result is None, jobs_db.get_job(job_id)['error']['message'] + assert result is None, jobs_db.get_job(job_id)["error"]["message"] # Check it said it was successful - eq_(responses.calls[-1].request.url, 'http://www.ckan.org/api/3/action/xloader_hook') + assert ( + responses.calls[-1].request.url + == "http://www.ckan.org/api/3/action/xloader_hook" + ) job_dict = json.loads(responses.calls[-1].request.body) - assert job_dict['status'] == u'complete', job_dict - eq_(job_dict, - {u'metadata': {u'datastore_contains_all_records_of_source_file': True, - u'datastore_active': True, - u'ckan_url': u'http://www.ckan.org/', - u'resource_id': u'foo-bar-42'}, - u'status': u'complete'}) + assert job_dict["status"] == u"complete", job_dict + assert job_dict == { + u"metadata": { + u"datastore_contains_all_records_of_source_file": True, + u"datastore_active": True, + u"ckan_url": u"http://www.ckan.org/", + u"resource_id": u"foo-bar-42", + }, + u"status": u"complete", + } # Check the load data = self.get_datastore_table() - eq_(data['headers'], - ['_id', '_full_text', 'date', 'temperature', 'place']) - eq_(data['header_dict']['date'], 'TEXT') + assert data["headers"] == [ + "_id", + "_full_text", + "date", + "temperature", + "place", + ] + assert data["header_dict"]["date"] == "TEXT" # 'TIMESTAMP WITHOUT TIME ZONE') - eq_(data['header_dict']['temperature'], 'TEXT') # 'NUMERIC') - eq_(data['header_dict']['place'], 'TEXT') # 'TEXT') - eq_(data['num_rows'], 6) - eq_(data['rows'][0][2:], - (u'2011-01-01', u'1', u'Galway')) + assert data["header_dict"]["temperature"] == "TEXT" # 'NUMERIC') + assert data["header_dict"]["place"] == "TEXT" # 'TEXT') + assert data["num_rows"] == 6 + assert data["rows"][0][2:] == (u"2011-01-01", u"1", u"Galway") # (datetime.datetime(2011, 1, 1), 1, 'Galway')) # Check it wanted to set the datastore_active=True mocked_set_resource_metadata.assert_called_once() - eq_(mocked_set_resource_metadata.call_args[1]['update_dict'], - {'datastore_contains_all_records_of_source_file': True, - 'datastore_active': True, - 'ckan_url': 'http://www.ckan.org/', - 'resource_id': 'foo-bar-42'}) + assert mocked_set_resource_metadata.call_args[1]["update_dict"] == { + "datastore_contains_all_records_of_source_file": True, + "datastore_active": True, + "ckan_url": "http://www.ckan.org/", + "resource_id": "foo-bar-42", + } logs = self.get_load_logs(job_id) logs.assert_no_errors() job = jobs_db.get_job(job_id) - eq_(job['status'], u'complete') - eq_(job['error'], None) + assert job["status"] == u"complete" + assert job["error"] == None # Check ANALYZE was run last_analyze = self.get_time_of_last_analyze() - assert(last_analyze) + assert last_analyze @mock_actions @responses.activate - @mock.patch('ckanext.xloader.jobs.MAX_CONTENT_LENGTH', 10000) - @mock.patch('ckanext.xloader.jobs.MAX_EXCERPT_LINES', 100) + @mock.patch("ckanext.xloader.jobs.MAX_CONTENT_LENGTH", 10000) + @mock.patch("ckanext.xloader.jobs.MAX_EXCERPT_LINES", 100) def test_too_large_csv(self): # Test not only the load and xloader_hook is called at the end - self.register_urls(filename='simple-large.csv') + self.register_urls(filename="simple-large.csv") data = { - 'api_key': self.api_key, - 'job_type': 'xloader_to_datastore', - 'result_url': self.callback_url, - 'metadata': { 
- 'ckan_url': 'http://%s/' % self.host, - 'resource_id': self.resource_id - } + "api_key": self.api_key, + "job_type": "xloader_to_datastore", + "result_url": self.callback_url, + "metadata": { + "ckan_url": "http://%s/" % self.host, + "resource_id": self.resource_id, + }, } - job_id = 'test{}'.format(random.randint(0, 1e5)) + job_id = "test{}".format(random.randint(0, 1e5)) - with mock.patch('ckanext.xloader.jobs.set_resource_metadata') \ - as mocked_set_resource_metadata: + with mock.patch( + "ckanext.xloader.jobs.set_resource_metadata" + ) as mocked_set_resource_metadata: # in tests we call jobs directly, rather than use rq, so mock # get_current_job() - with mock.patch('ckanext.xloader.jobs.get_current_job', - return_value=mock.Mock(id=job_id)): + with mock.patch( + "ckanext.xloader.jobs.get_current_job", + return_value=mock.Mock(id=job_id), + ): result = jobs.xloader_data_into_datastore(data) - assert result is None, jobs_db.get_job(job_id)['error']['message'] + assert result is None, jobs_db.get_job(job_id)["error"]["message"] # Check it said it was successful - eq_(responses.calls[-1].request.url, 'http://www.ckan.org/api/3/action/xloader_hook') + assert ( + responses.calls[-1].request.url + == "http://www.ckan.org/api/3/action/xloader_hook" + ) job_dict = json.loads(responses.calls[-1].request.body) - assert job_dict['status'] == u'complete', job_dict - eq_(job_dict, - {u'metadata': {u'datastore_contains_all_records_of_source_file': False, - u'datastore_active': True, - u'ckan_url': u'http://www.ckan.org/', - u'resource_id': u'foo-bar-42'}, - u'status': u'complete'}) + assert job_dict["status"] == u"complete", job_dict + assert job_dict == { + u"metadata": { + u"datastore_contains_all_records_of_source_file": False, + u"datastore_active": True, + u"ckan_url": u"http://www.ckan.org/", + u"resource_id": u"foo-bar-42", + }, + u"status": u"complete", + } # Check the load data = self.get_datastore_table() - eq_(data['headers'], - ['_id', '_full_text', 'id', 'text']) - eq_(data['header_dict']['id'], 'TEXT') + assert data["headers"] == ["_id", "_full_text", "id", "text"] + assert data["header_dict"]["id"] == "TEXT" # 'TIMESTAMP WITHOUT TIME ZONE') - eq_(data['header_dict']['text'], 'TEXT') - assert data['num_rows'] <= 100 - assert data['num_rows'] > 0 - eq_(data['rows'][0][2:], - (u'1', u'a')) + assert data["header_dict"]["text"] == "TEXT" + assert data["num_rows"] <= 100 + assert data["num_rows"] > 0 + assert data["rows"][0][2:] == (u"1", u"a") # Check it wanted to set the datastore_active=True mocked_set_resource_metadata.assert_called_once() - eq_(mocked_set_resource_metadata.call_args[1]['update_dict'], - {'datastore_contains_all_records_of_source_file': False, - 'datastore_active': True, - 'ckan_url': 'http://www.ckan.org/', - 'resource_id': 'foo-bar-42'}) + assert mocked_set_resource_metadata.call_args[1]["update_dict"] == { + "datastore_contains_all_records_of_source_file": False, + "datastore_active": True, + "ckan_url": "http://www.ckan.org/", + "resource_id": "foo-bar-42", + } logs = self.get_load_logs(job_id) logs.assert_no_errors() job = jobs_db.get_job(job_id) - eq_(job['status'], u'complete') - eq_(job['error'], None) + assert job["status"] == u"complete" + assert job["error"] == None # Check ANALYZE was run last_analyze = self.get_time_of_last_analyze() - assert(last_analyze) + assert last_analyze @mock_actions @responses.activate - @mock.patch('ckanext.xloader.jobs.MAX_CONTENT_LENGTH', 10000) - @mock.patch('ckanext.xloader.jobs.MAX_EXCERPT_LINES', 100) + 
@mock.patch("ckanext.xloader.jobs.MAX_CONTENT_LENGTH", 10000) + @mock.patch("ckanext.xloader.jobs.MAX_EXCERPT_LINES", 100) def test_too_large_xls(self): # Test not only the load and xloader_hook is called at the end - self.register_urls(filename='simple-large.xls') + self.register_urls(filename="simple-large.xls") data = { - 'api_key': self.api_key, - 'job_type': 'xloader_to_datastore', - 'result_url': self.callback_url, - 'metadata': { - 'ckan_url': 'http://%s/' % self.host, - 'resource_id': self.resource_id - } + "api_key": self.api_key, + "job_type": "xloader_to_datastore", + "result_url": self.callback_url, + "metadata": { + "ckan_url": "http://%s/" % self.host, + "resource_id": self.resource_id, + }, } - job_id = 'test{}'.format(random.randint(0, 1e5)) + job_id = "test{}".format(random.randint(0, 1e5)) - with mock.patch('ckanext.xloader.jobs.set_resource_metadata'): + with mock.patch("ckanext.xloader.jobs.set_resource_metadata"): # in tests we call jobs directly, rather than use rq, so mock # get_current_job() - with mock.patch('ckanext.xloader.jobs.get_current_job', - return_value=mock.Mock(id=job_id)): + with mock.patch( + "ckanext.xloader.jobs.get_current_job", + return_value=mock.Mock(id=job_id), + ): result = jobs.xloader_data_into_datastore(data) - assert result is not None, jobs_db.get_job(job_id)['error']['message'] + assert result is not None, jobs_db.get_job(job_id)["error"]["message"] # Check it said it was successful - eq_(responses.calls[-1].request.url, - 'http://www.ckan.org/api/3/action/xloader_hook') + assert ( + responses.calls[-1].request.url + == "http://www.ckan.org/api/3/action/xloader_hook" + ) job_dict = json.loads(responses.calls[-1].request.body) - assert job_dict['status'] == u'error', job_dict - eq_(job_dict, - {u'status': u'error', - u'metadata': {u'ckan_url': u'http://www.ckan.org/', - u'datastore_contains_all_records_of_source_file': False, - u'resource_id': u'foo-bar-42'}, - u'error': u'Loading file raised an error: array index out of range'}) + assert job_dict["status"] == u"error", job_dict + assert job_dict == { + u"status": u"error", + u"metadata": { + u"ckan_url": u"http://www.ckan.org/", + u"datastore_contains_all_records_of_source_file": False, + u"resource_id": u"foo-bar-42", + }, + u"error": u"Loading file raised an error: array index out of range", + } job = jobs_db.get_job(job_id) - eq_(job['status'], u'error') - eq_(job['error'], {u'message': u'Loading file raised an error: array index out of range'}) + assert job["status"] == u"error" + assert job["error"] == { + u"message": u"Loading file raised an error: array index out of range" + } @mock_actions @responses.activate def test_messytables(self): # xloader's COPY can't handle xls, so it will be dealt with by # messytables - self.register_urls(filename='simple.xls', - content_type='application/vnd.ms-excel') + self.register_urls( + filename="simple.xls", content_type="application/vnd.ms-excel" + ) data = { - 'api_key': self.api_key, - 'job_type': 'xloader_to_datastore', - 'result_url': self.callback_url, - 'metadata': { - 'ckan_url': 'http://%s/' % self.host, - 'resource_id': self.resource_id - } + "api_key": self.api_key, + "job_type": "xloader_to_datastore", + "result_url": self.callback_url, + "metadata": { + "ckan_url": "http://%s/" % self.host, + "resource_id": self.resource_id, + }, } - job_id = 'test{}'.format(random.randint(0, 1e5)) + job_id = "test{}".format(random.randint(0, 1e5)) - with mock.patch('ckanext.xloader.jobs.set_resource_metadata') \ - as mocked_set_resource_metadata: + 
with mock.patch( + "ckanext.xloader.jobs.set_resource_metadata" + ) as mocked_set_resource_metadata: # in tests we call jobs directly, rather than use rq, so mock # get_current_job() - with mock.patch('ckanext.xloader.jobs.get_current_job', - return_value=mock.Mock(id=job_id)): + with mock.patch( + "ckanext.xloader.jobs.get_current_job", + return_value=mock.Mock(id=job_id), + ): result = jobs.xloader_data_into_datastore(data) - eq_(result, None) + assert result == None # Check it said it was successful - eq_(responses.calls[-1].request.url, 'http://www.ckan.org/api/3/action/xloader_hook') + assert ( + responses.calls[-1].request.url + == "http://www.ckan.org/api/3/action/xloader_hook" + ) job_dict = json.loads(responses.calls[-1].request.body) - assert job_dict['status'] == u'complete', job_dict - eq_(job_dict, - {u'metadata': {u'datastore_contains_all_records_of_source_file': True, - u'datastore_active': True, - u'ckan_url': u'http://www.ckan.org/', - u'resource_id': u'foo-bar-42'}, - u'status': u'complete'}) + assert job_dict["status"] == u"complete", job_dict + assert job_dict == { + u"metadata": { + u"datastore_contains_all_records_of_source_file": True, + u"datastore_active": True, + u"ckan_url": u"http://www.ckan.org/", + u"resource_id": u"foo-bar-42", + }, + u"status": u"complete", + } # Check the load data = self.get_datastore_table() - eq_(data['headers'], - ['_id', '_full_text', 'date', 'temperature', 'place']) - eq_(data['header_dict']['date'], 'TIMESTAMP WITHOUT TIME ZONE') - eq_(data['header_dict']['temperature'], 'NUMERIC') - eq_(data['header_dict']['place'], 'TEXT') - eq_(data['num_rows'], 6) - eq_(data['rows'][0][2:], - (datetime.datetime(2011, 1, 1), 1, u'Galway')) + assert data["headers"] == [ + "_id", + "_full_text", + "date", + "temperature", + "place", + ] + assert data["header_dict"]["date"] == "TIMESTAMP WITHOUT TIME ZONE" + assert data["header_dict"]["temperature"] == "NUMERIC" + assert data["header_dict"]["place"] == "TEXT" + assert data["num_rows"] == 6 + assert data["rows"][0][2:] == ( + datetime.datetime(2011, 1, 1), + 1, + u"Galway", + ) # Check it wanted to set the datastore_active=True mocked_set_resource_metadata.assert_called_once() - eq_(mocked_set_resource_metadata.call_args[1]['update_dict'], - {'ckan_url': 'http://www.ckan.org/', - 'datastore_contains_all_records_of_source_file': True, - 'datastore_active': True, - 'resource_id': 'foo-bar-42'}) + assert mocked_set_resource_metadata.call_args[1]["update_dict"] == { + "ckan_url": "http://www.ckan.org/", + "datastore_contains_all_records_of_source_file": True, + "datastore_active": True, + "resource_id": "foo-bar-42", + } # check logs have the error doing the COPY logs = self.get_load_logs(job_id) copy_error_index = None for i, log in enumerate(logs): - if log[0] == 'WARNING' and log[1].startswith('Load using COPY failed: Error during the load into PostgreSQL'): + if log[0] == "WARNING" and log[1].startswith( + "Load using COPY failed: Error during the load into PostgreSQL" + ): copy_error_index = i break - assert copy_error_index, 'Missing COPY error' + assert copy_error_index, "Missing COPY error" # check messytable portion of the logs - logs = Logs(logs[copy_error_index + 1:]) - eq_(logs[0], (u'INFO', u'Trying again with messytables')) + logs = Logs(logs[copy_error_index + 1 :]) + assert logs[0] == (u"INFO", u"Trying again with messytables") logs.assert_no_errors() # Check ANALYZE was run last_analyze = self.get_time_of_last_analyze() - assert(last_analyze) + assert last_analyze @mock_actions 
@responses.activate
     def test_umlaut_and_extra_comma(self):
-        self.register_urls(filename='umlaut_and_extra_comma.csv')
+        self.register_urls(filename="umlaut_and_extra_comma.csv")
         # This csv has an extra comma which causes the COPY to throw a
         # psycopg2.DataError and the umlaut can cause problems for logging the
         # error. We need to check that it correctly reverts to using
         # messytables to load it
         data = {
-            'api_key': self.api_key,
-            'job_type': 'xloader_to_datastore',
-            'result_url': self.callback_url,
-            'metadata': {
-                'ckan_url': 'http://%s/' % self.host,
-                'resource_id': self.resource_id
-            }
+            "api_key": self.api_key,
+            "job_type": "xloader_to_datastore",
+            "result_url": self.callback_url,
+            "metadata": {
+                "ckan_url": "http://%s/" % self.host,
+                "resource_id": self.resource_id,
+            },
         }
-        job_id = 'test{}'.format(random.randint(0, 1e5))
+        job_id = "test{}".format(random.randint(0, 1e5))

-        with mock.patch('ckanext.xloader.jobs.set_resource_metadata'):
+        with mock.patch("ckanext.xloader.jobs.set_resource_metadata"):
             # in tests we call jobs directly, rather than use rq, so mock
             # get_current_job()
-            with mock.patch('ckanext.xloader.jobs.get_current_job',
-                            return_value=mock.Mock(id=job_id)):
+            with mock.patch(
+                "ckanext.xloader.jobs.get_current_job",
+                return_value=mock.Mock(id=job_id),
+            ):
                 result = jobs.xloader_data_into_datastore(data)
-        assert result is None, jobs_db.get_job(job_id)['error']['message']
+        assert result is None, jobs_db.get_job(job_id)["error"]["message"]

         # Check it said it was successful
-        eq_(responses.calls[-1].request.url, 'http://www.ckan.org/api/3/action/xloader_hook')
+        assert (
+            responses.calls[-1].request.url
+            == "http://www.ckan.org/api/3/action/xloader_hook"
+        )
         job_dict = json.loads(responses.calls[-1].request.body)
-        assert job_dict['status'] == u'complete', job_dict
-        eq_(job_dict,
-            {u'metadata': {u'datastore_contains_all_records_of_source_file': True,
-                           u'datastore_active': True,
-                           u'ckan_url': u'http://www.ckan.org/',
-                           u'resource_id': u'foo-bar-42'},
-             u'status': u'complete'})
+        assert job_dict["status"] == u"complete", job_dict
+        assert job_dict == {
+            u"metadata": {
+                u"datastore_contains_all_records_of_source_file": True,
+                u"datastore_active": True,
+                u"ckan_url": u"http://www.ckan.org/",
+                u"resource_id": u"foo-bar-42",
+            },
+            u"status": u"complete",
+        }

         logs = self.get_load_logs(job_id)
         logs.assert_no_errors()

         job = jobs_db.get_job(job_id)
-        eq_(job['status'], u'complete')
-        eq_(job['error'], None)
+        assert job["status"] == u"complete"
+        assert job["error"] is None

     @mock_actions
     @responses.activate
     def test_first_request_is_202_pending_response(self):
         # when you first get the CSV it returns this 202 response, which is
         # what this server does: https://data-cdfw.opendata.arcgis.com/datasets
-        responses.add(responses.GET, SOURCE_URL,
-                      status=202,
-                      body='{"processingTime":"8.716 seconds","status":"Processing","generating":{}}',
-                      content_type='application/json')
+        responses.add(
+            responses.GET,
+            SOURCE_URL,
+            status=202,
+            body='{"processingTime":"8.716 seconds","status":"Processing","generating":{}}',
+            content_type="application/json",
+        )
         # subsequent GETs of the CSV work fine
         self.register_urls()
         data = {
-            'api_key': self.api_key,
-            'job_type': 'xloader_to_datastore',
-            'result_url': self.callback_url,
-            'metadata': {
-                'ckan_url': 'http://%s/' % self.host,
-                'resource_id': self.resource_id
-            }
+            "api_key": self.api_key,
+            "job_type": "xloader_to_datastore",
+            "result_url": self.callback_url,
+            "metadata": {
+                "ckan_url": "http://%s/" %
self.host, + "resource_id": self.resource_id, + }, } - job_id = 'test{}'.format(random.randint(0, 1e5)) + job_id = "test{}".format(random.randint(0, 1e5)) - with mock.patch('ckanext.xloader.jobs.set_resource_metadata') \ - as mocked_set_resource_metadata: + with mock.patch( + "ckanext.xloader.jobs.set_resource_metadata" + ) as mocked_set_resource_metadata: # in tests we call jobs directly, rather than use rq, so mock # get_current_job() - with mock.patch('ckanext.xloader.jobs.get_current_job', - return_value=mock.Mock(id=job_id)): + with mock.patch( + "ckanext.xloader.jobs.get_current_job", + return_value=mock.Mock(id=job_id), + ): result = jobs.xloader_data_into_datastore(data) - assert result is None, jobs_db.get_job(job_id)['error']['message'] + assert result is None, jobs_db.get_job(job_id)["error"]["message"] # Check it said it was successful - eq_(responses.calls[-1].request.url, 'http://www.ckan.org/api/3/action/xloader_hook') + assert ( + responses.calls[-1].request.url + == "http://www.ckan.org/api/3/action/xloader_hook" + ) job_dict = json.loads(responses.calls[-1].request.body) - assert job_dict['status'] == u'complete', job_dict - eq_(job_dict, - {u'metadata': {u'ckan_url': u'http://www.ckan.org/', - u'datastore_contains_all_records_of_source_file': True, - u'datastore_active': True, - u'resource_id': u'foo-bar-42'}, - u'status': u'complete'}) + assert job_dict["status"] == u"complete", job_dict + assert job_dict == { + u"metadata": { + u"ckan_url": u"http://www.ckan.org/", + u"datastore_contains_all_records_of_source_file": True, + u"datastore_active": True, + u"resource_id": u"foo-bar-42", + }, + u"status": u"complete", + } # Check the load data = self.get_datastore_table() - eq_(data['headers'], - ['_id', '_full_text', 'date', 'temperature', 'place']) - eq_(data['header_dict']['date'], 'TEXT') + assert data["headers"] == [ + "_id", + "_full_text", + "date", + "temperature", + "place", + ] + assert data["header_dict"]["date"] == "TEXT" # 'TIMESTAMP WITHOUT TIME ZONE') - eq_(data['header_dict']['temperature'], 'TEXT') # 'NUMERIC') - eq_(data['header_dict']['place'], 'TEXT') # 'TEXT') - eq_(data['num_rows'], 6) - eq_(data['rows'][0][2:], - (u'2011-01-01', u'1', u'Galway')) + assert data["header_dict"]["temperature"] == "TEXT" # 'NUMERIC') + assert data["header_dict"]["place"] == "TEXT" # 'TEXT') + assert data["num_rows"] == 6 + assert data["rows"][0][2:] == (u"2011-01-01", u"1", u"Galway") # (datetime.datetime(2011, 1, 1), 1, 'Galway')) # Check it wanted to set the datastore_active=True mocked_set_resource_metadata.assert_called_once() - eq_(mocked_set_resource_metadata.call_args[1]['update_dict'], - {'datastore_contains_all_records_of_source_file': True, - 'datastore_active': True, - 'ckan_url': 'http://www.ckan.org/', - 'resource_id': 'foo-bar-42'}) + assert mocked_set_resource_metadata.call_args[1]["update_dict"] == { + "datastore_contains_all_records_of_source_file": True, + "datastore_active": True, + "ckan_url": "http://www.ckan.org/", + "resource_id": "foo-bar-42", + } logs = self.get_load_logs(job_id) logs.assert_no_errors() job = jobs_db.get_job(job_id) - eq_(job['status'], u'complete') - eq_(job['error'], None) + assert job["status"] == u"complete" + assert job["error"] is None class Logs(list): def get_errors(self): - return [message for level, message in self - if level == 'ERROR'] + return [message for level, message in self if level == "ERROR"] def grep(self, text): - return [message for level, message in self - if text in message] + return [message for level, 
message in self if text in message] def assert_no_errors(self): errors = self.get_errors() @@ -572,7 +672,7 @@ def assert_no_errors(self): def get_sample_file(filename): - filepath = os.path.join(os.path.dirname(__file__), 'samples', filename) + filepath = os.path.join(os.path.dirname(__file__), "samples", filename) return open(filepath).read() @@ -585,18 +685,24 @@ def test_simple(self): resource = factories.Resource() jobs.set_resource_metadata( - {'datastore_contains_all_records_of_source_file': True, - 'datastore_active': True, - 'ckan_url': 'http://www.ckan.org/', - 'resource_id': resource['id']}) + { + "datastore_contains_all_records_of_source_file": True, + "datastore_active": True, + "ckan_url": "http://www.ckan.org/", + "resource_id": resource["id"], + } + ) - resource = helpers.call_action('resource_show', id=resource['id']) + resource = helpers.call_action("resource_show", id=resource["id"]) from pprint import pprint + pprint(resource) - assert_in(resource['datastore_contains_all_records_of_source_file'], - (True, u'True')) + assert resource["datastore_contains_all_records_of_source_file"] in ( + True, + u"True", + ) # I'm not quite sure why this is a string on travis - I get the bool # locally - eq_(resource['datastore_active'], True) - eq_(resource['ckan_url'], 'http://www.ckan.org/') + assert resource["datastore_active"] + assert resource["ckan_url"] == "http://www.ckan.org/" diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 2cf1c7d0..8dcff4f7 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- +from __future__ import print_function +from __future__ import absolute_import import os - +import pytest import sqlalchemy.orm as orm -from nose.tools import assert_equal, assert_raises, assert_in, nottest -from nose.plugins.skip import SkipTest import datetime from decimal import Decimal @@ -13,468 +13,1111 @@ from ckanext.xloader.job_exceptions import LoaderError import ckan.plugins as p -import util def get_sample_filepath(filename): - return os.path.abspath(os.path.join(os.path.dirname(__file__), 'samples', - filename)) + return os.path.abspath( + os.path.join(os.path.dirname(__file__), "samples", filename) + ) class PrintLogger(object): def __getattr__(self, log_level): def print_func(msg): - time = datetime.datetime.now().strftime('%H:%M:%S') - print '{} {}: {}'.format(time, log_level.capitalize(), msg) - return print_func + time = datetime.datetime.now().strftime("%H:%M:%S") + print("{} {}: {}".format(time, log_level.capitalize(), msg)) + return print_func -class TestLoadBase(util.PluginsMixin): - _load_plugins = ['datastore'] - def setup(self): - engine = get_write_engine() - self.Session = orm.scoped_session(orm.sessionmaker(bind=engine)) - helpers.reset_db() - util.reset_datastore_db() - util.add_full_text_trigger_function() +@pytest.fixture() +def Session(): + engine = get_write_engine() + Session = orm.scoped_session(orm.sessionmaker(bind=engine)) + yield Session + Session.close() - def teardown(self): - self.Session.close() - def _get_records(self, table_name, limit=None, - exclude_full_text_column=True): - c = self.Session.connection() +@pytest.mark.usefixtures("full_reset", "with_plugins") +@pytest.mark.ckan_config("ckan.plugins", "datastore xloader") +class TestLoadBase(object): + def _get_records( + self, Session, table_name, limit=None, exclude_full_text_column=True + ): + c = Session.connection() if exclude_full_text_column: - cols = 
self._get_column_names(table_name) - cols = ', '.join(loader.identifier(col) for col in cols - if col != '_full_text') + cols = self._get_column_names(Session, table_name) + cols = ", ".join( + loader.identifier(col) for col in cols if col != "_full_text" + ) else: - cols = '*' - sql = 'SELECT {cols} FROM "{table_name}"' \ - .format(cols=cols, table_name=table_name) + cols = "*" + sql = 'SELECT {cols} FROM "{table_name}"'.format( + cols=cols, table_name=table_name + ) if limit is not None: - sql += ' LIMIT {}'.format(limit) + sql += " LIMIT {}".format(limit) results = c.execute(sql) return results.fetchall() - def _get_column_names(self, table_name): + def _get_column_names(self, Session, table_name): # SELECT column_name FROM information_schema.columns WHERE table_name='test1'; - c = self.Session.connection() - sql = "SELECT column_name FROM information_schema.columns " \ - "WHERE table_name='{}';".format(table_name) + c = Session.connection() + sql = ( + "SELECT column_name FROM information_schema.columns " + "WHERE table_name='{}';".format(table_name) + ) results = c.execute(sql) records = results.fetchall() return [r[0] for r in records] - def _get_column_types(self, table_name): - c = self.Session.connection() - sql = "SELECT udt_name FROM information_schema.columns " \ - "WHERE table_name='{}';".format(table_name) + def _get_column_types(self, Session, table_name): + c = Session.connection() + sql = ( + "SELECT udt_name FROM information_schema.columns " + "WHERE table_name='{}';".format(table_name) + ) results = c.execute(sql) records = results.fetchall() return [r[0] for r in records] class TestLoadCsv(TestLoadBase): - - def test_simple(self): - csv_filepath = get_sample_filepath('simple.csv') - resource_id = 'test1' + def test_simple(self, Session): + csv_filepath = get_sample_filepath("simple.csv") + resource_id = "test1" factories.Resource(id=resource_id) - loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - - assert_equal(self._get_records( - 'test1', limit=1, exclude_full_text_column=False), - [(1, "'-01':2,3 '1':4 '2011':1 'galway':5", u'2011-01-01', u'1', u'Galway')]) - assert_equal(self._get_records('test1'), - [(1, u'2011-01-01', u'1', u'Galway'), - (2, u'2011-01-02', u'-1', u'Galway'), - (3, u'2011-01-03', u'0', u'Galway'), - (4, u'2011-01-01', u'6', u'Berkeley'), - (5, None, None, u'Berkeley'), - (6, u'2011-01-03', u'5', None)]) - assert_equal( - self._get_column_names('test1'), - [u'_id', u'_full_text', u'date', u'temperature', u'place']) - assert_equal( - self._get_column_types('test1'), - [u'int4', u'tsvector', u'text', u'text', u'text']) - - def test_simple_with_indexing(self): - csv_filepath = get_sample_filepath('simple.csv') - resource_id = 'test1' + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) + + assert self._get_records( + Session, "test1", limit=1, exclude_full_text_column=False + ) == [ + ( + 1, + "'-01':2,3 '1':4 '2011':1 'galway':5", + u"2011-01-01", + u"1", + u"Galway", + ) + ] + assert self._get_records(Session, "test1") == [ + (1, u"2011-01-01", u"1", u"Galway"), + (2, u"2011-01-02", u"-1", u"Galway"), + (3, u"2011-01-03", u"0", u"Galway"), + (4, u"2011-01-01", u"6", u"Berkeley"), + (5, None, None, u"Berkeley"), + (6, u"2011-01-03", u"5", None), + ] + assert self._get_column_names(Session, "test1") == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + assert self._get_column_types(Session, "test1") == [ + u"int4", + 
u"tsvector", + u"text", + u"text", + u"text", + ] + + def test_simple_with_indexing(self, Session): + csv_filepath = get_sample_filepath("simple.csv") + resource_id = "test1" factories.Resource(id=resource_id) - fields = loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - loader.create_column_indexes(fields=fields, resource_id=resource_id, - logger=PrintLogger()) + fields = loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) + loader.create_column_indexes( + fields=fields, resource_id=resource_id, logger=PrintLogger() + ) - assert_equal(self._get_records( - 'test1', limit=1, exclude_full_text_column=False)[0][1], - "'-01':2,3 '1':4 '2011':1 'galway':5") + assert ( + self._get_records( + Session, "test1", limit=1, exclude_full_text_column=False + )[0][1] + == "'-01':2,3 '1':4 '2011':1 'galway':5" + ) # test disabled by default to avoid adding large file to repo and slow test - @nottest + @pytest.mark.skip def test_boston_311_complete(self): # to get the test file: # curl -o ckanext/xloader/tests/samples/boston_311.csv https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2968e2c0-d479-49ba-a884-4ef523ada3c0/download/311.csv # noqa - csv_filepath = get_sample_filepath('boston_311.csv') - resource_id = 'test1' + csv_filepath = get_sample_filepath("boston_311.csv") + resource_id = "test1" factories.Resource(id=resource_id) import time + t0 = time.time() - print '{} Start load'.format(time.strftime('%H:%M:%S', time.localtime(t0))) - loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - print 'Load: {}s'.format(time.time() - t0) + print( + "{} Start load".format( + time.strftime("%H:%M:%S", time.localtime(t0)) + ) + ) + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) + print("Load: {}s".format(time.time() - t0)) # test disabled by default to avoid adding large file to repo and slow test - @nottest + @pytest.mark.skip def test_boston_311_sample5(self): # to create the test file: # head -n 100001 ckanext/xloader/tests/samples/boston_311.csv > ckanext/xloader/tests/samples/boston_311_sample5.csv - csv_filepath = get_sample_filepath('boston_311_sample5.csv') - resource_id = 'test1' + csv_filepath = get_sample_filepath("boston_311_sample5.csv") + resource_id = "test1" factories.Resource(id=resource_id) import time + t0 = time.time() - print '{} Start load'.format(time.strftime('%H:%M:%S', time.localtime(t0))) - loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - print 'Load: {}s'.format(time.time() - t0) - - def test_boston_311(self): - csv_filepath = get_sample_filepath('boston_311_sample.csv') - resource_id = 'test1' - factories.Resource(id=resource_id) - loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - - records = self._get_records('test1') - print records - assert_equal( - records, - [(1, u'101002153891', u'2017-07-06 23:38:43', u'2017-07-21 08:30:00', None, u'ONTIME', u'Open', u' ', u'Street Light Outages', u'Public Works Department', u'Street Lights', u'Street Light Outages', u'PWDx_Street Light Outages', u'PWDx', None, None, u'480 Harvard St Dorchester MA 02124', u'8', u'07', u'4', u'B3', u'Greater Mattapan', u'9', u'Ward 14', u'1411', u'480 Harvard St', u'02124', u'42.288', u'-71.0927', u'Citizens Connect App'), # noqa - (2, u'101002153890', 
u'2017-07-06 23:29:13', u'2017-09-11 08:30:00', None, u'ONTIME', u'Open', u' ', u'Graffiti Removal', u'Property Management', u'Graffiti', u'Graffiti Removal', u'PROP_GRAF_GraffitiRemoval', u'PROP', u' https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg', None, u'522 Saratoga St East Boston MA 02128', u'1', u'09', u'1', u'A7', u'East Boston', u'1', u'Ward 1', u'0110', u'522 Saratoga St', u'02128', u'42.3807', u'-71.0259', u'Citizens Connect App'), # noqa - (3, u'101002153889', u'2017-07-06 23:24:20', u'2017-09-11 08:30:00', None, u'ONTIME', u'Open', u' ', u'Graffiti Removal', u'Property Management', u'Graffiti', u'Graffiti Removal', u'PROP_GRAF_GraffitiRemoval', u'PROP', u' https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg', None, u'965 Bennington St East Boston MA 02128', u'1', u'09', u'1', u'A7', u'East Boston', u'1', u'Ward 1', u'0112', u'965 Bennington St', u'02128', u'42.386', u'-71.008', u'Citizens Connect App')] # noqa + print( + "{} Start load".format( + time.strftime("%H:%M:%S", time.localtime(t0)) ) - print self._get_column_names('test1') - assert_equal( - self._get_column_names('test1'), - [u'_id', u'_full_text', u'CASE_ENQUIRY_ID', u'open_dt', u'target_dt', u'closed_dt', u'OnTime_Status', u'CASE_STATUS', u'CLOSURE_REASON', u'CASE_TITLE', u'SUBJECT', u'REASON', u'TYPE', u'QUEUE', u'Department', u'SubmittedPhoto', u'ClosedPhoto', u'Location', u'Fire_district', u'pwd_district', u'city_council_district', u'police_district', u'neighborhood', u'neighborhood_services_district', u'ward', u'precinct', u'LOCATION_STREET_NAME', u'LOCATION_ZIPCODE', u'Latitude', u'Longitude', u'Source']) # noqa - print self._get_column_types('test1') - assert_equal(self._get_column_types('test1'), - [u'int4', u'tsvector'] + - [u'text'] * (len(records[0]) - 1)) - - def test_brazilian(self): - csv_filepath = get_sample_filepath('brazilian_sample.csv') - resource_id = 'test1' + ) + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) + print("Load: {}s".format(time.time() - t0)) + + def test_boston_311(self, Session): + csv_filepath = get_sample_filepath("boston_311_sample.csv") + resource_id = "test1" + factories.Resource(id=resource_id) + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) + + records = self._get_records(Session, "test1") + print(records) + assert records == [ + ( + 1, + u"101002153891", + u"2017-07-06 23:38:43", + u"2017-07-21 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", + u"Street Light Outages", + u"Public Works Department", + u"Street Lights", + u"Street Light Outages", + u"PWDx_Street Light Outages", + u"PWDx", + None, + None, + u"480 Harvard St Dorchester MA 02124", + u"8", + u"07", + u"4", + u"B3", + u"Greater Mattapan", + u"9", + u"Ward 14", + u"1411", + u"480 Harvard St", + u"02124", + u"42.288", + u"-71.0927", + u"Citizens Connect App", + ), # noqa + ( + 2, + u"101002153890", + u"2017-07-06 23:29:13", + u"2017-09-11 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", + None, + u"522 Saratoga St East Boston MA 02128", + u"1", + u"09", + u"1", + u"A7", + u"East Boston", + u"1", + u"Ward 1", + u"0110", + u"522 Saratoga St", + u"02128", + 
u"42.3807", + u"-71.0259", + u"Citizens Connect App", + ), # noqa + ( + 3, + u"101002153889", + u"2017-07-06 23:24:20", + u"2017-09-11 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", + None, + u"965 Bennington St East Boston MA 02128", + u"1", + u"09", + u"1", + u"A7", + u"East Boston", + u"1", + u"Ward 1", + u"0112", + u"965 Bennington St", + u"02128", + u"42.386", + u"-71.008", + u"Citizens Connect App", + ), + ] # noqa + print(self._get_column_names(Session, "test1")) + assert self._get_column_names(Session, "test1") == [ + u"_id", + u"_full_text", + u"CASE_ENQUIRY_ID", + u"open_dt", + u"target_dt", + u"closed_dt", + u"OnTime_Status", + u"CASE_STATUS", + u"CLOSURE_REASON", + u"CASE_TITLE", + u"SUBJECT", + u"REASON", + u"TYPE", + u"QUEUE", + u"Department", + u"SubmittedPhoto", + u"ClosedPhoto", + u"Location", + u"Fire_district", + u"pwd_district", + u"city_council_district", + u"police_district", + u"neighborhood", + u"neighborhood_services_district", + u"ward", + u"precinct", + u"LOCATION_STREET_NAME", + u"LOCATION_ZIPCODE", + u"Latitude", + u"Longitude", + u"Source", + ] # noqa + print(self._get_column_types(Session, "test1")) + assert self._get_column_types(Session, "test1") == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) + + def test_brazilian(self, Session): + csv_filepath = get_sample_filepath("brazilian_sample.csv") + resource_id = "test1" factories.Resource(id=resource_id) - loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - - records = self._get_records('test1') - print records - assert_equal( - records[0], - (1, u'01/01/1996 12:00:00 AM', u'1100015', u"ALTA FLORESTA D'OESTE", u'RO', None, u'128', u'0', u'8', u'119', u'1', u'0', u'3613', u'3051', u'130', u'7', u'121', u'3716', u'3078', u'127', u'7', None, None, None, None, u'6794', u'5036', u'1758', None, None, None, None, None, None, u'337', u'0.26112759', u'0.17210683', u'0.43323442', u'0.13353115', u'24.833692447908199', None, None, u'22.704964', u'67.080006197818605', u'65.144188573097907', u'74.672390253375497', u'16.7913561569619', u'19.4894563570641', u'8.649237411458509', u'7.60165422117368', u'11.1540090366186', u'17.263407056738099', u'8.5269823', u'9.2213373', u'5.3085136', u'52.472769803217503', None, None, None, None, None, None, u'25.0011414302354', u'22.830887000000001', u'66.8150490097632', u'64.893674212235595', u'74.288246611754104', u'17.0725384713319', u'19.8404105332814', u'8.856561911292371', u'7.74275834336647', u'11.357671741889', u'17.9410577459881', u'8.3696527', u'8.9979973', u'5.0570836', u'53.286314230720798', None, None, None, None, None, u'122988', None, u'10.155015000000001', u'14.826086999999999', u'11.671533', u'9.072917', None, None, None, None, None, None, None, None)) # noqa - print self._get_column_names('test1') - assert_equal( - self._get_column_names('test1'), - [u'_id', u'_full_text', u'NU_ANO_CENSO', u'CO_MUNICIPIO', u'MUNIC', u'SIGLA', u'CO_UF', u'SCHOOLS_NU', u'SCHOOLS_FED_NU', u'SCHOOLS_ESTADUAL_NU', u'SCHOOLS_MUN_NU', u'SCHOOLS_PRIV_NU', u'SCHOOLS_FED_STUD', u'SCHOOLS_ESTADUAL_STUD', u'SCHOOLS_MUN_STUD', u'SCHOOLS_PRIV_STUD', u'SCHOOLS_URBAN_NU', u'SCHOOLS_RURAL_NU', u'SCHOOLS_URBAN_STUD', u'SCHOOLS_RURAL_STUD', u'SCHOOLS_NIVFUND_1_NU', u'SCHOOLS_NIVFUND_2_NU', 
u'SCHOOLS_EIGHTYEARS_NU', u'SCHOOLS_NINEYEARS_NU', u'SCHOOLS_EIGHTYEARS_STUD', u'SCHOOLS_NINEYEARS_STUD', u'MATFUND_NU', u'MATFUND_I_NU', u'MATFUND_T_NU', u'SCHOOLS_INTERNET_AVG', u'SCHOOLS_WATER_PUBLIC_AVG', u'SCHOOLS_WATER_AVG', u'SCHOOLS_ELECTR_PUB_AVG', u'SCHOOLS_SEWAGE_PUB_AVG', u'SCHOOLS_SEWAGE_AVG', u'PROFFUNDTOT_NU', u'PROFFUNDINC_PC', u'PROFFUNDCOMP_PC', u'PROFMED_PC', u'PROFSUP_PC', u'CLASSSIZE', u'CLASSSIZE_I', u'CLASSSIZE_T', u'STUDTEACH', u'RATE_APROV', u'RATE_APROV_I', u'RATE_APROV_T', u'RATE_FAILURE', u'RATE_FAILURE_I', u'RATE_FAILURE_T', u'RATE_ABANDON', u'RATE_ABANDON_I', u'RATE_ABANDON_T', u'RATE_TRANSFER', u'RATE_TRANSFER_I', u'RATE_TRANSFER_T', u'RATE_OVERAGE', u'RATE_OVERAGE_I', u'RATE_OVERAGE_T', u'PROVA_MEAN_PORT_I', u'PROVA_MEAN_PORT_T', u'PROVA_MEAN_MAT_I', u'PROVA_MEAN_MAT_T', u'CLASSSIZE_PUB', u'STUDTEACH_PUB', u'RATE_APROV_PUB', u'RATE_APROV_I_PUB', u'RATE_APROV_T_PUB', u'RATE_FAILURE_PUB', u'RATE_FAILURE_I_PUB', u'RATE_FAILURE_T_PUB', u'RATE_ABANDON_PUB', u'RATE_ABANDON_I_PUB', u'RATE_ABANDON_T_PUB', u'RATE_TRANSFER_PUB', u'RATE_TRANSFER_I_PUB', u'RATE_TRANSFER_T_PUB', u'RATE_OVERAGE_PUB', u'RATE_OVERAGE_I_PUB', u'RATE_OVERAGE_T_PUB', u'PROVA_MEAN_PORT_I_PUB', u'PROVA_MEAN_PORT_T_PUB', u'PROVA_MEAN_MAT_I_PUB', u'PROFFUNDTOT_NU_PUB', u'PROVA_MEAN_MAT_T_PUB', u'EDUCTEACH_PUB', u'EDUCTEACH_FEDERAL', u'EDUCTEACH_STATE', u'EDUCTEACH_MUN', u'PROVA_MEAN_PORT_I_STATE', u'PROVA_MEAN_PORT_T_STATE', u'PROVA_MEAN_MAT_I_STATE', u'PROVA_MEAN_MAT_T_STATE', u'PROVA_MEAN_PORT_I_MUN', u'PROVA_MEAN_PORT_T_MUN', u'PROVA_MEAN_MAT_I_MUN', u'PROVA_MEAN_MAT_T_MUN']) # noqa - print self._get_column_types('test1') - assert_equal(self._get_column_types('test1'), - [u'int4', u'tsvector'] + - [u'text'] * (len(records[0]) - 1)) - - def test_german(self): - csv_filepath = get_sample_filepath('german_sample.csv') - resource_id = 'test_german' + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) + + records = self._get_records(Session, "test1") + print(records) + assert records[0] == ( + 1, + u"01/01/1996 12:00:00 AM", + u"1100015", + u"ALTA FLORESTA D'OESTE", + u"RO", + None, + u"128", + u"0", + u"8", + u"119", + u"1", + u"0", + u"3613", + u"3051", + u"130", + u"7", + u"121", + u"3716", + u"3078", + u"127", + u"7", + None, + None, + None, + None, + u"6794", + u"5036", + u"1758", + None, + None, + None, + None, + None, + None, + u"337", + u"0.26112759", + u"0.17210683", + u"0.43323442", + u"0.13353115", + u"24.833692447908199", + None, + None, + u"22.704964", + u"67.080006197818605", + u"65.144188573097907", + u"74.672390253375497", + u"16.7913561569619", + u"19.4894563570641", + u"8.649237411458509", + u"7.60165422117368", + u"11.1540090366186", + u"17.263407056738099", + u"8.5269823", + u"9.2213373", + u"5.3085136", + u"52.472769803217503", + None, + None, + None, + None, + None, + None, + u"25.0011414302354", + u"22.830887000000001", + u"66.8150490097632", + u"64.893674212235595", + u"74.288246611754104", + u"17.0725384713319", + u"19.8404105332814", + u"8.856561911292371", + u"7.74275834336647", + u"11.357671741889", + u"17.9410577459881", + u"8.3696527", + u"8.9979973", + u"5.0570836", + u"53.286314230720798", + None, + None, + None, + None, + None, + u"122988", + None, + u"10.155015000000001", + u"14.826086999999999", + u"11.671533", + u"9.072917", + None, + None, + None, + None, + None, + None, + None, + None, + ) # noqa + print(self._get_column_names(Session, "test1")) + assert self._get_column_names(Session, "test1") == [ + 
u"_id", + u"_full_text", + u"NU_ANO_CENSO", + u"CO_MUNICIPIO", + u"MUNIC", + u"SIGLA", + u"CO_UF", + u"SCHOOLS_NU", + u"SCHOOLS_FED_NU", + u"SCHOOLS_ESTADUAL_NU", + u"SCHOOLS_MUN_NU", + u"SCHOOLS_PRIV_NU", + u"SCHOOLS_FED_STUD", + u"SCHOOLS_ESTADUAL_STUD", + u"SCHOOLS_MUN_STUD", + u"SCHOOLS_PRIV_STUD", + u"SCHOOLS_URBAN_NU", + u"SCHOOLS_RURAL_NU", + u"SCHOOLS_URBAN_STUD", + u"SCHOOLS_RURAL_STUD", + u"SCHOOLS_NIVFUND_1_NU", + u"SCHOOLS_NIVFUND_2_NU", + u"SCHOOLS_EIGHTYEARS_NU", + u"SCHOOLS_NINEYEARS_NU", + u"SCHOOLS_EIGHTYEARS_STUD", + u"SCHOOLS_NINEYEARS_STUD", + u"MATFUND_NU", + u"MATFUND_I_NU", + u"MATFUND_T_NU", + u"SCHOOLS_INTERNET_AVG", + u"SCHOOLS_WATER_PUBLIC_AVG", + u"SCHOOLS_WATER_AVG", + u"SCHOOLS_ELECTR_PUB_AVG", + u"SCHOOLS_SEWAGE_PUB_AVG", + u"SCHOOLS_SEWAGE_AVG", + u"PROFFUNDTOT_NU", + u"PROFFUNDINC_PC", + u"PROFFUNDCOMP_PC", + u"PROFMED_PC", + u"PROFSUP_PC", + u"CLASSSIZE", + u"CLASSSIZE_I", + u"CLASSSIZE_T", + u"STUDTEACH", + u"RATE_APROV", + u"RATE_APROV_I", + u"RATE_APROV_T", + u"RATE_FAILURE", + u"RATE_FAILURE_I", + u"RATE_FAILURE_T", + u"RATE_ABANDON", + u"RATE_ABANDON_I", + u"RATE_ABANDON_T", + u"RATE_TRANSFER", + u"RATE_TRANSFER_I", + u"RATE_TRANSFER_T", + u"RATE_OVERAGE", + u"RATE_OVERAGE_I", + u"RATE_OVERAGE_T", + u"PROVA_MEAN_PORT_I", + u"PROVA_MEAN_PORT_T", + u"PROVA_MEAN_MAT_I", + u"PROVA_MEAN_MAT_T", + u"CLASSSIZE_PUB", + u"STUDTEACH_PUB", + u"RATE_APROV_PUB", + u"RATE_APROV_I_PUB", + u"RATE_APROV_T_PUB", + u"RATE_FAILURE_PUB", + u"RATE_FAILURE_I_PUB", + u"RATE_FAILURE_T_PUB", + u"RATE_ABANDON_PUB", + u"RATE_ABANDON_I_PUB", + u"RATE_ABANDON_T_PUB", + u"RATE_TRANSFER_PUB", + u"RATE_TRANSFER_I_PUB", + u"RATE_TRANSFER_T_PUB", + u"RATE_OVERAGE_PUB", + u"RATE_OVERAGE_I_PUB", + u"RATE_OVERAGE_T_PUB", + u"PROVA_MEAN_PORT_I_PUB", + u"PROVA_MEAN_PORT_T_PUB", + u"PROVA_MEAN_MAT_I_PUB", + u"PROFFUNDTOT_NU_PUB", + u"PROVA_MEAN_MAT_T_PUB", + u"EDUCTEACH_PUB", + u"EDUCTEACH_FEDERAL", + u"EDUCTEACH_STATE", + u"EDUCTEACH_MUN", + u"PROVA_MEAN_PORT_I_STATE", + u"PROVA_MEAN_PORT_T_STATE", + u"PROVA_MEAN_MAT_I_STATE", + u"PROVA_MEAN_MAT_T_STATE", + u"PROVA_MEAN_PORT_I_MUN", + u"PROVA_MEAN_PORT_T_MUN", + u"PROVA_MEAN_MAT_I_MUN", + u"PROVA_MEAN_MAT_T_MUN", + ] # noqa + print(self._get_column_types(Session, "test1")) + assert self._get_column_types(Session, "test1") == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) + + def test_german(self, Session): + csv_filepath = get_sample_filepath("german_sample.csv") + resource_id = "test_german" factories.Resource(id=resource_id) - loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - - records = self._get_records('test_german') - print records - assert_equal( - records[0], - (1, u'Zürich', u'68260', u'65444', u'62646', u'6503', u'28800', u'1173', u'6891', u'24221', u'672') - ) - print self._get_column_names('test_german') - assert_equal( - self._get_column_names('test_german'), - [ - u'_id', - u'_full_text', - u'Stadtname', - u'Schuler_Total_2010/2011', - u'Schuler_Total_2000/2001', - u'Schuler_Total_1990/1991', - u'Schuler_Vorschule_2010/2011', - u'Schuler_Obligatorische Primar- und Sekundarstufe I_2010/2011', - u'Schuler_Sekundarstufe II, Ubergangsausbildung Sek I. - Sek. 
II_',
-                u'Schuler_Maturitatsschulen_2010/2011',
-                u'Schuler_Berufsausbildung_2010/2011',
-                u'Schuler_andere allgemeinbildende Schulen_2010/2011',
-            ]
-        )
-        print self._get_column_types('test_german')
-        assert_equal(
-            self._get_column_types('test_german'),
-            [u'int4', u'tsvector'] +
-            [u'text'] * (len(records[0]) - 1)
+        loader.load_csv(
+            csv_filepath,
+            resource_id=resource_id,
+            mimetype="text/csv",
+            logger=PrintLogger(),
         )

+        records = self._get_records(Session, "test_german")
+        print(records)
+        assert records[0] == (
+            1,
+            u"Zürich",
+            u"68260",
+            u"65444",
+            u"62646",
+            u"6503",
+            u"28800",
+            u"1173",
+            u"6891",
+            u"24221",
+            u"672",
+        )
+        print(self._get_column_names(Session, "test_german"))
+        assert self._get_column_names(Session, "test_german") == [
+            u"_id",
+            u"_full_text",
+            u"Stadtname",
+            u"Schuler_Total_2010/2011",
+            u"Schuler_Total_2000/2001",
+            u"Schuler_Total_1990/1991",
+            u"Schuler_Vorschule_2010/2011",
+            u"Schuler_Obligatorische Primar- und Sekundarstufe I_2010/2011",
+            u"Schuler_Sekundarstufe II, Ubergangsausbildung Sek I. - Sek. II_",
+            u"Schuler_Maturitatsschulen_2010/2011",
+            u"Schuler_Berufsausbildung_2010/2011",
+            u"Schuler_andere allgemeinbildende Schulen_2010/2011",
+        ]
+        print(self._get_column_types(Session, "test_german"))
+        assert self._get_column_types(Session, "test_german") == [
+            u"int4",
+            u"tsvector",
+        ] + [u"text"] * (len(records[0]) - 1)
+
     def test_integer_header_xlsx(self):
         # this xlsx file's header is detected by messytables.headers_guess as
         # integers and we should cope with that
-        csv_filepath = get_sample_filepath('go-realtime.xlsx')
-        resource_id = factories.Resource()['id']
+        csv_filepath = get_sample_filepath("go-realtime.xlsx")
+        resource_id = factories.Resource()["id"]
         try:
-            loader.load_csv(csv_filepath, resource_id=resource_id,
-                            mimetype='CSV', logger=PrintLogger())
-        except LoaderError as e:
-            # it should fail at the COPY stage
-            assert 'Error during the load into PostgreSQL: invalid byte ' \
-                'sequence for encoding' in str(e)
+            loader.load_csv(
+                csv_filepath,
+                resource_id=resource_id,
+                mimetype="CSV",
+                logger=PrintLogger(),
+            )
+        except (LoaderError, UnicodeDecodeError):
+            pass
         else:
-            assert 0, 'There should have been an exception'
+            assert 0, "There should have been an exception"

-    def test_reload(self):
-        csv_filepath = get_sample_filepath('simple.csv')
-        resource_id = 'test1'
+    def test_reload(self, Session):
+        csv_filepath = get_sample_filepath("simple.csv")
+        resource_id = "test1"
         factories.Resource(id=resource_id)
-        loader.load_csv(csv_filepath, resource_id=resource_id,
-                        mimetype='text/csv', logger=PrintLogger())
+        loader.load_csv(
+            csv_filepath,
+            resource_id=resource_id,
+            mimetype="text/csv",
+            logger=PrintLogger(),
+        )

         # Load it again unchanged
-        loader.load_csv(csv_filepath, resource_id=resource_id,
-                        mimetype='text/csv', logger=PrintLogger())
+        loader.load_csv(
+            csv_filepath,
+            resource_id=resource_id,
+            mimetype="text/csv",
+            logger=PrintLogger(),
+        )
+
+        assert
len(self._get_records(Session, "test1")) == 6 + assert self._get_column_names(Session, "test1") == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + assert self._get_column_types(Session, "test1") == [ + u"int4", + u"tsvector", + u"text", + u"text", + u"text", + ] + + @pytest.mark.skipif( + not p.toolkit.check_ckan_version(min_version="2.7"), + reason="Requires CKAN 2.7 - see https://github.com/ckan/ckan/pull/3557", + ) + def test_reload_with_overridden_types(self, Session): + csv_filepath = get_sample_filepath("simple.csv") + resource_id = "test1" factories.Resource(id=resource_id) - loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) # Change types, as it would be done by Data Dictionary - rec = p.toolkit.get_action('datastore_search')(None, { - 'resource_id': resource_id, - 'limit': 0}) - fields = [f for f in rec['fields'] if not f['id'].startswith('_')] - fields[0]['info'] = {'type_override': 'timestamp'} - fields[1]['info'] = {'type_override': 'numeric'} - p.toolkit.get_action('datastore_create')({'ignore_auth': True}, { - 'resource_id': resource_id, - 'force': True, - 'fields': fields - }) - # [{ - # 'id': f['id'], - # 'type': f['type'], - # 'info': fi if isinstance(fi, dict) else {} - # } for f, fi in izip_longest(fields, info)] + rec = p.toolkit.get_action("datastore_search")( + None, {"resource_id": resource_id, "limit": 0} + ) + fields = [f for f in rec["fields"] if not f["id"].startswith("_")] + fields[0]["info"] = {"type_override": "timestamp"} + fields[1]["info"] = {"type_override": "numeric"} + p.toolkit.get_action("datastore_create")( + {"ignore_auth": True}, + {"resource_id": resource_id, "force": True, "fields": fields}, + ) # Load it again with new types - fields = loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - loader.create_column_indexes(fields=fields, resource_id=resource_id, - logger=PrintLogger()) - - assert_equal(len(self._get_records('test1')), 6) - assert_equal( - self._get_column_names('test1'), - [u'_id', u'_full_text', u'date', u'temperature', u'place']) - assert_equal( - self._get_column_types('test1'), - [u'int4', u'tsvector', u'timestamp', u'numeric', u'text']) + fields = loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) + loader.create_column_indexes( + fields=fields, resource_id=resource_id, logger=PrintLogger() + ) + + assert len(self._get_records(Session, "test1")) == 6 + assert self._get_column_names(Session, "test1") == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + assert self._get_column_types(Session, "test1") == [ + u"int4", + u"tsvector", + u"timestamp", + u"numeric", + u"text", + ] # check that rows with nulls are indexed correctly - records = self._get_records('test1', exclude_full_text_column=False) - print records - assert_equal( - records[4][1], - "'berkeley':1" - ) - assert_equal( - records[5][1], - "'-01':2 '-03':3 '00':4,5,6 '2011':1 '5':7" - ) + records = self._get_records( + Session, "test1", exclude_full_text_column=False + ) + print(records) + assert records[4][1] == "'berkeley':1" + assert records[5][1] == "'-01':2 '-03':3 '00':4,5,6 '2011':1 '5':7" def test_encode_headers(self): - test_string_headers = [u'id', u'namé'] - test_float_headers = [u'id', u'näme', 2.0] - test_int_headers = [u'id', u'nóm', 3] + 
test_string_headers = [u"id", u"namé"] + test_float_headers = [u"id", u"näme", 2.0] + test_int_headers = [u"id", u"nóm", 3] test_result_string_headers = loader.encode_headers(test_string_headers) test_result_float_headers = loader.encode_headers(test_float_headers) test_result_int_headers = loader.encode_headers(test_int_headers) - assert_in('id', test_result_string_headers) - assert_in('name', test_result_string_headers) - assert_in('id', test_result_float_headers) - assert_in('name', test_result_float_headers) - assert_in('2.0', test_result_float_headers) - assert_in('id', test_result_int_headers) - assert_in('nom', test_result_int_headers) - assert_in('3', test_result_int_headers) - - def test_column_names(self): - csv_filepath = get_sample_filepath('column_names.csv') - resource_id = 'test1' + assert "id" in test_result_string_headers + assert "name" in test_result_string_headers + assert "id" in test_result_float_headers + assert "name" in test_result_float_headers + assert "2.0" in test_result_float_headers + assert "id" in test_result_int_headers + assert "nom" in test_result_int_headers + assert "3" in test_result_int_headers + + def test_column_names(self, Session): + csv_filepath = get_sample_filepath("column_names.csv") + resource_id = "test1" factories.Resource(id=resource_id) - loader.load_csv(csv_filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) - assert_equal( - self._get_column_names('test1')[2:], - [u'd@t$e', u't^e&m*pe!r(a)t?u:r%%e', ur'p\l/a[c{e%']) - assert_equal(self._get_records('test1')[0], - (1, u'2011-01-01', u'1', u'Galway')) + assert self._get_column_names(Session, "test1")[2:] == [ + u"d@t$e", + u"t^e&m*pe!r(a)t?u:r%%e", + r"p\l/a[c{e%", + ] + assert self._get_records(Session, "test1")[0] == ( + 1, + u"2011-01-01", + u"1", + u"Galway", + ) class TestLoadUnhandledTypes(TestLoadBase): - def test_kml(self): - filepath = get_sample_filepath('polling_locations.kml') - resource_id = 'test1' + filepath = get_sample_filepath("polling_locations.kml") + resource_id = "test1" factories.Resource(id=resource_id) - with assert_raises(LoaderError) as exception: - loader.load_csv(filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - assert_in('Error with field definition', - str(exception.exception)) - assert_in('"" is not a valid field name', - str(exception.exception)) + with pytest.raises(LoaderError) as exception: + loader.load_csv( + filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) + assert "Error with field definition" in str(exception.value) + assert ( + '"" is not a valid field name' + in str(exception.value) + ) def test_geojson(self): - filepath = get_sample_filepath('polling_locations.geojson') - resource_id = 'test1' + filepath = get_sample_filepath("polling_locations.geojson") + resource_id = "test1" factories.Resource(id=resource_id) - with assert_raises(LoaderError) as exception: - loader.load_csv(filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - assert_in('Error with field definition', - str(exception.exception)) - assert_in('"{"type":"FeatureCollection"" is not a valid field name', - str(exception.exception)) + with pytest.raises(LoaderError) as exception: + loader.load_csv( + filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) + assert "Error with field definition" in 
str(exception.value) + assert ( + '"{"type":"FeatureCollection"" is not a valid field name' + in str(exception.value) + ) def test_shapefile_zip(self): - filepath = get_sample_filepath('polling_locations.shapefile.zip') - resource_id = 'test1' + filepath = get_sample_filepath("polling_locations.shapefile.zip") + resource_id = "test1" factories.Resource(id=resource_id) - with assert_raises(LoaderError) as exception: - loader.load_csv(filepath, resource_id=resource_id, - mimetype='text/csv', logger=PrintLogger()) - assert_in('Error during the load into PostgreSQL: ' - 'unquoted carriage return found in data', - str(exception.exception)) + with pytest.raises(LoaderError) as exception: + loader.load_csv( + filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=PrintLogger(), + ) class TestLoadMessytables(TestLoadBase): - - def test_simple(self): - csv_filepath = get_sample_filepath('simple.xls') - resource_id = 'test1' + def test_simple(self, Session): + csv_filepath = get_sample_filepath("simple.xls") + resource_id = "test1" factories.Resource(id=resource_id) - loader.load_table(csv_filepath, resource_id=resource_id, - mimetype='xls', logger=PrintLogger()) + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="xls", + logger=PrintLogger(), + ) - assert_in( - "'galway':", - self._get_records('test1', limit=1, exclude_full_text_column=False)[0][1]) + assert ( + "'galway':" + in self._get_records( + Session, "test1", limit=1, exclude_full_text_column=False + )[0][1] + ) # Indexed record looks like this (depending on CKAN version?): # "'-01':2,3 '00':4,5,6 '1':7 '2011':1 'galway':8" # "'-01':4,5 '00':6,7,8 '1':1 '2011':3 'galway':2" # "'-01':2,3 '00':5,6 '1':7 '2011':1 'galway':8 't00':4" - assert_equal( - self._get_records('test1'), - [(1, datetime.datetime(2011, 1, 1, 0, 0), Decimal('1'), u'Galway'), - (2, datetime.datetime(2011, 1, 2, 0, 0), Decimal('-1'), u'Galway'), - (3, datetime.datetime(2011, 1, 3, 0, 0), Decimal('0'), u'Galway'), - (4, datetime.datetime(2011, 1, 1, 0, 0), Decimal('6'), u'Berkeley'), - (5, datetime.datetime(2011, 1, 2, 0, 0), Decimal('8'), u'Berkeley'), - (6, datetime.datetime(2011, 1, 3, 0, 0), Decimal('5'), u'Berkeley')]) - assert_equal( - self._get_column_names('test1'), - [u'_id', u'_full_text', u'date', u'temperature', u'place']) - assert_equal( - self._get_column_types('test1'), - [u'int4', u'tsvector', u'timestamp', u'numeric', u'text']) + assert self._get_records(Session, "test1") == [ + (1, datetime.datetime(2011, 1, 1, 0, 0), Decimal("1"), u"Galway",), + ( + 2, + datetime.datetime(2011, 1, 2, 0, 0), + Decimal("-1"), + u"Galway", + ), + (3, datetime.datetime(2011, 1, 3, 0, 0), Decimal("0"), u"Galway",), + ( + 4, + datetime.datetime(2011, 1, 1, 0, 0), + Decimal("6"), + u"Berkeley", + ), + ( + 5, + datetime.datetime(2011, 1, 2, 0, 0), + Decimal("8"), + u"Berkeley", + ), + ( + 6, + datetime.datetime(2011, 1, 3, 0, 0), + Decimal("5"), + u"Berkeley", + ), + ] + assert self._get_column_names(Session, "test1") == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + assert self._get_column_types(Session, "test1") == [ + u"int4", + u"tsvector", + u"timestamp", + u"numeric", + u"text", + ] # test disabled by default to avoid adding large file to repo and slow test - @nottest + @pytest.mark.skip def test_boston_311_complete(self): # to get the test file: # curl -o ckanext/xloader/tests/samples/boston_311.csv 
https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2968e2c0-d479-49ba-a884-4ef523ada3c0/download/311.csv # noqa - csv_filepath = get_sample_filepath('boston_311.csv') - resource_id = 'test1' + csv_filepath = get_sample_filepath("boston_311.csv") + resource_id = "test1" factories.Resource(id=resource_id) import time + t0 = time.time() - print '{} Start load'.format(time.strftime('%H:%M:%S', time.localtime(t0))) - loader.load_table(csv_filepath, resource_id=resource_id, - mimetype='csv', logger=PrintLogger()) - print 'Load: {}s'.format(time.time() - t0) + print( + "{} Start load".format( + time.strftime("%H:%M:%S", time.localtime(t0)) + ) + ) + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=PrintLogger(), + ) + print("Load: {}s".format(time.time() - t0)) # test disabled by default to avoid adding large file to repo and slow test - @nottest + @pytest.mark.skip def test_boston_311_sample5(self): # to create the test file: # head -n 100001 ckanext/xloader/tests/samples/boston_311.csv > ckanext/xloader/tests/samples/boston_311_sample5.csv - csv_filepath = get_sample_filepath('boston_311_sample5.csv') - resource_id = 'test1' + csv_filepath = get_sample_filepath("boston_311_sample5.csv") + resource_id = "test1" factories.Resource(id=resource_id) import time + t0 = time.time() - print '{} Start load'.format(time.strftime('%H:%M:%S', time.localtime(t0))) - loader.load_table(csv_filepath, resource_id=resource_id, - mimetype='csv', logger=PrintLogger()) - print 'Load: {}s'.format(time.time() - t0) - - def test_boston_311(self): - csv_filepath = get_sample_filepath('boston_311_sample.csv') - resource_id = 'test1' - factories.Resource(id=resource_id) - loader.load_table(csv_filepath, resource_id=resource_id, - mimetype='csv', logger=PrintLogger()) - - records = self._get_records('test1') - print records - assert_equal( - records, - [(1, Decimal('101002153891'), datetime.datetime(2017, 7, 6, 23, 38, 43), datetime.datetime(2017, 7, 21, 8, 30), u'', u'ONTIME', u'Open', u' ', u'Street Light Outages', u'Public Works Department', u'Street Lights', u'Street Light Outages', u'PWDx_Street Light Outages', u'PWDx', u'', u'', u'480 Harvard St Dorchester MA 02124', Decimal('8'), Decimal('7'), Decimal('4'), u'B3', u'Greater Mattapan', Decimal('9'), u'Ward 14', Decimal('1411'), u'480 Harvard St', Decimal('2124'), Decimal('42.288'), Decimal('-71.0927'), u'Citizens Connect App'), # noqa - (2, Decimal('101002153890'), datetime.datetime(2017, 7, 6, 23, 29, 13), datetime.datetime(2017, 9, 11, 8, 30), u'', u'ONTIME', u'Open', u' ', u'Graffiti Removal', u'Property Management', u'Graffiti', u'Graffiti Removal', u'PROP_GRAF_GraffitiRemoval', u'PROP', u' https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg', u'', u'522 Saratoga St East Boston MA 02128', Decimal('1'), Decimal('9'), Decimal('1'), u'A7', u'East Boston', Decimal('1'), u'Ward 1', Decimal('110'), u'522 Saratoga St', Decimal('2128'), Decimal('42.3807'), Decimal('-71.0259'), u'Citizens Connect App'), # noqa - (3, Decimal('101002153889'), datetime.datetime(2017, 7, 6, 23, 24, 20), datetime.datetime(2017, 9, 11, 8, 30), u'', u'ONTIME', u'Open', u' ', u'Graffiti Removal', u'Property Management', u'Graffiti', u'Graffiti Removal', u'PROP_GRAF_GraffitiRemoval', u'PROP', u' https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg', u'', u'965 Bennington St East Boston MA 02128', Decimal('1'), Decimal('9'), Decimal('1'), 
u'A7', u'East Boston', Decimal('1'), u'Ward 1', Decimal('112'), u'965 Bennington St', Decimal('2128'), Decimal('42.386'), Decimal('-71.008'), u'Citizens Connect App')] # noqa + print( + "{} Start load".format( + time.strftime("%H:%M:%S", time.localtime(t0)) ) - print self._get_column_names('test1') - assert_equal( - self._get_column_names('test1'), - [u'_id', u'_full_text', u'CASE_ENQUIRY_ID', u'open_dt', u'target_dt', u'closed_dt', u'OnTime_Status', u'CASE_STATUS', u'CLOSURE_REASON', u'CASE_TITLE', u'SUBJECT', u'REASON', u'TYPE', u'QUEUE', u'Department', u'SubmittedPhoto', u'ClosedPhoto', u'Location', u'Fire_district', u'pwd_district', u'city_council_district', u'police_district', u'neighborhood', u'neighborhood_services_district', u'ward', u'precinct', u'LOCATION_STREET_NAME', u'LOCATION_ZIPCODE', u'Latitude', u'Longitude', u'Source']) # noqa - print self._get_column_types('test1') - assert_equal(self._get_column_types('test1'), - [u'int4', u'tsvector', - u'numeric', u'timestamp', u'timestamp', u'text', u'text', u'text', u'text', u'text', u'text', u'text', u'text', u'text', u'text', u'text', u'text', u'text', u'numeric', u'numeric', u'numeric', u'text', u'text', u'numeric', u'text', u'numeric', u'text', u'numeric', u'numeric', u'numeric', u'text']) # noqa + ) + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=PrintLogger(), + ) + print("Load: {}s".format(time.time() - t0)) + + def test_boston_311(self, Session): + csv_filepath = get_sample_filepath("boston_311_sample.csv") + resource_id = "test1" + factories.Resource(id=resource_id) + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=PrintLogger(), + ) + + records = self._get_records(Session, "test1") + print(records) + assert records == [ + ( + 1, + Decimal("101002153891"), + datetime.datetime(2017, 7, 6, 23, 38, 43), + datetime.datetime(2017, 7, 21, 8, 30), + u"", + u"ONTIME", + u"Open", + u" ", + u"Street Light Outages", + u"Public Works Department", + u"Street Lights", + u"Street Light Outages", + u"PWDx_Street Light Outages", + u"PWDx", + u"", + u"", + u"480 Harvard St Dorchester MA 02124", + Decimal("8"), + Decimal("7"), + Decimal("4"), + u"B3", + u"Greater Mattapan", + Decimal("9"), + u"Ward 14", + Decimal("1411"), + u"480 Harvard St", + Decimal("2124"), + Decimal("42.288"), + Decimal("-71.0927"), + u"Citizens Connect App", + ), # noqa + ( + 2, + Decimal("101002153890"), + datetime.datetime(2017, 7, 6, 23, 29, 13), + datetime.datetime(2017, 9, 11, 8, 30), + u"", + u"ONTIME", + u"Open", + u" ", + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", + u"", + u"522 Saratoga St East Boston MA 02128", + Decimal("1"), + Decimal("9"), + Decimal("1"), + u"A7", + u"East Boston", + Decimal("1"), + u"Ward 1", + Decimal("110"), + u"522 Saratoga St", + Decimal("2128"), + Decimal("42.3807"), + Decimal("-71.0259"), + u"Citizens Connect App", + ), # noqa + ( + 3, + Decimal("101002153889"), + datetime.datetime(2017, 7, 6, 23, 24, 20), + datetime.datetime(2017, 9, 11, 8, 30), + u"", + u"ONTIME", + u"Open", + u" ", + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", + u"", + u"965 Bennington St East Boston MA 02128", 
+ Decimal("1"), + Decimal("9"), + Decimal("1"), + u"A7", + u"East Boston", + Decimal("1"), + u"Ward 1", + Decimal("112"), + u"965 Bennington St", + Decimal("2128"), + Decimal("42.386"), + Decimal("-71.008"), + u"Citizens Connect App", + ), + ] # noqa + print(self._get_column_names(Session, "test1")) + assert self._get_column_names(Session, "test1") == [ + u"_id", + u"_full_text", + u"CASE_ENQUIRY_ID", + u"open_dt", + u"target_dt", + u"closed_dt", + u"OnTime_Status", + u"CASE_STATUS", + u"CLOSURE_REASON", + u"CASE_TITLE", + u"SUBJECT", + u"REASON", + u"TYPE", + u"QUEUE", + u"Department", + u"SubmittedPhoto", + u"ClosedPhoto", + u"Location", + u"Fire_district", + u"pwd_district", + u"city_council_district", + u"police_district", + u"neighborhood", + u"neighborhood_services_district", + u"ward", + u"precinct", + u"LOCATION_STREET_NAME", + u"LOCATION_ZIPCODE", + u"Latitude", + u"Longitude", + u"Source", + ] # noqa + print(self._get_column_types(Session, "test1")) + assert self._get_column_types(Session, "test1") == [ + u"int4", + u"tsvector", + u"numeric", + u"timestamp", + u"timestamp", + u"text", + u"text", + u"text", + u"text", + u"text", + u"text", + u"text", + u"text", + u"text", + u"text", + u"text", + u"text", + u"text", + u"numeric", + u"numeric", + u"numeric", + u"text", + u"text", + u"numeric", + u"text", + u"numeric", + u"text", + u"numeric", + u"numeric", + u"numeric", + u"text", + ] # noqa def test_no_entries(self): - csv_filepath = get_sample_filepath('no_entries.csv') + csv_filepath = get_sample_filepath("no_entries.csv") # no datastore table is created - we need to except, or else # datastore_active will be set on a non-existent datastore table - resource_id = 'test1' + resource_id = "test1" factories.Resource(id=resource_id) - with assert_raises(LoaderError): - loader.load_table(csv_filepath, resource_id=resource_id, - mimetype='csv', logger=PrintLogger()) + with pytest.raises(LoaderError): + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=PrintLogger(), + ) diff --git a/ckanext/xloader/tests/test_plugin.py b/ckanext/xloader/tests/test_plugin.py index 604fb725..fd7b9fba 100644 --- a/ckanext/xloader/tests/test_plugin.py +++ b/ckanext/xloader/tests/test_plugin.py @@ -1,279 +1,66 @@ import datetime - -# from nose.tools import eq_ -# import mock - +import pytest +import mock import ckan.plugins as p from ckan.tests import helpers, factories +from ckan.logic import _actions +@pytest.mark.usefixtures("clean_db", "with_plugins") +@pytest.mark.ckan_config("ckan.plugins", "datastore xloader") class TestNotify(object): + def test_submit_on_resource_create(self, monkeypatch): + func = mock.Mock() + monkeypatch.setitem(_actions, "xloader_submit", func) - @classmethod - def setup_class(cls): - if not p.plugin_loaded('datastore'): - p.load('datastore') - if not p.plugin_loaded('xloader'): - p.load('xloader') - - helpers.reset_db() - - @classmethod - def teardown_class(cls): - - p.unload('xloader') - p.unload('datastore') - - helpers.reset_db() - - @helpers.mock_action('xloader_submit') - def test_submit_on_resource_create(self, mock_xloader_submit): dataset = factories.Dataset() - assert not mock_xloader_submit.called + assert not func.called + + helpers.call_action( + "resource_create", + {}, + package_id=dataset["id"], + url="http://example.com/file.csv", + format="CSV", + ) - helpers.call_action('resource_create', {}, - package_id=dataset['id'], - url='http://example.com/file.csv', - format='CSV') + assert func.called - assert 
mock_xloader_submit.called + def test_submit_when_url_changes(self, monkeypatch): + func = mock.Mock() + monkeypatch.setitem(_actions, "xloader_submit", func) - @helpers.mock_action('xloader_submit') - def test_submit_when_url_changes(self, mock_xloader_submit): dataset = factories.Dataset() - resource = helpers.call_action('resource_create', {}, - package_id=dataset['id'], - url='http://example.com/file.pdf', - ) + resource = helpers.call_action( + "resource_create", + {}, + package_id=dataset["id"], + url="http://example.com/file.pdf", + ) - assert not mock_xloader_submit.called # because of the format being PDF + assert not func.called # because of the format being PDF - helpers.call_action('resource_update', {}, - id=resource['id'], - package_id=dataset['id'], - url='http://example.com/file.csv', - format='CSV' - ) + helpers.call_action( + "resource_update", + {}, + id=resource["id"], + package_id=dataset["id"], + url="http://example.com/file.csv", + format="CSV", + ) - assert mock_xloader_submit.called + assert func.called def _pending_task(self, resource_id): return { - 'entity_id': resource_id, - 'entity_type': 'resource', - 'task_type': 'xloader', - 'last_updated': str(datetime.datetime.utcnow()), - 'state': 'pending', - 'key': 'xloader', - 'value': '{}', - 'error': '{}', + "entity_id": resource_id, + "entity_type": "resource", + "task_type": "xloader", + "last_updated": str(datetime.datetime.utcnow()), + "state": "pending", + "key": "xloader", + "value": "{}", + "error": "{}", } - - # @helpers.mock_action('xloader_submit') - # def test_does_not_submit_while_ongoing_job(self, mock_xloader_submit): - # dataset = factories.Dataset() - - # resource = helpers.call_action('resource_create', {}, - # package_id=dataset['id'], - # url='http://example.com/file.CSV', - # format='CSV' - # ) - - # assert mock_xloader_submit.called - # eq_(len(mock_xloader_submit.mock_calls), 1) - - # # Create a task with a state pending to mimic an ongoing job - # # on the xloader - # helpers.call_action('task_status_update', {}, - # **self._pending_task(resource['id'])) - - # # Update the resource - # helpers.call_action('resource_update', {}, - # id=resource['id'], - # package_id=dataset['id'], - # url='http://example.com/file.csv', - # format='CSV', - # description='Test', - # ) - # # Not called - # eq_(len(mock_xloader_submit.mock_calls), 1) - - # @helpers.mock_action('xloader_submit') - # def test_resubmits_if_url_changes_in_the_meantime( - # self, mock_xloader_submit): - # dataset = factories.Dataset() - - # resource = helpers.call_action('resource_create', {}, - # package_id=dataset['id'], - # url='http://example.com/file.csv', - # format='CSV' - # ) - - # assert mock_xloader_submit.called - # eq_(len(mock_xloader_submit.mock_calls), 1) - - # # Create a task with a state pending to mimic an ongoing job - # # on the xloader - # task = helpers.call_action('task_status_update', {}, - # **self._pending_task(resource['id'])) - - # # Update the resource, set a new URL - # helpers.call_action('resource_update', {}, - # id=resource['id'], - # package_id=dataset['id'], - # url='http://example.com/another.file.csv', - # format='CSV', - # ) - # # Not called - # eq_(len(mock_xloader_submit.mock_calls), 1) - - # # Call xloader_hook with state complete, to mock the xloader - # # finishing the job and telling CKAN - # data_dict = { - # 'metadata': { - # 'resource_id': resource['id'], - # 'original_url': 'http://example.com/file.csv', - # 'task_created': task['last_updated'], - # }, - # 'status': 'complete', - # } - # 
-
-    # @helpers.mock_action('xloader_submit')
-    # def test_does_not_submit_while_ongoing_job(self, mock_xloader_submit):
-    #     dataset = factories.Dataset()
-
-    #     resource = helpers.call_action('resource_create', {},
-    #                                    package_id=dataset['id'],
-    #                                    url='http://example.com/file.CSV',
-    #                                    format='CSV'
-    #                                    )
-
-    #     assert mock_xloader_submit.called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    #     # Create a task with a state pending to mimic an ongoing job
-    #     # on the xloader
-    #     helpers.call_action('task_status_update', {},
-    #                         **self._pending_task(resource['id']))
-
-    #     # Update the resource
-    #     helpers.call_action('resource_update', {},
-    #                         id=resource['id'],
-    #                         package_id=dataset['id'],
-    #                         url='http://example.com/file.csv',
-    #                         format='CSV',
-    #                         description='Test',
-    #                         )
-    #     # Not called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    # @helpers.mock_action('xloader_submit')
-    # def test_resubmits_if_url_changes_in_the_meantime(
-    #         self, mock_xloader_submit):
-    #     dataset = factories.Dataset()
-
-    #     resource = helpers.call_action('resource_create', {},
-    #                                    package_id=dataset['id'],
-    #                                    url='http://example.com/file.csv',
-    #                                    format='CSV'
-    #                                    )
-
-    #     assert mock_xloader_submit.called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    #     # Create a task with a state pending to mimic an ongoing job
-    #     # on the xloader
-    #     task = helpers.call_action('task_status_update', {},
-    #                                **self._pending_task(resource['id']))
-
-    #     # Update the resource, set a new URL
-    #     helpers.call_action('resource_update', {},
-    #                         id=resource['id'],
-    #                         package_id=dataset['id'],
-    #                         url='http://example.com/another.file.csv',
-    #                         format='CSV',
-    #                         )
-    #     # Not called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    #     # Call xloader_hook with state complete, to mock the xloader
-    #     # finishing the job and telling CKAN
-    #     data_dict = {
-    #         'metadata': {
-    #             'resource_id': resource['id'],
-    #             'original_url': 'http://example.com/file.csv',
-    #             'task_created': task['last_updated'],
-    #         },
-    #         'status': 'complete',
-    #     }
-    #     helpers.call_action('xloader_hook', {}, **data_dict)
-
-    #     # xloader_submit was called again
-    #     eq_(len(mock_xloader_submit.mock_calls), 2)
-
-    # @helpers.mock_action('xloader_submit')
-    # def test_resubmits_if_upload_changes_in_the_meantime(
-    #         self, mock_xloader_submit):
-    #     dataset = factories.Dataset()
-
-    #     resource = helpers.call_action('resource_create', {},
-    #                                    package_id=dataset['id'],
-    #                                    url='http://example.com/file.csv',
-    #                                    format='CSV'
-    #                                    )
-
-    #     assert mock_xloader_submit.called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    #     # Create a task with a state pending to mimic an ongoing job
-    #     # on the xloader
-    #     task = helpers.call_action('task_status_update', {},
-    #                                **self._pending_task(resource['id']))
-
-    #     # Update the resource, set a new last_modified (changes on file upload)
-    #     helpers.call_action(
-    #         'resource_update', {},
-    #         id=resource['id'],
-    #         package_id=dataset['id'],
-    #         url='http://example.com/file.csv',
-    #         format='CSV',
-    #         last_modified=datetime.datetime.utcnow().isoformat()
-    #     )
-    #     # Not called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    #     # Call xloader_hook with state complete, to mock the xloader
-    #     # finishing the job and telling CKAN
-    #     data_dict = {
-    #         'metadata': {
-    #             'resource_id': resource['id'],
-    #             'original_url': 'http://example.com/file.csv',
-    #             'task_created': task['last_updated'],
-    #         },
-    #         'status': 'complete',
-    #     }
-    #     helpers.call_action('xloader_hook', {}, **data_dict)
-
-    #     # xloader_submit was called again
-    #     eq_(len(mock_xloader_submit.mock_calls), 2)
-
-    # @helpers.mock_action('xloader_submit')
-    # def test_does_not_resubmit_if_a_resource_field_changes_in_the_meantime(
-    #         self, mock_xloader_submit):
-    #     dataset = factories.Dataset()
-
-    #     resource = helpers.call_action('resource_create', {},
-    #                                    package_id=dataset['id'],
-    #                                    url='http://example.com/file.csv',
-    #                                    format='CSV'
-    #                                    )
-
-    #     assert mock_xloader_submit.called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    #     # Create a task with a state pending to mimic an ongoing job
-    #     # on the xloader
-    #     task = helpers.call_action('task_status_update', {},
-    #                                **self._pending_task(resource['id']))
-
-    #     # Update the resource, set a new description
-    #     helpers.call_action('resource_update', {},
-    #                         id=resource['id'],
-    #                         package_id=dataset['id'],
-    #                         url='http://example.com/file.csv',
-    #                         format='CSV',
-    #                         description='Test',
-    #                         )
-    #     # Not called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    #     # Call xloader_hook with state complete, to mock the xloader
-    #     # finishing the job and telling CKAN
-    #     data_dict = {
-    #         'metadata': {
-    #             'resource_id': resource['id'],
-    #             'original_url': 'http://example.com/file.csv',
-    #             'task_created': task['last_updated'],
-    #         },
-    #         'status': 'complete',
-    #     }
-    #     helpers.call_action('xloader_hook', {}, **data_dict)
-
-    #     # Not called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    # @helpers.mock_action('xloader_submit')
-    # def test_does_not_resubmit_if_a_dataset_field_changes_in_the_meantime(
-    #         self, mock_xloader_submit):
-    #     dataset = factories.Dataset()
-
-    #     resource = helpers.call_action('resource_create', {},
-    #                                    package_id=dataset['id'],
-    #                                    url='http://example.com/file.csv',
-    #                                    format='CSV'
-    #                                    )
-
-    #     assert mock_xloader_submit.called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    #     # Create a task with a state pending to mimic an ongoing job
-    #     # on the xloader
-    #     task = helpers.call_action('task_status_update', {},
-    #                                **self._pending_task(resource['id']))
-
-    #     # Update the parent dataset
-    #     helpers.call_action('package_update', {},
-    #                         id=dataset['id'],
-    #                         notes='Test notes',
-    #                         resources=[resource]
-    #                         )
-    #     # Not called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
-
-    #     # Call xloader_hook with state complete, to mock the xloader
-    #     # finishing the job and telling CKAN
-    #     data_dict = {
-    #         'metadata': {
-    #             'resource_id': resource['id'],
-    #             'original_url': 'http://example.com/file.csv',
-    #             'task_created': task['last_updated'],
-    #         },
-    #         'status': 'complete',
-    #     }
-    #     helpers.call_action('xloader_hook', {}, **data_dict)
-
-    #     # Not called
-    #     eq_(len(mock_xloader_submit.mock_calls), 1)
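The long-disabled resubmission tests above (do not resubmit while a job is pending; resubmit afterwards only if the URL or upload changed) are deleted outright rather than ported. For reference, if the first scenario were ever revived under the new pytest style, a port might look roughly like this hypothetical sketch, reusing `_pending_task` and the same module-level `mock`, `_actions`, `factories`, and `helpers` as the live tests:

```python
    def test_does_not_submit_while_ongoing_job(self, monkeypatch):
        func = mock.Mock()
        monkeypatch.setitem(_actions, "xloader_submit", func)

        dataset = factories.Dataset()
        resource = helpers.call_action(
            "resource_create",
            {},
            package_id=dataset["id"],
            url="http://example.com/file.CSV",
            format="CSV",
        )
        # Creating a CSV resource triggers exactly one submission
        assert len(func.mock_calls) == 1

        # A pending task_status row mimics an xloader job that is still running
        helpers.call_action(
            "task_status_update", {}, **self._pending_task(resource["id"])
        )

        # A metadata-only update must not trigger a second submission
        helpers.call_action(
            "resource_update",
            {},
            id=resource["id"],
            package_id=dataset["id"],
            url="http://example.com/file.csv",
            format="CSV",
            description="Test",
        )
        assert len(func.mock_calls) == 1
```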
diff --git a/ckanext/xloader/tests/util.py b/ckanext/xloader/tests/util.py
deleted file mode 100644
index 1ef12c31..00000000
--- a/ckanext/xloader/tests/util.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import sqlalchemy
-import sqlalchemy.orm as orm
-import os
-
-from ckan.tests import helpers
-from ckanext.datastore.tests import helpers as datastore_helpers
-from ckanext.xloader.loader import get_write_engine
-
-__location__ = os.path.realpath(
-    os.path.join(os.getcwd(), os.path.dirname(__file__))
-)
-
-
-class PluginsMixin(object):
-    @classmethod
-    def setup_class(cls):
-        import ckan.plugins as p
-        for plugin in getattr(cls, '_load_plugins', []):
-            if not p.plugin_loaded(plugin):
-                p.load(plugin)
-        helpers.reset_db()
-        reset_datastore_db()
-        add_full_text_trigger_function()
-
-    @classmethod
-    def teardown_class(cls):
-        import ckan.plugins as p
-        for plugin in reversed(getattr(cls, '_load_plugins', [])):
-            p.unload(plugin)
-        helpers.reset_db()
-
-
-def reset_datastore_db():
-    engine = get_write_engine()
-    Session = orm.scoped_session(orm.sessionmaker(bind=engine))
-    datastore_helpers.clear_db(Session)
-
-
-def add_full_text_trigger_function():
-    engine = get_write_engine()
-    Session = orm.scoped_session(orm.sessionmaker(bind=engine))
-    c = Session.connection()
-    with open(os.path.join(__location__, '..', '..', '..', 'full_text_function.sql'), 'r') as full_text_sql:
-        c.execute(sqlalchemy.text(full_text_sql.read()))
-    Session.commit()
-    Session.remove()
diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py
new file mode 100644
index 00000000..962ca16a
--- /dev/null
+++ b/ckanext/xloader/utils.py
@@ -0,0 +1,42 @@
+import ckan.plugins as p
+
+
+def resource_data(id, resource_id):
+
+    if p.toolkit.request.method == "POST":
+        try:
+            p.toolkit.c.pkg_dict = p.toolkit.get_action("xloader_submit")(
+                None, {"resource_id": resource_id}
+            )
+        except p.toolkit.ValidationError:
+            pass
+
+        return p.toolkit.redirect_to(
+            "xloader.resource_data", id=id, resource_id=resource_id
+        )
+
+    try:
+        p.toolkit.c.pkg_dict = p.toolkit.get_action("package_show")(None, {"id": id})
+        p.toolkit.c.resource = p.toolkit.get_action("resource_show")(
+            None, {"id": resource_id}
+        )
+    except (p.toolkit.ObjectNotFound, p.toolkit.NotAuthorized):
+        return p.toolkit.abort(404, p.toolkit._("Resource not found"))
+
+    try:
+        xloader_status = p.toolkit.get_action("xloader_status")(
+            None, {"resource_id": resource_id}
+        )
+    except p.toolkit.ObjectNotFound:
+        xloader_status = {}
+    except p.toolkit.NotAuthorized:
+        return p.toolkit.abort(403, p.toolkit._("Not authorized to see this page"))
+
+    return p.toolkit.render(
+        "xloader/resource_data.html",
+        extra_vars={
+            "status": xloader_status,
+            "resource": p.toolkit.c.resource,
+            "pkg_dict": p.toolkit.c.pkg_dict,
+        },
+    )
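The view logic moves into the framework-agnostic `utils.resource_data` so it can be shared between the new Flask blueprint in `views.py` (next file) and any controller kept for older CKAN versions. Registering the blueprint with CKAN would typically go through the `IBlueprint` plugin interface, along these lines (a sketch only; the plugin class name is illustrative and not taken from this diff):

```python
import ckan.plugins as p

from ckanext.xloader import views


class XLoaderPlugin(p.SingletonPlugin):
    p.implements(p.IBlueprint)

    # IBlueprint: newer CKAN versions accept a single Blueprint
    # or a list of Blueprints from get_blueprint()
    def get_blueprint(self):
        return views.get_blueprints()
```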
diff --git a/ckanext/xloader/views.py b/ckanext/xloader/views.py
new file mode 100644
index 00000000..198de320
--- /dev/null
+++ b/ckanext/xloader/views.py
@@ -0,0 +1,15 @@
+from flask import Blueprint
+
+import ckanext.xloader.utils as utils
+
+
+xloader = Blueprint("xloader", __name__)
+
+
+def get_blueprints():
+    return [xloader]
+
+
+@xloader.route("/dataset/<id>/resource_data/<resource_id>", methods=("GET", "POST"))
+def resource_data(id, resource_id):
+    return utils.resource_data(id, resource_id)
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 00000000..8258f311
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+pytest_plugins = [
+    u'ckanext.xloader.tests.ckan_setup',
+    u'ckanext.xloader.tests.fixtures',
+]
diff --git a/dev-requirements.txt b/dev-requirements.txt
index ceb025b1..0078e7d8 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,2 +1,4 @@
 responses==0.8.1
 flake8
+pytest~=4.6.5
+pytest-cov~=2.7.1
diff --git a/requirements.txt b/requirements.txt
index 2089ca45..d0cc9627 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 messytables==0.15.2
 Unidecode==1.0.22
+six
diff --git a/setup.cfg b/setup.cfg
index 6787b0fb..5e2fb6fd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -18,4 +18,12 @@ previous = true
 [compile_catalog]
 domain = ckanext-xloader
 directory = ckanext/xloader/i18n
-statistics = true
\ No newline at end of file
+statistics = true
+
+[tool:pytest]
+
+filterwarnings =
+    ignore::sqlalchemy.exc.SADeprecationWarning
+    ignore::sqlalchemy.exc.SAWarning
+    ignore::DeprecationWarning
+addopts = --pdbcls=IPython.terminal.debugger:TerminalPdb
\ No newline at end of file
diff --git a/test.ini b/test.ini
index d961f439..1415d37f 100644
--- a/test.ini
+++ b/test.ini
@@ -11,7 +11,7 @@ port = 5000
 
 [app:main]
 use = config:../ckan/test-core.ini
-solr_url = http://127.0.0.1:8983/solr
+# solr_url = http://127.0.0.1:8983/solr
 
 # Insert any custom config settings to be used when running your extension's
 # tests here.
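With `conftest.py` registering the shared `ckan_setup`/`fixtures` pytest plugins and the new `[tool:pytest]` section silencing the SQLAlchemy deprecation noise, the suite is presumably run along the lines of `pytest --ckan-ini=test.ini --cov=ckanext.xloader ckanext/xloader/tests` (the `--ckan-ini` option is provided by CKAN's pytest plugin). One caveat worth flagging: the `addopts = --pdbcls=IPython.terminal.debugger:TerminalPdb` line makes IPython a de-facto test dependency, yet it is not added to `dev-requirements.txt` in this diff.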