diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml index 717657bd..73120218 100644 --- a/ckanext/xloader/config_declaration.yaml +++ b/ckanext/xloader/config_declaration.yaml @@ -46,6 +46,15 @@ groups: type: bool required: false legacy_key: ckanext.xloader.just_load_with_messytables + - key: ckanext.xloader.strict_type_guessing + default: True + example: False + description: | + Use with ckanext.xloader.use_type_guessing to set strict true or false + for type guessing. If set to False, the types will always fallback to string type. + + Strict means that a type will not be guessed if parsing fails for a single cell in the column. + type: bool - key: ckanext.xloader.max_type_guessing_length default: 0 example: 100000 diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 85be3f34..0a583523 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -3,6 +3,7 @@ import datetime import itertools +from six import text_type as str, binary_type import os import os.path import tempfile @@ -376,7 +377,9 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): skip_rows = list(range(1, header_offset + 2)) TYPES, TYPE_MAPPING = get_types() - types = type_guess(stream.sample[1:], types=TYPES, strict=True) + strict_guessing = p.toolkit.asbool( + config.get('ckanext.xloader.strict_type_guessing', True)) + types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing) # override with types user requested if existing_info: @@ -454,12 +457,17 @@ def row_iterator(): _TYPE_MAPPING = { + "": 'text', "": 'text', + "": 'text', "": 'text', "": 'numeric', "": 'numeric', "": 'numeric', + "": 'timestamp', "": 'text', + "": 'text', + "": 'text', "": 'text', "": 'numeric', "": 'numeric', @@ -468,7 +476,7 @@ def row_iterator(): def get_types(): - _TYPES = [int, bool, str, datetime.datetime, float, Decimal] + _TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal] TYPE_MAPPING = config.get('TYPE_MAPPING', _TYPE_MAPPING) return _TYPES, TYPE_MAPPING diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py index 073a8091..db8ff06f 100644 --- a/ckanext/xloader/utils.py +++ b/ckanext/xloader/utils.py @@ -3,6 +3,8 @@ import json import datetime +from six import text_type as str, binary_type + from ckan import model from ckan.lib import search from collections import defaultdict @@ -24,6 +26,8 @@ "application/vnd.oasis.opendocument.spreadsheet", ] +from .job_exceptions import JobError + class XLoaderFormats(object): formats = None @@ -184,7 +188,7 @@ def headers_guess(rows, tolerance=1): return 0, [] -TYPES = [int, bool, str, datetime.datetime, float, Decimal] +TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal] def type_guess(rows, types=TYPES, strict=False): @@ -245,6 +249,8 @@ def type_guess(rows, types=TYPES, strict=False): # element in case of a tie # See: http://stackoverflow.com/a/6783101/214950 guesses_tuples = [(t, guess[t]) for t in types if t in guess] + if not guesses_tuples: + raise JobError('Failed to guess types') _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) return _columns