Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PY2 & PY3 String/Binary Fixes #203

Merged
merged 3 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions ckanext/xloader/config_declaration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,15 @@ groups:
type: bool
required: false
legacy_key: ckanext.xloader.just_load_with_messytables
- key: ckanext.xloader.strict_type_guessing
default: True
example: False
description: |
Use with ckanext.xloader.use_type_guessing to turn strict type guessing
on or off. If set to False, the types will always fall back to string type.
JVickery-TBS marked this conversation as resolved.
Show resolved Hide resolved

Strict means that a type will not be guessed if parsing fails for a single cell in the column.
type: bool
- key: ckanext.xloader.max_type_guessing_length
default: 0
example: 100000
Expand Down
12 changes: 10 additions & 2 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import datetime
import itertools
from six import text_type as str, binary_type
import os
import os.path
import tempfile
Expand Down Expand Up @@ -376,7 +377,9 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
skip_rows = list(range(1, header_offset + 2))

TYPES, TYPE_MAPPING = get_types()
types = type_guess(stream.sample[1:], types=TYPES, strict=True)
strict_guessing = p.toolkit.asbool(
config.get('ckanext.xloader.strict_type_guessing', True))
types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing)

# override with types user requested
if existing_info:
Expand Down Expand Up @@ -454,12 +457,17 @@ def row_iterator():


_TYPE_MAPPING = {
"<type 'str'>": 'text',
"<type 'unicode'>": 'text',
"<type 'bytes'>": 'text',
"<type 'bool'>": 'text',
"<type 'int'>": 'numeric',
"<type 'float'>": 'numeric',
"<class 'decimal.Decimal'>": 'numeric',
"<type 'datetime.datetime'>": 'timestamp',
"<class 'str'>": 'text',
"<class 'unicode'>": 'text',
"<class 'bytes'>": 'text',
"<class 'bool'>": 'text',
"<class 'int'>": 'numeric',
"<class 'float'>": 'numeric',
Expand All @@ -468,7 +476,7 @@ def row_iterator():


def get_types():
_TYPES = [int, bool, str, datetime.datetime, float, Decimal]
_TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal]
TYPE_MAPPING = config.get('TYPE_MAPPING', _TYPE_MAPPING)
return _TYPES, TYPE_MAPPING

Expand Down
8 changes: 7 additions & 1 deletion ckanext/xloader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import json
import datetime

from six import text_type as str, binary_type

from ckan import model
from ckan.lib import search
from collections import defaultdict
Expand All @@ -24,6 +26,8 @@
"application/vnd.oasis.opendocument.spreadsheet",
]

from .job_exceptions import JobError


class XLoaderFormats(object):
formats = None
Expand Down Expand Up @@ -184,7 +188,7 @@ def headers_guess(rows, tolerance=1):
return 0, []


TYPES = [int, bool, str, datetime.datetime, float, Decimal]
TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal]


def type_guess(rows, types=TYPES, strict=False):
Expand Down Expand Up @@ -245,6 +249,8 @@ def type_guess(rows, types=TYPES, strict=False):
# element in case of a tie
# See: http://stackoverflow.com/a/6783101/214950
guesses_tuples = [(t, guess[t]) for t in types if t in guess]
if not guesses_tuples:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this handle files with only a header and no data rows?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ThrawnCA can you add a test for a header-only upload, please?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@duttonw I can't; the pull request doesn't originate from our repo.

raise JobError('Failed to guess types')
_columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0])
return _columns

Expand Down
Loading