Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for unicode header names #111

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,12 @@ Configuration:
# not be loaded into the datastore.
ckanext.xloader.max_excerpt_lines = 100

# If set to True, unicode characters are allowed in header names.
# If set to False (default), header characters are transliterated
# to ASCII using the unidecode library.
ckanext.xloader.unicode_headers = False


------------------------
Developer installation
------------------------
Expand Down
39 changes: 26 additions & 13 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def get_write_engine():
MAX_COLUMN_LENGTH = 63


def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None, unicode_headers=None):
OriHoch marked this conversation as resolved.
Show resolved Hide resolved
'''Loads a CSV into DataStore. Does not create the indexes.'''

# use messytables to determine the header row
Expand All @@ -64,7 +64,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
header_offset, headers = messytables.headers_guess(row_set.sample)

# Some headers might have been converted from strings to floats and such.
headers = encode_headers(headers)
headers = encode_headers(headers, unicode_headers=unicode_headers)

# Guess the delimiter used in the file
with open(csv_filepath, 'r') as f:
Expand Down Expand Up @@ -196,6 +196,10 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
# 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
# the superuser issue. <-- picked

if unicode_headers or config.get('ckanext.xloader.unicode_headers'):
column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers])
else:
column_names = ', '.join(['"{}"'.format(h) for h in headers])
raw_connection = engine.raw_connection()
try:
cur = raw_connection.cursor()
Expand All @@ -211,8 +215,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
" ENCODING '{encoding}');"
.format(
resource_id=resource_id,
column_names=', '.join(['"{}"'.format(h)
for h in headers]),
column_names=column_names,
delimiter=delimiter,
encoding='UTF8',
),
Expand All @@ -236,7 +239,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('...copying done')

logger.info('Creating search index...')
_populate_fulltext(connection, resource_id, fields=fields)

if unicode_headers or config.get('ckanext.xloader.unicode_headers'):
encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields]
else:
encoded_fields = fields

_populate_fulltext(connection, resource_id, fields=encoded_fields)
logger.info('...search index created')

return fields
Expand All @@ -259,7 +268,7 @@ def create_column_indexes(fields, resource_id, logger):
logger.info('...column indexes created.')


def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None, unicode_headers=None):
'''Loads an Excel file (or other tabular data recognized by messytables)
into Datastore and creates indexes.

Expand Down Expand Up @@ -299,7 +308,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
for f in existing.get('fields', []) if 'info' in f)

# Some headers might have been converted from strings to floats and such.
headers = encode_headers(headers)
headers = encode_headers(headers, unicode_headers=unicode_headers)

row_set.register_processor(messytables.headers_processor(headers))
row_set.register_processor(messytables.offset_processor(offset + 1))
Expand Down Expand Up @@ -400,13 +409,17 @@ def get_types():
return _TYPES, TYPE_MAPPING


def encode_headers(headers, unicode_headers=None):
    '''Return the headers as a list of unicode strings.

    Some headers may have been guessed by messytables as non-string
    values (e.g. floats); those are converted via ``str()`` first.

    :param headers: iterable of raw header values
    :param unicode_headers: if truthy, unicode characters are kept
        as-is; if falsy/None, the ``ckanext.xloader.unicode_headers``
        config option is consulted. When unicode headers are disabled,
        characters are transliterated to ASCII with unidecode.
    :returns: list of header names
    '''
    # ``unicode`` only exists on Python 2; fall back to ``str`` on 3.
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str

    if not unicode_headers:
        # Config values are plain strings, so parse truthiness
        # explicitly -- a configured value of "False" must not count
        # as enabled.
        raw = config.get('ckanext.xloader.unicode_headers', 'false')
        unicode_headers = str(raw).strip().lower() in (
            'true', '1', 'yes', 'on')

    decode_func = text_type if unicode_headers else unidecode

    encoded_headers = []
    for header in headers:
        try:
            encoded_headers.append(decode_func(header))
        except AttributeError:
            # Non-string header (e.g. a float) -- stringify first.
            encoded_headers.append(decode_func(str(header)))

    return encoded_headers

Expand Down Expand Up @@ -514,7 +527,7 @@ def _populate_fulltext(connection, resource_id, fields):
(text/numeric/timestamp)
'''
sql = \
u'''
'''
UPDATE {table}
SET _full_text = to_tsvector({cols});
'''.format(
Expand Down Expand Up @@ -560,8 +573,8 @@ def _create_fulltext_trigger(connection, resource_id):
def identifier(s):
    '''Return *s* quoted as a PostgreSQL identifier (always unicode).

    Embedded double quotes are doubled per SQL quoting rules, NUL
    characters are stripped (PostgreSQL rejects them), and "%" is
    escaped to "%%" because connection.execute would otherwise treat
    it as a bind-parameter marker.
    '''
    # Explicit u'' literals keep the result unicode on Python 2,
    # consistent with the rest of this module's unicode handling.
    cleaned = s.replace(u'"', u'""').replace(u'\0', u'').replace(u'%', u'%%')
    return u'"' + cleaned + u'"'


def literal_string(s):
Expand Down
1 change: 1 addition & 0 deletions ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

class XLoaderFormats(object):
formats = None

@classmethod
def is_it_an_xloader_format(cls, format_):
if cls.formats is None:
Expand Down
7 changes: 7 additions & 0 deletions ckanext/xloader/tests/samples/hebrew_sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
זיהוי,שם,תא דיווח,שימוש,פרמטר,סוג תקן מי שתייה,ערך תקן,תאריך דיגום אחרון,ריכוז אחרון,אחוז מתקן מי השתיה
229312,פ בית העמק עמקה 3,360,פרטי,Cl,תקן ישראלי מותר,400,20/09/2018,44.85,11.20
229312,פ בית העמק עמקה 3,360,פרטי,NO3,תקן ישראלי מותר,70,20/09/2018,32.90,47.00
229319,פ כברי החוגים,350,פרטי,Cl,תקן ישראלי מותר,400,08/08/2019,44.80,11.20
229319,פ כברי החוגים,350,פרטי,NO3,תקן ישראלי מותר,70,08/08/2019,49.50,70.70
229323,פ לוחמי הגיטאות דרור,330,פרטי,Cl,תקן ישראלי מותר,400,04/09/2018,846.55,211.60
229323,פ לוחמי הגיטאות דרור,330,פרטי,NO3,תקן ישראלי מותר,70,04/09/2018,22.50,32.10
Binary file added ckanext/xloader/tests/samples/hebrew_sample.xlsx
Binary file not shown.
70 changes: 68 additions & 2 deletions ckanext/xloader/tests/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ def _get_records(self, table_name, limit=None,
if col != '_full_text')
else:
cols = '*'
sql = 'SELECT {cols} FROM "{table_name}"' \
sql = u'SELECT {cols} FROM "{table_name}"' \
.format(cols=cols, table_name=table_name)
if limit is not None:
sql += ' LIMIT {}'.format(limit)
sql += u' LIMIT {}'.format(limit)
results = c.execute(sql)
return results.fetchall()

Expand Down Expand Up @@ -344,6 +344,38 @@ def test_column_names(self):
assert_equal(self._get_records('test1')[0],
(1, u'2011-01-01', u'1', u'Galway'))

def test_unicode_column_names(self):
    '''Hebrew (non-ASCII) header names survive load_csv intact when
    unicode_headers=True: both the stored records and the datastore
    column names keep the original unicode text.'''
    csv_filepath = get_sample_filepath('hebrew_sample.csv')
    resource_id = 'test_hebrew'
    factories.Resource(id=resource_id)
    loader.load_csv(csv_filepath, resource_id=resource_id,
                    mimetype='text/csv', logger=PrintLogger(),
                    unicode_headers=True)
    records = self._get_records('test_hebrew')
    # print() works on both Python 2 and 3 (py2 treats it as a
    # parenthesised expression).
    print(records)
    assert_equal(
        records[0],
        (1, u'229312', u'פ בית העמק עמקה 3', u'360', u'פרטי', u'Cl',
         u'תקן ישראלי מותר', u'400', u'20/09/2018',
         u'44.85', u'11.20')
    )
    print(self._get_column_names('test_hebrew'))
    assert_equal(
        self._get_column_names('test_hebrew'),
        [
            u'_id',
            u'_full_text',
            u'זיהוי',
            u'שם',
            u'תא דיווח',
            u'שימוש',
            u'פרמטר',
            u'סוג תקן מי שתייה',
            u'ערך תקן',
            u'תאריך דיגום אחרון',
            u'ריכוז אחרון',
            u'אחוז מתקן מי השתיה'
        ]
    )


class TestLoadUnhandledTypes(TestLoadBase):

Expand Down Expand Up @@ -478,3 +510,37 @@ def test_no_entries(self):
with assert_raises(LoaderError):
loader.load_table(csv_filepath, resource_id=resource_id,
mimetype='csv', logger=PrintLogger())

def test_hebrew_unicode_headers(self):
    '''Hebrew (non-ASCII) header names survive load_table (xlsx path)
    when unicode_headers=True; values are type-guessed (Decimal,
    datetime) while column names keep the original unicode text.'''
    xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx')
    resource_id = 'hebrew_sample_xlsx'
    factories.Resource(id=resource_id)
    loader.load_table(xlsx_filepath, resource_id=resource_id,
                      mimetype='xlsx', logger=PrintLogger(),
                      unicode_headers=True)
    records = self._get_records('hebrew_sample_xlsx')
    # print() works on both Python 2 and 3 (py2 treats it as a
    # parenthesised expression).
    print(records)
    assert_equal(
        records[0],
        (1, Decimal('229312'), u'פ בית העמק עמקה 3', Decimal('360'),
         u'פרטי', u'Cl', u'תקן ישראלי מותר',
         Decimal('400'), datetime.datetime(2018, 9, 20, 0, 0),
         Decimal('44.85000000000000142108547152020037174224853515625'),
         Decimal('11.199999999999999289457264239899814128875732421875'))
    )
    print(self._get_column_names('hebrew_sample_xlsx'))
    assert_equal(
        self._get_column_names('hebrew_sample_xlsx'),
        [
            u'_id',
            u'_full_text',
            u'זיהוי',
            u'שם',
            u'תא דיווח',
            u'שימוש',
            u'פרמטר',
            u'סוג תקן מי שתייה',
            u'ערך תקן',
            u'תאריך דיגום אחרון',
            u'ריכוז אחרון',
            u'אחוז מתקן מי השתיה'
        ]
    )