Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for unicode header names #111

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,12 @@ Configuration:
# not be loaded into the datastore.
ckanext.xloader.max_excerpt_lines = 100

# If set to True, unicode characters are allowed in header names.
# If set to False (default), header characters are transliterated
# to ASCII using the unidecode library.
ckanext.xloader.unicode_headers = False


------------------------
Developer installation
------------------------
Expand Down
39 changes: 26 additions & 13 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def get_write_engine():
MAX_COLUMN_LENGTH = 63


def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None, unicode_headers=None):
OriHoch marked this conversation as resolved.
Show resolved Hide resolved
'''Loads a CSV into DataStore. Does not create the indexes.'''

# use messytables to determine the header row
Expand All @@ -64,7 +64,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
header_offset, headers = messytables.headers_guess(row_set.sample)

# Some headers might have been converted from strings to floats and such.
headers = encode_headers(headers)
headers = encode_headers(headers, unicode_headers=unicode_headers)

# Guess the delimiter used in the file
with open(csv_filepath, 'r') as f:
Expand Down Expand Up @@ -196,6 +196,10 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
# 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
# the superuser issue. <-- picked

if unicode_headers or config.get('ckanext.xloader.unicode_headers'):
column_names = ', '.join(['"{}"'.format(h.encode('UTF8')) for h in headers])
else:
column_names = ', '.join(['"{}"'.format(h) for h in headers])
raw_connection = engine.raw_connection()
try:
cur = raw_connection.cursor()
Expand All @@ -211,8 +215,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
" ENCODING '{encoding}');"
.format(
resource_id=resource_id,
column_names=', '.join(['"{}"'.format(h)
for h in headers]),
column_names=column_names,
delimiter=delimiter,
encoding='UTF8',
),
Expand All @@ -236,7 +239,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('...copying done')

logger.info('Creating search index...')
_populate_fulltext(connection, resource_id, fields=fields)

if unicode_headers or config.get('ckanext.xloader.unicode_headers'):
encoded_fields = [{'type': x['type'], 'id': x['id'].encode('UTF8')} for x in fields]
else:
encoded_fields = fields

_populate_fulltext(connection, resource_id, fields=encoded_fields)
logger.info('...search index created')

return fields
Expand All @@ -259,7 +268,7 @@ def create_column_indexes(fields, resource_id, logger):
logger.info('...column indexes created.')


def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None, unicode_headers=None):
'''Loads an Excel file (or other tabular data recognized by messytables)
into Datastore and creates indexes.

Expand Down Expand Up @@ -299,7 +308,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
for f in existing.get('fields', []) if 'info' in f)

# Some headers might have been converted from strings to floats and such.
headers = encode_headers(headers)
headers = encode_headers(headers, unicode_headers=unicode_headers)

row_set.register_processor(messytables.headers_processor(headers))
row_set.register_processor(messytables.offset_processor(offset + 1))
Expand Down Expand Up @@ -400,13 +409,17 @@ def get_types():
return _TYPES, TYPE_MAPPING


def encode_headers(headers, unicode_headers=None):
    '''Return the headers as a list of unicode strings.

    Some headers may have been guessed by messytables as non-string
    values (e.g. floats); those are converted via ``str()`` first.

    :param headers: iterable of raw header values
    :param unicode_headers: if truthy, unicode characters are kept
        as-is; if falsy/None, the ``ckanext.xloader.unicode_headers``
        config option is consulted. When unicode headers are disabled,
        characters are transliterated to ASCII with unidecode.
    :returns: list of header names
    '''
    # ``unicode`` only exists on Python 2; fall back to ``str`` on 3.
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str

    if not unicode_headers:
        # Config values are plain strings, so parse truthiness
        # explicitly -- a configured value of "False" must not count
        # as enabled.
        raw = config.get('ckanext.xloader.unicode_headers', 'false')
        unicode_headers = str(raw).strip().lower() in (
            'true', '1', 'yes', 'on')

    decode_func = text_type if unicode_headers else unidecode

    encoded_headers = []
    for header in headers:
        try:
            encoded_headers.append(decode_func(header))
        except AttributeError:
            # Non-string header (e.g. a float) -- stringify first.
            encoded_headers.append(decode_func(str(header)))

    return encoded_headers

Expand Down Expand Up @@ -514,7 +527,7 @@ def _populate_fulltext(connection, resource_id, fields):
(text/numeric/timestamp)
'''
sql = \
u'''
'''
UPDATE {table}
SET _full_text = to_tsvector({cols});
'''.format(
Expand Down Expand Up @@ -560,8 +573,8 @@ def _create_fulltext_trigger(connection, resource_id):
def identifier(s):
    '''Return *s* quoted as a PostgreSQL identifier (always unicode).

    Embedded double quotes are doubled per SQL quoting rules, NUL
    characters are stripped (PostgreSQL rejects them), and "%" is
    escaped to "%%" because connection.execute would otherwise treat
    it as a bind-parameter marker.
    '''
    # Explicit u'' literals keep the result unicode on Python 2,
    # consistent with the rest of this module's unicode handling.
    cleaned = s.replace(u'"', u'""').replace(u'\0', u'').replace(u'%', u'%%')
    return u'"' + cleaned + u'"'


def literal_string(s):
Expand Down
1 change: 1 addition & 0 deletions ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

class XLoaderFormats(object):
formats = None

@classmethod
def is_it_an_xloader_format(cls, format_):
if cls.formats is None:
Expand Down
7 changes: 7 additions & 0 deletions ckanext/xloader/tests/samples/hebrew_sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
זיהוי,שם,תא דיווח,שימוש,פרמטר,סוג תקן מי שתייה,ערך תקן,תאריך דיגום אחרון,ריכוז אחרון,אחוז מתקן מי השתיה
229312,פ בית העמק עמקה 3,360,פרטי,Cl,תקן ישראלי מותר,400,20/09/2018,44.85,11.20
229312,פ בית העמק עמקה 3,360,פרטי,NO3,תקן ישראלי מותר,70,20/09/2018,32.90,47.00
229319,פ כברי החוגים,350,פרטי,Cl,תקן ישראלי מותר,400,08/08/2019,44.80,11.20
229319,פ כברי החוגים,350,פרטי,NO3,תקן ישראלי מותר,70,08/08/2019,49.50,70.70
229323,פ לוחמי הגיטאות דרור,330,פרטי,Cl,תקן ישראלי מותר,400,04/09/2018,846.55,211.60
229323,פ לוחמי הגיטאות דרור,330,פרטי,NO3,תקן ישראלי מותר,70,04/09/2018,22.50,32.10
Binary file added ckanext/xloader/tests/samples/hebrew_sample.xlsx
Binary file not shown.
70 changes: 68 additions & 2 deletions ckanext/xloader/tests/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ def _get_records(self, table_name, limit=None,
if col != '_full_text')
else:
cols = '*'
sql = 'SELECT {cols} FROM "{table_name}"' \
sql = u'SELECT {cols} FROM "{table_name}"' \
.format(cols=cols, table_name=table_name)
if limit is not None:
sql += ' LIMIT {}'.format(limit)
sql += u' LIMIT {}'.format(limit)
results = c.execute(sql)
return results.fetchall()

Expand Down Expand Up @@ -344,6 +344,38 @@ def test_column_names(self):
assert_equal(self._get_records('test1')[0],
(1, u'2011-01-01', u'1', u'Galway'))

def test_unicode_column_names(self):
    '''Hebrew (non-ASCII) header names survive load_csv intact when
    unicode_headers=True: both the stored records and the datastore
    column names keep the original unicode text.'''
    csv_filepath = get_sample_filepath('hebrew_sample.csv')
    resource_id = 'test_hebrew'
    factories.Resource(id=resource_id)
    loader.load_csv(csv_filepath, resource_id=resource_id,
                    mimetype='text/csv', logger=PrintLogger(),
                    unicode_headers=True)
    records = self._get_records('test_hebrew')
    # print() works on both Python 2 and 3 (py2 treats it as a
    # parenthesised expression).
    print(records)
    assert_equal(
        records[0],
        (1, u'229312', u'פ בית העמק עמקה 3', u'360', u'פרטי', u'Cl',
         u'תקן ישראלי מותר', u'400', u'20/09/2018',
         u'44.85', u'11.20')
    )
    print(self._get_column_names('test_hebrew'))
    assert_equal(
        self._get_column_names('test_hebrew'),
        [
            u'_id',
            u'_full_text',
            u'זיהוי',
            u'שם',
            u'תא דיווח',
            u'שימוש',
            u'פרמטר',
            u'סוג תקן מי שתייה',
            u'ערך תקן',
            u'תאריך דיגום אחרון',
            u'ריכוז אחרון',
            u'אחוז מתקן מי השתיה'
        ]
    )


class TestLoadUnhandledTypes(TestLoadBase):

Expand Down Expand Up @@ -478,3 +510,37 @@ def test_no_entries(self):
with assert_raises(LoaderError):
loader.load_table(csv_filepath, resource_id=resource_id,
mimetype='csv', logger=PrintLogger())

def test_hebrew_unicode_headers(self):
    '''Hebrew (non-ASCII) header names survive load_table (xlsx path)
    when unicode_headers=True; values are type-guessed (Decimal,
    datetime) while column names keep the original unicode text.'''
    xlsx_filepath = get_sample_filepath('hebrew_sample.xlsx')
    resource_id = 'hebrew_sample_xlsx'
    factories.Resource(id=resource_id)
    loader.load_table(xlsx_filepath, resource_id=resource_id,
                      mimetype='xlsx', logger=PrintLogger(),
                      unicode_headers=True)
    records = self._get_records('hebrew_sample_xlsx')
    # print() works on both Python 2 and 3 (py2 treats it as a
    # parenthesised expression).
    print(records)
    assert_equal(
        records[0],
        (1, Decimal('229312'), u'פ בית העמק עמקה 3', Decimal('360'),
         u'פרטי', u'Cl', u'תקן ישראלי מותר',
         Decimal('400'), datetime.datetime(2018, 9, 20, 0, 0),
         Decimal('44.85000000000000142108547152020037174224853515625'),
         Decimal('11.199999999999999289457264239899814128875732421875'))
    )
    print(self._get_column_names('hebrew_sample_xlsx'))
    assert_equal(
        self._get_column_names('hebrew_sample_xlsx'),
        [
            u'_id',
            u'_full_text',
            u'זיהוי',
            u'שם',
            u'תא דיווח',
            u'שימוש',
            u'פרמטר',
            u'סוג תקן מי שתייה',
            u'ערך תקן',
            u'תאריך דיגום אחרון',
            u'ריכוז אחרון',
            u'אחוז מתקן מי השתיה'
        ]
    )