From 7001afbcbe4a3b6b63e90985680c8888ee96262a Mon Sep 17 00:00:00 2001 From: Drew Banin Date: Tue, 30 Jul 2019 14:33:32 -0400 Subject: [PATCH 1/2] (#1632) fix for unicode chars in seed files --- core/dbt/clients/agate_helper.py | 4 +++- core/dbt/compat.py | 14 ++++++++++++ .../data-unicode/seed_unicode.csv | 2 ++ .../005_simple_seed_test/test_simple_seed.py | 22 +++++++++++++++++++ 4 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 test/integration/005_simple_seed_test/data-unicode/seed_unicode.csv diff --git a/core/dbt/clients/agate_helper.py b/core/dbt/clients/agate_helper.py index 446056a5e8f..f810f2ff176 100644 --- a/core/dbt/clients/agate_helper.py +++ b/core/dbt/clients/agate_helper.py @@ -49,4 +49,6 @@ def from_csv(abspath): with dbt.compat.open_file(abspath) as fp: if fp.read(1) != BOM: fp.seek(0) - return agate.Table.from_csv(fp, column_types=DEFAULT_TYPE_TESTER) + + file_buf = dbt.compat.read_into_buffer(fp) + return agate.Table.from_csv(file_buf, column_types=DEFAULT_TYPE_TESTER) diff --git a/core/dbt/compat.py b/core/dbt/compat.py index b2533c6d8b9..a56b9fbae98 100644 --- a/core/dbt/compat.py +++ b/core/dbt/compat.py @@ -36,11 +36,13 @@ from SocketServer import TCPServer from Queue import PriorityQueue, Empty as QueueEmpty from thread import get_ident + from StringIO import StringIO else: from http.server import SimpleHTTPRequestHandler from socketserver import TCPServer from queue import PriorityQueue, Empty as QueueEmpty from threading import get_ident + from io import StringIO def to_unicode(s): @@ -98,6 +100,18 @@ def open_file(path): return open(path, encoding='utf-8') +def read_into_buffer(fp): + buf = StringIO() + + if WHICH_PYTHON == 2: + buf.write(fp.read().encode('utf-8')) + else: + buf.write(fp.read()) + + buf.seek(0) + return buf + + if WHICH_PYTHON == 2: # In python 2, classmethod and staticmethod do not allow setters, so you # can't treat classmethods as first-class objects like you can regular diff --git a/test/integration/005_simple_seed_test/data-unicode/seed_unicode.csv b/test/integration/005_simple_seed_test/data-unicode/seed_unicode.csv new file mode 100644 index 00000000000..a1abbbd4af7 --- /dev/null +++ b/test/integration/005_simple_seed_test/data-unicode/seed_unicode.csv @@ -0,0 +1,2 @@ +id +Uh – Oh diff --git a/test/integration/005_simple_seed_test/test_simple_seed.py b/test/integration/005_simple_seed_test/test_simple_seed.py index 6d8d9fde84c..c7f91f17bd7 100644 --- a/test/integration/005_simple_seed_test/test_simple_seed.py +++ b/test/integration/005_simple_seed_test/test_simple_seed.py @@ -197,3 +197,25 @@ def test_simple_seed(self): results = self.run_dbt(["seed"]) self.assertEqual(len(results), 1) self.assertTablesEqual("seed_bom", "seed_expected") + + +class TestSimpleSeedWithUnicode(DBTIntegrationTest): + + @property + def schema(self): + return "simple_seed_005" + + @property + def models(self): + return "models" + + @property + def project_config(self): + return { + "data-paths": ['data-unicode'] + } + + @use_profile('postgres') + def test_simple_seed(self): + results = self.run_dbt(["seed"]) + self.assertEqual(len(results), 1) From 8fd768e46b64d90d17236798906202a56ae119ba Mon Sep 17 00:00:00 2001 From: Jacob Beck Date: Thu, 1 Aug 2019 12:40:00 -0600 Subject: [PATCH 2/2] for agate, use the "Urb" mode on python 2, handle BOM fiddling --- core/dbt/clients/agate_helper.py | 12 +++--------- core/dbt/compat.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/core/dbt/clients/agate_helper.py b/core/dbt/clients/agate_helper.py index f810f2ff176..9397695de01 100644 --- a/core/dbt/clients/agate_helper.py +++ b/core/dbt/clients/agate_helper.py @@ -1,11 +1,7 @@ -from codecs import BOM_UTF8 - import dbt.compat import agate -BOM = BOM_UTF8.decode('utf-8') # '\ufeff' - DEFAULT_TYPE_TESTER = agate.TypeTester(types=[ agate.data_types.Number(null_values=('null', '')), agate.data_types.TimeDelta(null_values=('null', '')), @@ -46,9 +42,7 @@ def as_matrix(table): def from_csv(abspath): - with dbt.compat.open_file(abspath) as fp: - if fp.read(1) != BOM: + with dbt.compat.open_seed_file(abspath) as fp: + if fp.read(len(dbt.compat.BOM_UTF8)) != dbt.compat.BOM_UTF8: fp.seek(0) - - file_buf = dbt.compat.read_into_buffer(fp) - return agate.Table.from_csv(file_buf, column_types=DEFAULT_TYPE_TESTER) + return agate.Table.from_csv(fp, column_types=DEFAULT_TYPE_TESTER) diff --git a/core/dbt/compat.py b/core/dbt/compat.py index a56b9fbae98..aeb79555a36 100644 --- a/core/dbt/compat.py +++ b/core/dbt/compat.py @@ -36,13 +36,11 @@ from SocketServer import TCPServer from Queue import PriorityQueue, Empty as QueueEmpty from thread import get_ident - from StringIO import StringIO else: from http.server import SimpleHTTPRequestHandler from socketserver import TCPServer from queue import PriorityQueue, Empty as QueueEmpty from threading import get_ident - from io import StringIO def to_unicode(s): @@ -100,16 +98,18 @@ def open_file(path): return open(path, encoding='utf-8') -def read_into_buffer(fp): - buf = StringIO() +if WHICH_PYTHON == 2: + BOM_UTF8 = codecs.BOM_UTF8 +else: + BOM_UTF8 = codecs.BOM_UTF8.decode('utf-8') + +def open_seed_file(path): if WHICH_PYTHON == 2: - buf.write(fp.read().encode('utf-8')) + fp = open(path, 'Urb') else: - buf.write(fp.read()) - - buf.seek(0) - return buf + fp = open(path, encoding='utf-8') + return fp if WHICH_PYTHON == 2: