dbt-labs · drewbanin · Jul 1, 2021 · Jun 26, 2021 · Jun 30, 2021 · Jun 30, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Fixes
 - Fix docs generation for cross-db sources in REDSHIFT RA3 node ([#3236](https://github.com/fishtown-analytics/dbt/issues/3236), [#3408](https://github.com/fishtown-analytics/dbt/pull/3408))
+- Fix type coercion issues when fetching query result sets ([#2984](https://github.com/fishtown-analytics/dbt/issues/2984), [#3499](https://github.com/fishtown-analytics/dbt/pull/3499))
 
 ### Under the hood
 - Improve default view and table materialization performance by checking relational cache before attempting to drop temp relations ([#3112](https://github.com/fishtown-analytics/dbt/issues/3112), [#3468](https://github.com/fishtown-analytics/dbt/pull/3468))

diff --git a/core/dbt/clients/agate_helper.py b/core/dbt/clients/agate_helper.py
@@ -35,7 +35,11 @@ def cast(self, d):
  )
 
 
-def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
+def build_type_tester(
+ text_columns: Iterable[str],
+ string_null_values: Optional[Iterable[str]] = ('null', '')
+) -> agate.TypeTester:
+
  types = [
  agate.data_types.Number(null_values=('null', '')),
  agate.data_types.Date(null_values=('null', ''),
@@ -46,10 +50,10 @@ def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
  agate.data_types.Boolean(true_values=('true',),
  false_values=('false',),
  null_values=('null', '')),
- agate.data_types.Text(null_values=('null', ''))
+ agate.data_types.Text(null_values=string_null_values)
  ]
  force = {
- k: agate.data_types.Text(null_values=('null', ''))
+ k: agate.data_types.Text(null_values=string_null_values)
  for k in text_columns
  }
  return agate.TypeTester(force=force, types=types)
@@ -66,7 +70,13 @@ def table_from_rows(
  if text_only_columns is None:
  column_types = DEFAULT_TYPE_TESTER
  else:
- column_types = build_type_tester(text_only_columns)
+ # If text_only_columns are present, prevent coercing empty string or
+ # literal 'null' strings to a None representation.
+ column_types = build_type_tester(
+ text_only_columns,
+ string_null_values=()
+ )
+
  return agate.Table(rows, column_names, column_types=column_types)
 
 
@@ -86,19 +96,34 @@ def table_from_data(data, column_names: Iterable[str]) -> agate.Table:
 
 
 def table_from_data_flat(data, column_names: Iterable[str]) -> agate.Table:
- "Convert list of dictionaries into an Agate table"
+ """
+ Convert a list of dictionaries into an Agate table. This method does not
+ coerce string values into more specific types (eg. '005' will not be
+ coerced to '5'). Additionally, this method does not coerce values to
+ None (eg. '' or 'null' will retain their string literal representations).
+ """
 
  rows = []
+ text_only_columns = set()
  for _row in data:
  row = []
- for value in list(_row.values()):
+ for col_name in column_names:
+ value = _row[col_name]
  if isinstance(value, (dict, list, tuple)):
- row.append(json.dumps(value, cls=dbt.utils.JSONEncoder))
- else:
- row.append(value)
+ # Represent container types as json strings
+ value = json.dumps(value, cls=dbt.utils.JSONEncoder)
+ text_only_columns.add(col_name)
+ elif isinstance(value, str):
+ text_only_columns.add(col_name)
+ row.append(value)
+
  rows.append(row)
 
- return table_from_rows(rows=rows, column_names=column_names)
+ return table_from_rows(
+ rows=rows,
+ column_names=column_names,
+ text_only_columns=text_only_columns
+ )
 
 
 def empty_table():

diff --git a/test/integration/100_rpc_test/sql/generic.sql b/test/integration/100_rpc_test/sql/generic.sql
@@ -0,0 +1,29 @@
+
+/*
+ Every column returned by this query should be interpreted
+ as a string. If any result is coerced into a non-string (eg.
+ an int, float, bool, date, NoneType, etc) then dbt did the
+ wrong thing
+*/
+
+select
+ '' as str_empty_string,
+
+ 'null' as str_null,
+
+ '1' as str_int,
+ '00005' as str_int_2,
+ '00' as str_int_3,
+
+ '1.1' as str_float,
+ '00001.1' as str_float_2,
+
+ 'true' as str_bool,
+ 'True' as str_bool_2,
+
+ '2021-01-01' as str_date,
+ '2021-01-01T12:00:00Z' as str_datetime,
+
+ -- this is obviously not a date... but Agate used to think it was!
+ -- see: https://github.com/fishtown-analytics/dbt/issues/2984
+ '0010T00000aabbccdd' as str_obviously_not_date
diff --git a/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py b/test/integration/100_rpc_test/test_execute_fetch_and_serialize.py
@@ -39,6 +39,12 @@ def do_test_file(self, filename):
  self.assertTrue(len(table.rows) > 0, "agate table had no rows")
 
  self.do_test_pickle(table)
+ return table
+
+ def assert_all_columns_are_strings(self, table):
+ for row in table:
+ for value in row:
+ self.assertEqual(type(value), str, f'Found a not-string: {value} in row {row}')
 
  @use_profile('bigquery')
  def test__bigquery_fetch_and_serialize(self):
@@ -51,3 +57,18 @@ def test__snowflake_fetch_and_serialize(self):
  @use_profile('redshift')
  def test__redshift_fetch_and_serialize(self):
  self.do_test_file('redshift.sql')
+
+ @use_profile('bigquery')
+ def test__bigquery_type_coercion(self):
+ table = self.do_test_file('generic.sql')
+ self.assert_all_columns_are_strings(table)
+
+ @use_profile('snowflake')
+ def test__snowflake_type_coercion(self):
+ table = self.do_test_file('generic.sql')
+ self.assert_all_columns_are_strings(table)
+
+ @use_profile('redshift')
+ def test__redshift_type_coercion(self):
+ table = self.do_test_file('generic.sql')
+ self.assert_all_columns_are_strings(table)
diff --git a/test/unit/test_agate_helper.py b/test/unit/test_agate_helper.py
@@ -133,3 +133,24 @@ def test_merge_mixed(self):
  assert isinstance(result.column_types[1], agate.data_types.Text)
  assert isinstance(result.column_types[2], agate.data_types.Text)
  self.assertEqual(len(result), 6)
+
+ def test_nocast_string_types(self):
+ # String fields should not be coerced into a representative type
+ # See: https://github.com/fishtown-analytics/dbt/issues/2984
+
+ column_names = ['a', 'b', 'c', 'd', 'e']
+ result_set = [
+ {'a': '0005', 'b': '01T00000aabbccdd', 'c': 'true', 'd': 10, 'e': False},
+ {'a': '0006', 'b': '01T00000aabbccde', 'c': 'false', 'd': 11, 'e': True},
+ ]
+
+ tbl = agate_helper.table_from_data_flat(data=result_set, column_names=column_names)
+ self.assertEqual(len(tbl), len(result_set))
+
+ expected = [
+ ['0005', '01T00000aabbccdd', 'true', Decimal(10), False],
+ ['0006', '01T00000aabbccde', 'false', Decimal(11), True],
+ ]
+
+ for i, row in enumerate(tbl):
+ self.assertEqual(list(row), expected[i])