
Commit e0f9f65

fix(expr-common): Coerce to Decimal(20, 0) when combining UInt64 with signed integers (#14223)
* fix(expr-common): Coerce to Decimal(20, 0) when combining UInt64 with signed integers

  Previously, when combining UInt64 with any signed integer, the resulting type would be Int64, which would result in lost information. Now, combining UInt64 with a signed integer results in a Decimal(20, 0), which can encode all (64-bit) integer types.

* test: Add sqllogictest for #14208
* refactor: Move unsigned integer and decimal coercion to coerce_numeric_type_to_decimal
* fix: Also handle unsigned integers when coercing to Decimal256
* fix: Coerce UInt64 and other unsigned integers to UInt64
1 parent: 49f95af

5 files changed: +69 −34 lines
datafusion/expr-common/src/type_coercion/binary.rs

Lines changed: 33 additions & 25 deletions
@@ -777,29 +777,19 @@ pub fn binary_numeric_coercion(
         (_, Float32) | (Float32, _) => Some(Float32),
         // The following match arms encode the following logic: Given the two
         // integral types, we choose the narrowest possible integral type that
-        // accommodates all values of both types. Note that some information
-        // loss is inevitable when we have a signed type and a `UInt64`, in
-        // which case we use `Int64`;i.e. the widest signed integral type.
-
-        // TODO: For i64 and u64, we can use decimal or float64
-        // Postgres has no unsigned type :(
-        // DuckDB v.0.10.0 has double (double precision floating-point number (8 bytes))
-        // for largest signed (signed sixteen-byte integer) and unsigned integer (unsigned sixteen-byte integer)
+        // accommodates all values of both types. Note that to avoid information
+        // loss when combining UInt64 with signed integers we use Decimal128(20, 0).
+        (UInt64, Int64 | Int32 | Int16 | Int8)
+        | (Int64 | Int32 | Int16 | Int8, UInt64) => Some(Decimal128(20, 0)),
+        (UInt64, _) | (_, UInt64) => Some(UInt64),
         (Int64, _)
         | (_, Int64)
-        | (UInt64, Int8)
-        | (Int8, UInt64)
-        | (UInt64, Int16)
-        | (Int16, UInt64)
-        | (UInt64, Int32)
-        | (Int32, UInt64)
         | (UInt32, Int8)
         | (Int8, UInt32)
         | (UInt32, Int16)
         | (Int16, UInt32)
         | (UInt32, Int32)
         | (Int32, UInt32) => Some(Int64),
-        (UInt64, _) | (_, UInt64) => Some(UInt64),
         (Int32, _)
         | (_, Int32)
         | (UInt16, Int16)
@@ -928,16 +918,16 @@ pub fn get_wider_type(lhs: &DataType, rhs: &DataType) -> Result<DataType> {
 }
 
 /// Convert the numeric data type to the decimal data type.
-/// Now, we just support the signed integer type and floating-point type.
+/// We support signed and unsigned integer types and floating-point type.
 fn coerce_numeric_type_to_decimal(numeric_type: &DataType) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
     // This conversion rule is from spark
     // https://github.com/apache/spark/blob/1c81ad20296d34f137238dadd67cc6ae405944eb/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L127
     match numeric_type {
-        Int8 => Some(Decimal128(3, 0)),
-        Int16 => Some(Decimal128(5, 0)),
-        Int32 => Some(Decimal128(10, 0)),
-        Int64 => Some(Decimal128(20, 0)),
+        Int8 | UInt8 => Some(Decimal128(3, 0)),
+        Int16 | UInt16 => Some(Decimal128(5, 0)),
+        Int32 | UInt32 => Some(Decimal128(10, 0)),
+        Int64 | UInt64 => Some(Decimal128(20, 0)),
         // TODO if we convert the floating-point data to the decimal type, it maybe overflow.
         Float32 => Some(Decimal128(14, 7)),
         Float64 => Some(Decimal128(30, 15)),
@@ -946,16 +936,16 @@ fn coerce_numeric_type_to_decimal(numeric_type: &DataType) -> Option<DataType> {
 }
 
 /// Convert the numeric data type to the decimal data type.
-/// Now, we just support the signed integer type and floating-point type.
+/// We support signed and unsigned integer types and floating-point type.
 fn coerce_numeric_type_to_decimal256(numeric_type: &DataType) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
     // This conversion rule is from spark
     // https://github.com/apache/spark/blob/1c81ad20296d34f137238dadd67cc6ae405944eb/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L127
     match numeric_type {
-        Int8 => Some(Decimal256(3, 0)),
-        Int16 => Some(Decimal256(5, 0)),
-        Int32 => Some(Decimal256(10, 0)),
-        Int64 => Some(Decimal256(20, 0)),
+        Int8 | UInt8 => Some(Decimal256(3, 0)),
+        Int16 | UInt16 => Some(Decimal256(5, 0)),
+        Int32 | UInt32 => Some(Decimal256(10, 0)),
+        Int64 | UInt64 => Some(Decimal256(20, 0)),
         // TODO if we convert the floating-point data to the decimal type, it maybe overflow.
         Float32 => Some(Decimal256(14, 7)),
         Float64 => Some(Decimal256(30, 15)),
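
The Spark-derived precisions stay lossless for the newly added unsigned arms as well, because each unsigned maximum needs no more decimal digits than the precision chosen for its signed counterpart. A minimal sketch checking the digit counts (plain Rust, no DataFusion dependency; illustrative only):

fn digits(n: u128) -> u32 {
    // decimal digit count of n, valid for n > 0
    n.ilog10() + 1
}

fn main() {
    assert_eq!(digits(u8::MAX as u128), 3); // 255 -> Decimal(3, 0)
    assert_eq!(digits(u16::MAX as u128), 5); // 65535 -> Decimal(5, 0)
    assert_eq!(digits(u32::MAX as u128), 10); // 4294967295 -> Decimal(10, 0)
    assert_eq!(digits(u64::MAX as u128), 20); // 18446744073709551615 -> Decimal(20, 0)
}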
@@ -1994,6 +1984,18 @@ mod tests {
             Operator::Gt,
             DataType::UInt32
         );
+        test_coercion_binary_rule!(
+            DataType::UInt64,
+            DataType::UInt8,
+            Operator::Eq,
+            DataType::UInt64
+        );
+        test_coercion_binary_rule!(
+            DataType::UInt64,
+            DataType::Int64,
+            Operator::Eq,
+            DataType::Decimal128(20, 0)
+        );
         // numeric/decimal
         test_coercion_binary_rule!(
             DataType::Int64,
@@ -2025,6 +2027,12 @@ mod tests {
             Operator::GtEq,
             DataType::Decimal128(15, 3)
         );
+        test_coercion_binary_rule!(
+            DataType::UInt64,
+            DataType::Decimal128(20, 0),
+            Operator::Eq,
+            DataType::Decimal128(20, 0)
+        );
 
         // Binary
         test_coercion_binary_rule!(
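
The new behavior should also be visible through the public entry point shown in the first hunk header. A usage sketch, assuming binary_numeric_coercion keeps the (&DataType, &DataType) -> Option<DataType> shape implied by the diff and is reachable at the module path suggested by this file's location:

use arrow::datatypes::DataType;
use datafusion_expr_common::type_coercion::binary::binary_numeric_coercion;

fn main() {
    // UInt64 combined with a signed integer now widens to Decimal128(20, 0)...
    assert_eq!(
        binary_numeric_coercion(&DataType::UInt64, &DataType::Int64),
        Some(DataType::Decimal128(20, 0))
    );
    // ...while UInt64 combined with a narrower unsigned integer stays UInt64.
    assert_eq!(
        binary_numeric_coercion(&DataType::UInt64, &DataType::UInt8),
        Some(DataType::UInt64)
    );
}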

datafusion/sqllogictest/test_files/coalesce.slt

Lines changed: 3 additions & 3 deletions
@@ -105,13 +105,13 @@ select
 ----
 2 Int64
 
-# TODO: Got types (i64, u64), casting to decimal or double or even i128 if supported
-query IT
+# i64 and u64, cast to decimal128(20, 0)
+query RT
 select
   coalesce(2, arrow_cast(3, 'UInt64')),
   arrow_typeof(coalesce(2, arrow_cast(3, 'UInt64')));
 ----
-2 Int64
+2 Decimal128(20, 0)
 
 statement ok
 create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2, null);

datafusion/sqllogictest/test_files/insert.slt

Lines changed: 27 additions & 0 deletions
@@ -433,3 +433,30 @@ drop table test_column_defaults
 
 statement error DataFusion error: Error during planning: Column reference is not allowed in the DEFAULT expression : Schema error: No field named a.
 create table test_column_defaults(a int, b int default a+1)
+
+
+# test inserting UInt64 and signed integers into a bigint unsigned column
+statement ok
+create table unsigned_bigint_test (v bigint unsigned)
+
+query I
+insert into unsigned_bigint_test values (10000000000000000000), (18446744073709551615)
+----
+2
+
+query I
+insert into unsigned_bigint_test values (10000000000000000001), (1), (10000000000000000002)
+----
+3
+
+query I rowsort
+select * from unsigned_bigint_test
+----
+1
+10000000000000000000
+10000000000000000001
+10000000000000000002
+18446744073709551615
+
+statement ok
+drop table unsigned_bigint_test

datafusion/sqllogictest/test_files/joins.slt

Lines changed: 1 addition & 1 deletion
@@ -1828,7 +1828,7 @@ AS VALUES
 ('BB', 6, 1),
 ('BB', 6, 1);
 
-query TII
+query TIR
 select col1, col2, coalesce(sum_col3, 0) as sum_col3
 from (select distinct col2 from tbl) AS q1
 cross join (select distinct col1 from tbl) AS q2

datafusion/sqllogictest/test_files/union.slt

Lines changed: 5 additions & 5 deletions
@@ -413,23 +413,23 @@ explain SELECT c1, c9 FROM aggregate_test_100 UNION ALL SELECT c1, c3 FROM aggre
 logical_plan
 01)Sort: aggregate_test_100.c9 DESC NULLS FIRST, fetch=5
 02)--Union
-03)----Projection: aggregate_test_100.c1, CAST(aggregate_test_100.c9 AS Int64) AS c9
+03)----Projection: aggregate_test_100.c1, CAST(aggregate_test_100.c9 AS Decimal128(20, 0)) AS c9
 04)------TableScan: aggregate_test_100 projection=[c1, c9]
-05)----Projection: aggregate_test_100.c1, CAST(aggregate_test_100.c3 AS Int64) AS c9
+05)----Projection: aggregate_test_100.c1, CAST(aggregate_test_100.c3 AS Decimal128(20, 0)) AS c9
 06)------TableScan: aggregate_test_100 projection=[c1, c3]
 physical_plan
 01)SortPreservingMergeExec: [c9@1 DESC], fetch=5
 02)--UnionExec
 03)----SortExec: TopK(fetch=5), expr=[c9@1 DESC], preserve_partitioning=[true]
-04)------ProjectionExec: expr=[c1@0 as c1, CAST(c9@1 AS Int64) as c9]
+04)------ProjectionExec: expr=[c1@0 as c1, CAST(c9@1 AS Decimal128(20, 0)) as c9]
 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
 06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], has_header=true
 07)----SortExec: TopK(fetch=5), expr=[c9@1 DESC], preserve_partitioning=[true]
-08)------ProjectionExec: expr=[c1@0 as c1, CAST(c3@1 AS Int64) as c9]
+08)------ProjectionExec: expr=[c1@0 as c1, CAST(c3@1 AS Decimal128(20, 0)) as c9]
 09)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
 10)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3], has_header=true
 
-query TI
+query TR
 SELECT c1, c9 FROM aggregate_test_100 UNION ALL SELECT c1, c3 FROM aggregate_test_100 ORDER BY c9 DESC LIMIT 5
 ----
 c 4268716378
