From d7b132e659db87799b58e820ec86131bba42d18e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 31 Jan 2025 09:11:16 -0500 Subject: [PATCH 1/4] Fix field name during struct equality coercion --- datafusion/sqllogictest/test_files/case.slt | 54 +++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index a339c2aa037e..8d5ebbc8f751 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -308,3 +308,57 @@ NULL NULL false statement ok drop table foo + + +# Test coercion of inner struct names +# Reproducer for https://github.com/apache/datafusion/issues/14383 +statement ok +create table t as values +( + 100, -- column1 int (so the case isn't constant folded) + { 'foo': 'bar' }, -- column2 has List of Struct w/ Utf8 + { 'foo': arrow_cast('baz', 'Utf8View') }, -- column3 has List of Struct w/ Utf8View + { 'xxx': arrow_cast('blarg', 'Utf8View') } -- column4 has List of Struct w/ Utf8View and a different field name +); + + +# Note field names +query ??? +SELECT column2, column3, column4 FROM t; +---- +{foo: bar} {foo: baz} {xxx: blarg} + +# Coerce fields, expect the field name to be the name of the first arg to case +# the field should not be named 'c0' +query ? +SELECT + case + when column1 > 0 then column2 + when column1 < 0 then column3 + else column4 + end +FROM t; +---- +{c0: bar} + +query ? +SELECT + case + when column1 > 0 then column3 -- different arg order affects field name + when column1 < 0 then column4 + else column2 + end +FROM t; +---- +{c0: baz} + +query ? +SELECT + case + when column1 > 0 then column4 -- different arg order affects field name + when column1 < 0 then column2 + else column3 + end +FROM t; +---- +{c0: blarg} From 174c10c220ceb40c3c1778448a78688dc714bed7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 31 Jan 2025 09:15:07 -0500 Subject: [PATCH 2/4] fix bug --- .../expr-common/src/type_coercion/binary.rs | 22 +++++++++++++------ datafusion/sqllogictest/test_files/case.slt | 7 +++--- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 83f47883add9..571c17119427 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -961,23 +961,31 @@ fn struct_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option return None; } - let types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()) + let coerced_types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()) .map(|(lhs, rhs)| comparison_coercion(lhs.data_type(), rhs.data_type())) .collect::>>()?; - let fields = types + // preserve the field name and nullability + let orig_fields = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()); + + let fields: Vec = coerced_types .into_iter() - .enumerate() - .map(|(i, datatype)| { - Arc::new(Field::new(format!("c{i}"), datatype, true)) - }) - .collect::>(); + .zip(orig_fields) + .map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs, rhs)) + .collect(); Some(Struct(fields.into())) } _ => None, } } +/// returns the result of coercing two fields to a common type +fn coerce_fields(common_type: DataType, lhs: &FieldRef, rhs: &FieldRef) -> FieldRef { + let is_nullable = lhs.is_nullable() || rhs.is_nullable(); + let name = lhs.name(); // pick the name from the left field + Arc::new(Field::new(name, common_type, is_nullable)) +} + /// Returns the output type of applying mathematics operations such as /// `+` to arguments of `lhs_type` and `rhs_type`. fn mathematics_numerical_coercion( diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index 8d5ebbc8f751..7e062600185b 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -339,7 +339,7 @@ SELECT end FROM t; ---- -{c0: bar} +{xxx: bar} query ? SELECT @@ -350,7 +350,7 @@ SELECT end FROM t; ---- -{c0: baz} +{foo: baz} query ? SELECT @@ -361,4 +361,5 @@ SELECT end FROM t; ---- -{c0: blarg} +{foo: blarg} + From f1b357536ade8b3634ce6d7c40b11d3cf40d11fd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 31 Jan 2025 09:25:09 -0500 Subject: [PATCH 3/4] Add more tests --- datafusion/sqllogictest/test_files/case.slt | 59 ++++++++++++++++++++- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index 7e062600185b..3ecfb2c9ca4c 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -310,7 +310,7 @@ statement ok drop table foo -# Test coercion of inner struct names +# Test coercion of inner struct field names # Reproducer for https://github.com/apache/datafusion/issues/14383 statement ok create table t as values @@ -322,7 +322,7 @@ create table t as values ); -# Note field names +# Note field names are foo/foo/xxx query ??? SELECT column2, column3, column4 FROM t; ---- @@ -363,3 +363,58 @@ FROM t; ---- {foo: blarg} +statement ok +drop table t + + +# Test coercion of inner struct field names with different orders / missing fields +statement ok +create table t as values +( + 100, -- column1 int (so the case isn't constant folded) + { 'foo': 'a', 'xxx': 'b' }, -- column2: Struct with fields foo, xxx + { 'xxx': 'c', 'foo': 'd' }, -- column3: Struct with fields xxx, foo + { 'xxx': 'e' } -- column4: Struct with field xxx (no second field) +); + +# Note field names are in different orders +query ??? +SELECT column2, column3, column4 FROM t; +---- +{foo: a, xxx: b} {xxx: c, foo: d} {xxx: e} + +# coerce structs with different field orders, +# (note the *value*s are from column2 but the field name is 'xxx', as the coerced +# type takes the field name from the last argument (column3) +query ? +SELECT + case + when column1 > 0 then column2 -- always true + else column3 + end +FROM t; +---- +{xxx: a, foo: b} + +# coerce structs with different field orders +query ? +SELECT + case + when column1 < 0 then column2 -- always false + else column3 + end +FROM t; +---- +{xxx: c, foo: d} + +# coerce structs with subset of fields +query error Failed to coerce then +SELECT + case + when column1 > 0 then column3 + else column4 + end +FROM t; + +statement ok +drop table t From 68247ad1dcfb64255bf6705642794bfaebed2b17 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 1 Feb 2025 06:29:48 -0500 Subject: [PATCH 4/4] Update tests per #14396 --- datafusion/sqllogictest/test_files/case.slt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index 3ecfb2c9ca4c..46e9c86c7591 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -318,15 +318,15 @@ create table t as values 100, -- column1 int (so the case isn't constant folded) { 'foo': 'bar' }, -- column2 has List of Struct w/ Utf8 { 'foo': arrow_cast('baz', 'Utf8View') }, -- column3 has List of Struct w/ Utf8View - { 'xxx': arrow_cast('blarg', 'Utf8View') } -- column4 has List of Struct w/ Utf8View and a different field name + { 'foo': arrow_cast('blarg', 'Utf8View') } -- column4 has List of Struct w/ Utf8View ); -# Note field names are foo/foo/xxx +# Note field name is foo query ??? SELECT column2, column3, column4 FROM t; ---- -{foo: bar} {foo: baz} {xxx: blarg} +{foo: bar} {foo: baz} {foo: blarg} # Coerce fields, expect the field name to be the name of the first arg to case # the field should not be named 'c0' @@ -339,12 +339,12 @@ SELECT end FROM t; ---- -{xxx: bar} +{foo: bar} query ? SELECT case - when column1 > 0 then column3 -- different arg order affects field name + when column1 > 0 then column3 -- different arg order shouldn't affect name when column1 < 0 then column4 else column2 end @@ -355,7 +355,7 @@ FROM t; query ? SELECT case - when column1 > 0 then column4 -- different arg order affects field name + when column1 > 0 then column4 -- different arg order shouldn't affect name when column1 < 0 then column2 else column3 end