Skip to content

Commit 0be8af7

Browse files
jizezhangmkleencj-zhukovSergey Zhukovalamb
authored andcommitted
fix: spark array return type mismatch when inner data type is LargeList (apache#18485)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes apache#123` indicates that this PR will close issue apache#123. --> - This PR came up as part of apache#17964. ## Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> This PR is intended to fix return type mismatch of spark `array` when inner data type is `LargeList`, e.g. ``` query error SELECT array(arrow_cast(array(1), 'LargeList(Int64)')) ---- DataFusion error: Internal error: Function 'array' returned value of type 'LargeList(Field { name: "element", data_type: LargeList(Field { data_type: Int64, nullable: true }), nullable: true })' while the following type was promised at planning time and expected: 'List(Field { name: "element", data_type: LargeList(Field { data_type: Int64, nullable: true }), nullable: true })'. This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues ``` ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> - Return `List` regardless of whether inner data type is `LargeList` or not. This aligns with the behavior of datafusion `make_array` function. - Remove `return_field_from_args` as `return_type` is already defined and is invoked internally. ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> Yes ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. --> No. --------- Co-authored-by: Michael Kleen <mkleen@gmail.com> Co-authored-by: Sergey Zhukov <62326549+cj-zhukov@users.noreply.github.com> Co-authored-by: Sergey Zhukov <szhukov@aligntech.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> Co-authored-by: bubulalabu <bubulalabububu@gmail.com> Co-authored-by: Vegard Stikbakke <vegard.stikbakke@gmail.com>
1 parent 3dce86f commit 0be8af7

File tree

2 files changed

+31
-15
lines changed

2 files changed

+31
-15
lines changed

datafusion/spark/src/function/array/spark_array.rs

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use arrow::array::{
2424
use arrow::buffer::OffsetBuffer;
2525
use arrow::datatypes::{DataType, Field, FieldRef};
2626
use datafusion_common::utils::SingleRowListArrayBuilder;
27-
use datafusion_common::{plan_datafusion_err, plan_err, Result};
27+
use datafusion_common::{internal_err, plan_datafusion_err, plan_err, Result};
2828
use datafusion_expr::type_coercion::binary::comparison_coercion;
2929
use datafusion_expr::{
3030
ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -72,9 +72,20 @@ impl ScalarUDFImpl for SparkArray {
7272
&self.signature
7373
}
7474

75-
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
75+
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
76+
internal_err!("return_field_from_args should be used instead")
77+
}
78+
79+
fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
80+
let data_types = args
81+
.arg_fields
82+
.iter()
83+
.map(|f| f.data_type())
84+
.cloned()
85+
.collect::<Vec<_>>();
86+
7687
let mut expr_type = DataType::Null;
77-
for arg_type in arg_types {
88+
for arg_type in &data_types {
7889
if !arg_type.equals_datatype(&DataType::Null) {
7990
expr_type = arg_type.clone();
8091
break;
@@ -85,21 +96,12 @@ impl ScalarUDFImpl for SparkArray {
8596
expr_type = DataType::Int32;
8697
}
8798

88-
Ok(DataType::List(Arc::new(Field::new(
99+
let return_type = DataType::List(Arc::new(Field::new(
89100
ARRAY_FIELD_DEFAULT_NAME,
90101
expr_type,
91102
true,
92-
))))
93-
}
103+
)));
94104

95-
fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
96-
let data_types = args
97-
.arg_fields
98-
.iter()
99-
.map(|f| f.data_type())
100-
.cloned()
101-
.collect::<Vec<_>>();
102-
let return_type = self.return_type(&data_types)?;
103105
Ok(Arc::new(Field::new(
104106
"this_field_name_is_irrelevant",
105107
return_type,
@@ -166,7 +168,6 @@ pub fn make_array_inner(arrays: &[ArrayRef]) -> Result<ArrayRef> {
166168
.build_list_array(),
167169
))
168170
}
169-
DataType::LargeList(..) => array_array::<i64>(arrays, data_type),
170171
_ => array_array::<i32>(arrays, data_type),
171172
}
172173
}

datafusion/sqllogictest/test_files/spark/array/array.slt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,18 @@ query ?
7070
SELECT array(array(1,2));
7171
----
7272
[[1, 2]]
73+
74+
query ?
75+
SELECT array(arrow_cast(array(1), 'LargeList(Int64)'));
76+
----
77+
[[1]]
78+
79+
query ?
80+
SELECT array(arrow_cast(array(1), 'LargeList(Int64)'), arrow_cast(array(), 'LargeList(Int64)'));
81+
----
82+
[[1], []]
83+
84+
query ?
85+
SELECT array(arrow_cast(array(1,2), 'LargeList(Int64)'), array(3));
86+
----
87+
[[1, 2], [3]]

0 commit comments

Comments
 (0)