From a8dad7e0acfd91030e5f53d5b17849b22441079f Mon Sep 17 00:00:00 2001 From: Huang-Hsiang Cheng Date: Thu, 29 Jan 2026 18:35:12 -0800 Subject: [PATCH 1/3] fix: handle negative lengths in string_space expression The string_space function previously failed with overflow when given negative input values. Now treats negative lengths as zero, returning empty strings. Co-Authored-By: Claude Sonnet 4.5 --- .../src/string_funcs/string_space.rs | 25 +++++++++++++++++-- .../comet/CometStringExpressionSuite.scala | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/native/spark-expr/src/string_funcs/string_space.rs b/native/spark-expr/src/string_funcs/string_space.rs index 00b880730a..3db88d78f1 100644 --- a/native/spark-expr/src/string_funcs/string_space.rs +++ b/native/spark-expr/src/string_funcs/string_space.rs @@ -121,7 +121,7 @@ fn generic_string_space(length: &Int32Array) -> Arr // Gets slice of length array to access it directly for performance. let length_data = length.to_data(); let lengths = length_data.buffers()[0].typed_data::(); - let total = lengths.iter().map(|l| *l as usize).sum::(); + let total = lengths.iter().map(|l| (*l).max(0) as usize).sum::(); let mut values = MutableBuffer::new(total); offsets.push(length_so_far); @@ -130,7 +130,7 @@ fn generic_string_space(length: &Int32Array) -> Arr values.resize(total, blank); (0..array_len).for_each(|i| { - let current_len = lengths[i] as usize; + let current_len = lengths[i].max(0) as usize; length_so_far += OffsetSize::from_usize(current_len).unwrap(); offsets.push(length_so_far); @@ -149,3 +149,24 @@ fn generic_string_space(length: &Int32Array) -> Arr }; make_array(data) } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::StringArray; + use datafusion::common::cast::as_string_array; + + #[test] + fn test_negative_length() { + let input = Int32Array::from(vec![Some(-1), Some(-2), None]); + let args = ColumnarValue::Array(Arc::new(input)); + match spark_string_space(&[args]) { + Ok(ColumnarValue::Array(result)) => { + let actual = as_string_array(&result).unwrap(); + let expected = StringArray::from(vec![Some(""), Some(""), None]); + assert_eq!(actual, &expected) + } + _ => unreachable!(), + } + } +} diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index f9882780c8..662534b3cd 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -376,7 +376,7 @@ class CometStringExpressionSuite extends CometTestBase { } test("string_space") { - withParquetTable((0 until 5).map(i => (i, i + 1)), "tbl") { + withParquetTable((0 until 5).map(i => (-i, i + 1)), "tbl") { checkSparkAnswerAndOperator("SELECT space(_1), space(_2) FROM tbl") } } From dce8ba47642ed788c17b402e1ded245e95355789 Mon Sep 17 00:00:00 2001 From: Huang-Hsiang Cheng Date: Thu, 29 Jan 2026 18:36:18 -0800 Subject: [PATCH 2/3] Fix a typo --- native/spark-expr/src/string_funcs/substring.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/native/spark-expr/src/string_funcs/substring.rs b/native/spark-expr/src/string_funcs/substring.rs index f0ecb4e7e3..5037a6e06f 100644 --- a/native/spark-expr/src/string_funcs/substring.rs +++ b/native/spark-expr/src/string_funcs/substring.rs @@ -61,7 +61,7 @@ impl Display for SubstringExpr { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, - "StringSpace [start: {}, len: {}, child: {}]", + "Substring [start: {}, len: {}, child: {}]", self.start, self.len, self.child ) } From 227ea5f2985829a309ce6f6bc0367a90dc928f9d Mon Sep 17 00:00:00 2001 From: Huang-Hsiang Cheng Date: Thu, 29 Jan 2026 22:14:28 -0800 Subject: [PATCH 3/3] Call max() once; more comments --- native/spark-expr/src/string_funcs/string_space.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/native/spark-expr/src/string_funcs/string_space.rs b/native/spark-expr/src/string_funcs/string_space.rs index 3db88d78f1..4ab5362793 100644 --- a/native/spark-expr/src/string_funcs/string_space.rs +++ b/native/spark-expr/src/string_funcs/string_space.rs @@ -119,9 +119,14 @@ fn generic_string_space(length: &Int32Array) -> Arr let null_bit_buffer = length.to_data().nulls().map(|b| b.buffer().clone()); // Gets slice of length array to access it directly for performance. + // Negative length values are set to zero to match Spark behavior let length_data = length.to_data(); - let lengths = length_data.buffers()[0].typed_data::(); - let total = lengths.iter().map(|l| (*l).max(0) as usize).sum::(); + let lengths: Vec<_> = length_data.buffers()[0] + .typed_data::() + .iter() + .map(|l| (*l).max(0) as usize) + .collect(); + let total = lengths.iter().sum::(); let mut values = MutableBuffer::new(total); offsets.push(length_so_far); @@ -130,7 +135,7 @@ fn generic_string_space(length: &Int32Array) -> Arr values.resize(total, blank); (0..array_len).for_each(|i| { - let current_len = lengths[i].max(0) as usize; + let current_len = lengths[i]; length_so_far += OffsetSize::from_usize(current_len).unwrap(); offsets.push(length_so_far);