Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 181 additions & 1 deletion datafusion/functions/benches/pad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
// specific language governing permissions and limitations
// under the License.

use arrow::array::{ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray};
use arrow::array::{
ArrowPrimitiveType, GenericStringBuilder, OffsetSizeTrait, PrimitiveArray,
StringViewBuilder,
};
use arrow::datatypes::{DataType, Field, Int64Type};
use arrow::util::bench_util::{
create_string_array_with_len, create_string_view_array_with_len,
Expand All @@ -30,6 +33,51 @@ use std::hint::black_box;
use std::sync::Arc;
use std::time::Duration;

const UNICODE_STRINGS: &[&str] = &[
"Ñandú",
"Íslensku",
"Þjóðarinnar",
"Ελληνική",
"Иванович",
"データフュージョン",
"José García",
"Ölçü bïrïmï",
"Ÿéšṱëṟḏàÿ",
"Ährenstraße",
];

fn create_unicode_string_array<O: OffsetSizeTrait>(
size: usize,
null_density: f32,
) -> arrow::array::GenericStringArray<O> {
let mut rng = rand::rng();
let mut builder = GenericStringBuilder::<O>::new();
for i in 0..size {
if rng.random::<f32>() < null_density {
builder.append_null();
} else {
builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
}
}
builder.finish()
}

fn create_unicode_string_view_array(
size: usize,
null_density: f32,
) -> arrow::array::StringViewArray {
let mut rng = rand::rng();
let mut builder = StringViewBuilder::with_capacity(size);
for i in 0..size {
if rng.random::<f32>() < null_density {
builder.append_null();
} else {
builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
}
}
builder.finish()
}
Comment on lines +49 to +79

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

There's significant code duplication between create_unicode_string_array and create_unicode_string_view_array. The logic inside both functions is nearly identical. Consider refactoring this to a single generic function or a macro to reduce duplication and improve maintainability.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:good-to-have; category:bug; feedback: The Gemini AI reviewer is correct! The two functions are almost identical and they could be merged into a single more generic one and reused.


struct Filter<Dist> {
dist: Dist,
}
Expand Down Expand Up @@ -67,6 +115,34 @@ where
.collect()
}

/// Create args for pad benchmark with Unicode strings
fn create_unicode_pad_args(
size: usize,
target_len: usize,
use_string_view: bool,
) -> Vec<ColumnarValue> {
let length_array =
Arc::new(create_primitive_array::<Int64Type>(size, 0.0, target_len));

if use_string_view {
let string_array = create_unicode_string_view_array(size, 0.1);
let fill_array = create_unicode_string_view_array(size, 0.1);
vec![
ColumnarValue::Array(Arc::new(string_array)),
ColumnarValue::Array(length_array),
ColumnarValue::Array(Arc::new(fill_array)),
]
} else {
let string_array = create_unicode_string_array::<i32>(size, 0.1);
let fill_array = create_unicode_string_array::<i32>(size, 0.1);
vec![
ColumnarValue::Array(Arc::new(string_array)),
ColumnarValue::Array(length_array),
ColumnarValue::Array(Arc::new(fill_array)),
]
}
Comment on lines +127 to +143

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The if/else blocks contain duplicated logic for constructing the Vec<ColumnarValue>. You can refactor this to reduce code duplication by creating the string and fill arrays within the branches and then constructing the vector once.

Suggested change
if use_string_view {
let string_array = create_unicode_string_view_array(size, 0.1);
let fill_array = create_unicode_string_view_array(size, 0.1);
vec![
ColumnarValue::Array(Arc::new(string_array)),
ColumnarValue::Array(length_array),
ColumnarValue::Array(Arc::new(fill_array)),
]
} else {
let string_array = create_unicode_string_array::<i32>(size, 0.1);
let fill_array = create_unicode_string_array::<i32>(size, 0.1);
vec![
ColumnarValue::Array(Arc::new(string_array)),
ColumnarValue::Array(length_array),
ColumnarValue::Array(Arc::new(fill_array)),
]
}
let (string_array, fill_array): (arrow::array::ArrayRef, arrow::array::ArrayRef) = if use_string_view {
(
std::sync::Arc::new(create_unicode_string_view_array(size, 0.1)),
std::sync::Arc::new(create_unicode_string_view_array(size, 0.1)),
)
} else {
(
std::sync::Arc::new(create_unicode_string_array::<i32>(size, 0.1)),
std::sync::Arc::new(create_unicode_string_array::<i32>(size, 0.1)),
)
};
vec![
ColumnarValue::Array(string_array),
ColumnarValue::Array(length_array),
ColumnarValue::Array(fill_array),
]

Copy link
Owner Author

@martin-augment martin-augment Feb 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:good-to-have; category:bug; feedback: The Gemini AI reviewer is correct! The logic for constructing the ColumnarValues could be simplified. it will be both easier to maintain and it will be more performant.

}

/// Create args for pad benchmark
fn create_pad_args<O: OffsetSizeTrait>(
size: usize,
Expand Down Expand Up @@ -208,6 +284,58 @@ fn criterion_benchmark(c: &mut Criterion) {
},
);

// Utf8 type with Unicode strings
let args = create_unicode_pad_args(size, 20, false);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

group.bench_function(
format!("lpad utf8 unicode [size={size}, target=20]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

// StringView type with Unicode strings
let args = create_unicode_pad_args(size, 20, true);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

group.bench_function(
format!("lpad stringview unicode [size={size}, target=20]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8View, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

group.finish();
}

Expand Down Expand Up @@ -322,6 +450,58 @@ fn criterion_benchmark(c: &mut Criterion) {
},
);

// Utf8 type with Unicode strings
let args = create_unicode_pad_args(size, 20, false);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

group.bench_function(
format!("rpad utf8 unicode [size={size}, target=20]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

// StringView type with Unicode strings
let args = create_unicode_pad_args(size, 20, true);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

group.bench_function(
format!("rpad stringview unicode [size={size}, target=20]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8View, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

group.finish();
}
}
Expand Down
96 changes: 69 additions & 27 deletions datafusion/functions/src/unicode/lpad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ use datafusion_macros::user_doc;
+---------------------------------------------+
```"#,
standard_argument(name = "str", prefix = "String"),
argument(name = "n", description = "String length to pad to."),
argument(
name = "n",
description = "String length to pad to. If the input string is longer than this length, it is truncated (on the right)."
),
argument(
name = "padding_str",
description = "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._"
Expand Down Expand Up @@ -225,24 +228,47 @@ where
continue;
}

// Reuse buffers by clearing and refilling
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));

fill_chars_buf.clear();
fill_chars_buf.extend(fill.chars());

if length < graphemes_buf.len() {
builder.append_value(graphemes_buf[..length].concat());
} else if fill_chars_buf.is_empty() {
builder.append_value(string);
if string.is_ascii() && fill.is_ascii() {
// ASCII fast path: byte length == character length,
// so we skip expensive grapheme segmentation.
let str_len = string.len();
if length < str_len {
builder.append_value(&string[..length]);
} else if fill.is_empty() {
builder.append_value(string);
} else {
let pad_len = length - str_len;
let fill_len = fill.len();
let full_reps = pad_len / fill_len;
let remainder = pad_len % fill_len;
for _ in 0..full_reps {
builder.write_str(fill)?;
}
if remainder > 0 {
builder.write_str(&fill[..remainder])?;
}
builder.append_value(string);
}
} else {
for l in 0..length - graphemes_buf.len() {
let c = *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
builder.write_char(c)?;
// Reuse buffers by clearing and refilling
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));

fill_chars_buf.clear();
fill_chars_buf.extend(fill.chars());

if length < graphemes_buf.len() {
builder.append_value(graphemes_buf[..length].concat());
} else if fill_chars_buf.is_empty() {
builder.append_value(string);
} else {
for l in 0..length - graphemes_buf.len() {
let c =
*fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
builder.write_char(c)?;
}
builder.append_value(string);
}
builder.write_str(string)?;
builder.append_value("");
}
} else {
builder.append_null();
Expand All @@ -266,17 +292,28 @@ where
continue;
}

// Reuse buffer by clearing and refilling
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));

if length < graphemes_buf.len() {
builder.append_value(graphemes_buf[..length].concat());
if string.is_ascii() {
// ASCII fast path: byte length == character length
let str_len = string.len();
if length < str_len {
builder.append_value(&string[..length]);
} else {
builder.write_str(" ".repeat(length - str_len).as_str())?;
builder.append_value(string);
}
} else {
builder
.write_str(" ".repeat(length - graphemes_buf.len()).as_str())?;
builder.write_str(string)?;
builder.append_value("");
// Reuse buffer by clearing and refilling
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));

if length < graphemes_buf.len() {
builder.append_value(graphemes_buf[..length].concat());
} else {
builder.write_str(
" ".repeat(length - graphemes_buf.len()).as_str(),
)?;
builder.append_value(string);
}
}
} else {
builder.append_null();
Expand Down Expand Up @@ -523,6 +560,11 @@ mod tests {
None,
Ok(None)
);
test_lpad!(
Some("hello".into()),
ScalarValue::Int64(Some(2i64)),
Ok(Some("he"))
);
test_lpad!(
Some("josé".into()),
ScalarValue::Int64(Some(10i64)),
Expand Down
Loading