Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 37 additions & 5 deletions datafusion/functions/benches/regx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,27 @@
// specific language governing permissions and limitations
// under the License.

use std::hint::black_box;
use std::iter;
use std::sync::Arc;

use arrow::array::builder::StringBuilder;
use arrow::array::{ArrayRef, AsArray, Int64Array, StringArray, StringViewArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use arrow::datatypes::{DataType, Field};
use criterion::{Criterion, criterion_group, criterion_main};
use datafusion_common::ScalarValue;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
use datafusion_functions::regex::regexpcount::regexp_count_func;
use datafusion_functions::regex::regexpinstr::regexp_instr_func;
use datafusion_functions::regex::regexplike::regexp_like;
use datafusion_functions::regex::regexplike::{RegexpLikeFunc, regexp_like};
use datafusion_functions::regex::regexpmatch::regexp_match;
use datafusion_functions::regex::regexpreplace::regexp_replace;
use rand::Rng;
use rand::distr::Alphanumeric;
use rand::prelude::IndexedRandom;
use rand::rngs::ThreadRng;
use std::hint::black_box;
use std::iter;
use std::sync::Arc;
fn data(rng: &mut ThreadRng) -> StringArray {
let mut data: Vec<String> = vec![];
for _ in 0..1000 {
Expand Down Expand Up @@ -105,6 +109,8 @@ fn subexp(rng: &mut ThreadRng) -> Int64Array {
}

fn criterion_benchmark(c: &mut Criterion) {
let regexp_like_func = RegexpLikeFunc::new();
let config_options = Arc::new(ConfigOptions::default());
c.bench_function("regexp_count_1000 string", |b| {
let mut rng = rand::rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
Expand Down Expand Up @@ -219,6 +225,32 @@ fn criterion_benchmark(c: &mut Criterion) {
})
});

let scalar_args = vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some("foobarbequebaz".to_string()))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some("(bar)(beque)".to_string()))),
];
let scalar_arg_fields = vec![
Field::new("arg_0", DataType::Utf8, false).into(),
Field::new("arg_1", DataType::Utf8, false).into(),
];
let return_field = Field::new("f", DataType::Boolean, true).into();

c.bench_function("regexp_like scalar utf8", |b| {
b.iter(|| {
black_box(
regexp_like_func
.invoke_with_args(ScalarFunctionArgs {
args: scalar_args.clone(),
arg_fields: scalar_arg_fields.clone(),
number_rows: 1,
return_field: Arc::clone(&return_field),
config_options: Arc::clone(&config_options),
})
.expect("regexp_like scalar should work on valid values"),
)
})
});

c.bench_function("regexp_match_1000", |b| {
let mut rng = rand::rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
Expand Down
179 changes: 158 additions & 21 deletions datafusion/functions/src/regex/regexplike.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

//! Regex expressions

use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
use arrow::array::{
Array, ArrayRef, AsArray, BooleanArray, GenericStringArray, LargeStringArray,
StringArray, StringViewArray,
};
use arrow::compute::kernels::regexp;
use arrow::datatypes::DataType;
use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
Expand Down Expand Up @@ -130,29 +133,48 @@ impl ScalarUDFImpl for RegexpLikeFunc {
args: datafusion_expr::ScalarFunctionArgs,
) -> Result<ColumnarValue> {
let args = &args.args;
match args.len() {
2 | 3 => {}
other => {
return exec_err!(
"`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
);
}
}

let len = args
.iter()
.fold(Option::<usize>::None, |acc, arg| match arg {
ColumnarValue::Scalar(_) => acc,
ColumnarValue::Array(a) => Some(a.len()),
});

let is_scalar = len.is_none();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should take this opportunity to refactor further; for example, we could use `ColumnarValue::values_to_arrays` here.

let inferred_length = len.unwrap_or(1);
let args = args
if args
.iter()
.map(|arg| arg.to_array(inferred_length))
.collect::<Result<Vec<_>>>()?;

let result = regexp_like(&args);
if is_scalar {
// If all inputs are scalar, keeps output as scalar
let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
result.map(ColumnarValue::Scalar)
} else {
result.map(ColumnarValue::Array)
.all(|arg| matches!(arg, ColumnarValue::Scalar(_)))
{
return regexp_like_scalar(args);
}

match args.as_slice() {
[ColumnarValue::Array(values), ColumnarValue::Scalar(pattern)] => {
let pattern = scalar_string(pattern)?;
let array = regexp_like_array_scalar(values, pattern, None)?;
return Ok(ColumnarValue::Array(array));
}
[
ColumnarValue::Array(values),
ColumnarValue::Scalar(pattern),
ColumnarValue::Scalar(flags),
] => {
let flags = scalar_string(flags)?;
if flags == Some("g") {
return plan_err!(
"regexp_like() does not support the \"global\" option"
);
}
let pattern = scalar_string(pattern)?;
let array = regexp_like_array_scalar(values, pattern, flags)?;
return Ok(ColumnarValue::Array(array));
}
_ => {}
}

let args = ColumnarValue::values_to_arrays(args)?;
regexp_like(&args).map(ColumnarValue::Array)
}

fn simplify(
Expand Down Expand Up @@ -314,6 +336,121 @@ pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
}
}

/// Borrows the string payload of a scalar argument to `regexp_like`.
///
/// Returns `Ok(Some(..))` for a non-null `Utf8`, `LargeUtf8` or `Utf8View`
/// scalar, and `Ok(None)` when that scalar holds no value or is
/// `ScalarValue::Null`.
///
/// # Errors
///
/// Returns an internal error naming the offending data type for any other
/// scalar variant.
fn scalar_string(value: &ScalarValue) -> Result<Option<&str>> {
    let text = match value {
        ScalarValue::Null => None,
        ScalarValue::Utf8(inner)
        | ScalarValue::LargeUtf8(inner)
        | ScalarValue::Utf8View(inner) => inner.as_deref(),
        other => {
            return internal_err!(
                "Unsupported data type {:?} for function `regexp_like`",
                other.data_type()
            );
        }
    };

    Ok(text)
}

fn regexp_like_array_scalar(
values: &ArrayRef,
pattern: Option<&str>,
flags: Option<&str>,
) -> Result<ArrayRef> {
use DataType::*;

if pattern.is_none() {
return Ok(Arc::new(BooleanArray::new_null(values.len())));
}

let pattern = pattern.unwrap();
let array = match values.data_type() {
Utf8 => {
let array = values.as_string::<i32>();
regexp::regexp_is_match_scalar(array, pattern, flags)?
}
Utf8View => {
let array = values.as_string_view();
regexp::regexp_is_match_scalar(array, pattern, flags)?
}
LargeUtf8 => {
let array = values.as_string::<i64>();
regexp::regexp_is_match_scalar(array, pattern, flags)?
}
other => {
return internal_err!(
"Unsupported data type {other:?} for function `regexp_like`"
);
}
};

Ok(Arc::new(array))
}

/// Evaluates `regexp_like` when every argument is a scalar and returns a
/// scalar result.
///
/// `args` must be `[value, pattern]` or `[value, pattern, flags]`, all of them
/// `ColumnarValue::Scalar`.
///
/// Returns `ScalarValue::Boolean(None)` when either the value or the pattern
/// is NULL. Otherwise the value is wrapped in a one-element string array of
/// the matching type and checked with `regexp::regexp_is_match_scalar`.
///
/// # Errors
///
/// * Internal error if the argument count is not 2 or 3, if any argument is
///   not a scalar, or if an argument has an unsupported data type.
/// * Plan error if `flags` is exactly `"g"`.
/// * Any error propagated from `regexp::regexp_is_match_scalar` or
///   `ScalarValue::try_from_array`.
fn regexp_like_scalar(args: &[ColumnarValue]) -> Result<ColumnarValue> {
    // Destructure with slice patterns so a malformed argument list yields an
    // error instead of an out-of-bounds panic.
    let (value, pattern, flags) = match args {
        [ColumnarValue::Scalar(value), ColumnarValue::Scalar(pattern)] => {
            (value, pattern, None)
        }
        [
            ColumnarValue::Scalar(value),
            ColumnarValue::Scalar(pattern),
            ColumnarValue::Scalar(flags),
        ] => (value, pattern, scalar_string(flags)?),
        [_, _] | [_, _, _] => {
            return internal_err!(
                "Unexpected non-scalar argument for function `regexp_like`"
            );
        }
        _ => {
            return internal_err!(
                "Function `regexp_like` expects 2 or 3 arguments but received {}",
                args.len()
            );
        }
    };

    if flags == Some("g") {
        return plan_err!("regexp_like() does not support the \"global\" option");
    }

    // A NULL value or NULL pattern produces a NULL boolean.
    let (Some(text), Some(pattern)) = (scalar_string(value)?, scalar_string(pattern)?)
    else {
        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
    };

    // Build a single-row array whose type mirrors the value's scalar type.
    let result = match value {
        ScalarValue::Utf8(_) => {
            let array = StringArray::from(vec![text]);
            regexp::regexp_is_match_scalar(&array, pattern, flags)?
        }
        ScalarValue::LargeUtf8(_) => {
            let array = LargeStringArray::from(vec![text]);
            regexp::regexp_is_match_scalar(&array, pattern, flags)?
        }
        ScalarValue::Utf8View(_) => {
            let array = StringViewArray::from(vec![text]);
            regexp::regexp_is_match_scalar(&array, pattern, flags)?
        }
        other => {
            return internal_err!(
                "Unsupported data type {:?} for function `regexp_like`",
                other.data_type()
            );
        }
    };

    ScalarValue::try_from_array(&result, 0).map(ColumnarValue::Scalar)
}

fn handle_regexp_like(
values: &ArrayRef,
patterns: &ArrayRef,
Expand Down