diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index c4186c39317c..a7e11bd85a8f 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -108,6 +108,11 @@ harness = false
 name = "encoding"
 required-features = ["encoding_expressions"]
 
+[[bench]]
+harness = false
+name = "uuid"
+required-features = ["string_expressions"]
+
 [[bench]]
 harness = false
 name = "regx"
diff --git a/datafusion/functions/benches/uuid.rs b/datafusion/functions/benches/uuid.rs
new file mode 100644
index 000000000000..95cf77de3190
--- /dev/null
+++ b/datafusion/functions/benches/uuid.rs
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_functions::string;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let uuid = string::uuid();
+    c.bench_function("uuid", |b| {
+        b.iter(|| black_box(uuid.invoke_batch(&[], 1024)))
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/string/uuid.rs b/datafusion/functions/src/string/uuid.rs
index f6d6a941068d..64065c26b7d4 100644
--- a/datafusion/functions/src/string/uuid.rs
+++ b/datafusion/functions/src/string/uuid.rs
@@ -18,9 +18,10 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::array::GenericStringArray;
+use arrow::array::GenericStringBuilder;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Utf8;
+use rand::Rng;
 use uuid::Uuid;
 
 use datafusion_common::{internal_err, Result};
@@ -87,9 +88,25 @@ impl ScalarUDFImpl for UuidFunc {
         if !args.is_empty() {
             return internal_err!("{} function does not accept arguments", self.name());
         }
-        let values = std::iter::repeat_with(|| Uuid::new_v4().to_string()).take(num_rows);
-        let array = GenericStringArray::<i32>::from_iter_values(values);
-        Ok(ColumnarValue::Array(Arc::new(array)))
+
+        // Generate random u128 values
+        let mut rng = rand::thread_rng();
+        let mut randoms = vec![0u128; num_rows];
+        rng.fill(&mut randoms[..]);
+
+        let mut builder =
+            GenericStringBuilder::<i32>::with_capacity(num_rows, num_rows * 36);
+
+        let mut buffer = [0u8; 36];
+        for x in &mut randoms {
+            // From Uuid::new_v4(): Mask out the version and variant bits
+            *x = *x & 0xFFFFFFFFFFFF4FFFBFFFFFFFFFFFFFFF | 0x40008000000000000000;
+            let uuid = Uuid::from_u128(*x);
+            let fmt = uuid::fmt::Hyphenated::from_uuid(uuid);
+            builder.append_value(fmt.encode_lower(&mut buffer));
+        }
+
+        Ok(ColumnarValue::Array(Arc::new(builder.finish())))
     }
 
     fn documentation(&self) -> Option<&Documentation> {
diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt
index b9699dfd5c06..de1dbf74c29b 100644
--- a/datafusion/sqllogictest/test_files/functions.slt
+++ b/datafusion/sqllogictest/test_files/functions.slt
@@ -720,6 +720,14 @@ select count(distinct u) from uuid_table;
 ----
 2
 
+# must be valid uuidv4 format
+query B
+SELECT REGEXP_LIKE(uuid(),
+       '^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$')
+       AS is_valid;
+----
+true
+
 statement ok
 drop table uuid_table
 