Skip to content

Commit 40bb75f

Browse files
authored
Speed up uuid UDF (40x faster) (#14675)
* Add benchmark for uuid * add test for valid v4 format * Speed * avoid to_string
1 parent a749295 commit 40bb75f

File tree

4 files changed

+65
-4
lines changed

4 files changed

+65
-4
lines changed

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,11 @@ harness = false
108108
name = "encoding"
109109
required-features = ["encoding_expressions"]
110110

111+
[[bench]]
112+
harness = false
113+
name = "uuid"
114+
required-features = ["string_expressions"]
115+
111116
[[bench]]
112117
harness = false
113118
name = "regx"
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate criterion;
19+
20+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
21+
use datafusion_functions::string;
22+
23+
/// Registers the "uuid" benchmark on the given Criterion instance.
///
/// The UDF is built once up front; each timed iteration then calls it with
/// an empty argument slice, requesting 1024 rows per invocation.
fn criterion_benchmark(c: &mut Criterion) {
24+
// Construct the UDF outside the timed closure so setup is not measured.
let uuid = string::uuid();
25+
c.bench_function("uuid", |b| {
26+
// `black_box` keeps the optimizer from discarding the unused result.
b.iter(|| black_box(uuid.invoke_batch(&[], 1024)))
27+
});
28+
}
29+
30+
// Group `criterion_benchmark` under the name `benches`, then hand that group
// to `criterion_main!`. The Cargo.toml entry for this bench sets
// `harness = false`, so the entry point is presumably generated here rather
// than by the default test harness.
criterion_group!(benches, criterion_benchmark);
31+
criterion_main!(benches);

datafusion/functions/src/string/uuid.rs

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@
1818
use std::any::Any;
1919
use std::sync::Arc;
2020

21-
use arrow::array::GenericStringArray;
21+
use arrow::array::GenericStringBuilder;
2222
use arrow::datatypes::DataType;
2323
use arrow::datatypes::DataType::Utf8;
24+
use rand::Rng;
2425
use uuid::Uuid;
2526

2627
use datafusion_common::{internal_err, Result};
@@ -87,9 +88,25 @@ impl ScalarUDFImpl for UuidFunc {
8788
if !args.is_empty() {
8889
return internal_err!("{} function does not accept arguments", self.name());
8990
}
90-
let values = std::iter::repeat_with(|| Uuid::new_v4().to_string()).take(num_rows);
91-
let array = GenericStringArray::<i32>::from_iter_values(values);
92-
Ok(ColumnarValue::Array(Arc::new(array)))
91+
92+
// Generate random u128 values
93+
let mut rng = rand::thread_rng();
94+
let mut randoms = vec![0u128; num_rows];
95+
rng.fill(&mut randoms[..]);
96+
97+
let mut builder =
98+
GenericStringBuilder::<i32>::with_capacity(num_rows, num_rows * 36);
99+
100+
let mut buffer = [0u8; 36];
101+
for x in &mut randoms {
102+
// From Uuid::new_v4(): Mask out the version and variant bits
103+
*x = *x & 0xFFFFFFFFFFFF4FFFBFFFFFFFFFFFFFFF | 0x40008000000000000000;
104+
let uuid = Uuid::from_u128(*x);
105+
let fmt = uuid::fmt::Hyphenated::from_uuid(uuid);
106+
builder.append_value(fmt.encode_lower(&mut buffer));
107+
}
108+
109+
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
93110
}
94111

95112
fn documentation(&self) -> Option<&Documentation> {

datafusion/sqllogictest/test_files/functions.slt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,14 @@ select count(distinct u) from uuid_table;
720720
----
721721
2
722722

723+
# must be valid uuidv4 format
724+
query B
725+
SELECT REGEXP_LIKE(uuid(),
726+
'^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$')
727+
AS is_valid;
728+
----
729+
true
730+
723731
statement ok
724732
drop table uuid_table
725733

0 commit comments

Comments
 (0)