From 3354a4c5d5b5b6fd3ec55c49fd2f0930b935d07e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:45:16 +0100 Subject: [PATCH] Add DictionaryArray::occupancy (#4415) --- arrow-array/Cargo.toml | 5 ++ arrow-array/benches/occupancy.rs | 57 +++++++++++++++++++++++ arrow-array/src/array/dictionary_array.rs | 48 ++++++++++++++++++- 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 arrow-array/benches/occupancy.rs diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index f2703bb6fca0..d4f0f9fa0d47 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -56,5 +56,10 @@ simd = ["packed_simd"] [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +criterion = { version = "0.5", default-features = false } [build-dependencies] + +[[bench]] +name = "occupancy" +harness = false diff --git a/arrow-array/benches/occupancy.rs b/arrow-array/benches/occupancy.rs new file mode 100644 index 000000000000..ed4b94351c28 --- /dev/null +++ b/arrow-array/benches/occupancy.rs @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::types::Int32Type; +use arrow_array::{DictionaryArray, Int32Array}; +use arrow_buffer::NullBuffer; +use criterion::*; +use rand::{thread_rng, Rng}; +use std::sync::Arc; + +fn gen_dict( + len: usize, + values_len: usize, + occupancy: f64, + null_percent: f64, +) -> DictionaryArray { + let mut rng = thread_rng(); + let values = Int32Array::from(vec![0; values_len]); + let max_key = (values_len as f64 * occupancy) as i32; + let keys = (0..len).map(|_| rng.gen_range(0..max_key)).collect(); + let nulls = (0..len).map(|_| !rng.gen_bool(null_percent)).collect(); + + let keys = Int32Array::new(keys, Some(NullBuffer::new(nulls))); + DictionaryArray::new(keys, Arc::new(values)) +} + +fn criterion_benchmark(c: &mut Criterion) { + for values in [10, 100, 512] { + for occupancy in [1., 0.5, 0.1] { + for null_percent in [0.0, 0.1, 0.5, 0.9] { + let dict = gen_dict(1024, values, occupancy, null_percent); + c.bench_function(&format!("occupancy(values: {values}, occupancy: {occupancy}, null_percent: {null_percent})"), |b| { + b.iter(|| { + black_box(&dict).occupancy() + }); + }); + } + } + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index b9112d103a89..5a2f439a8e0f 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -23,8 +23,9 @@ use crate::{ make_array, Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, PrimitiveArray, StringArray, }; +use arrow_buffer::bit_util::set_bit; use arrow_buffer::buffer::NullBuffer; -use arrow_buffer::ArrowNativeType; +use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -549,6 +550,29 @@ impl DictionaryArray { .for_each(|v| *v = op(*v)); Ok(builder.finish()) } + + /// Computes an occupancy mask for this dictionary's values + /// + /// For each value in [`Self::values`] the corresponding bit will be set in the + /// returned mask if it is referenced by a key in this [`DictionaryArray`] + pub fn occupancy(&self) -> BooleanBuffer { + let len = self.values.len(); + let mut builder = BooleanBufferBuilder::new(len); + builder.resize(len); + let slice = builder.as_slice_mut(); + match self.keys.nulls().filter(|n| n.null_count() > 0) { + Some(n) => { + let v = self.keys.values(); + n.valid_indices() + .for_each(|idx| set_bit(slice, v[idx].as_usize())) + } + None => { + let v = self.keys.values(); + v.iter().for_each(|v| set_bit(slice, v.as_usize())) + } + } + builder.finish() + } } /// Constructs a `DictionaryArray` from an array data reference. @@ -1207,4 +1231,26 @@ mod tests { let expected = DictionaryArray::new(keys, Arc::new(values)); assert_eq!(expected, returned); } + + #[test] + fn test_occupancy() { + let keys = Int32Array::new((100..200).collect(), None); + let values = Int32Array::from(vec![0; 1024]); + let dict = DictionaryArray::new(keys, Arc::new(values)); + for (idx, v) in dict.occupancy().iter().enumerate() { + let expected = (100..200).contains(&idx); + assert_eq!(v, expected, "{idx}"); + } + + let keys = Int32Array::new( + (0..100).collect(), + Some((0..100).map(|x| x % 4 == 0).collect()), + ); + let values = Int32Array::from(vec![0; 1024]); + let dict = DictionaryArray::new(keys, Arc::new(values)); + for (idx, v) in dict.occupancy().iter().enumerate() { + let expected = idx % 4 == 0 && idx < 100; + assert_eq!(v, expected, "{idx}"); + } + } }