Skip to content

Commit

Permalink
Add DictionaryArray::occupancy (#4415)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold authored Jun 30, 2023
1 parent 6667646 commit 3354a4c
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 1 deletion.
5 changes: 5 additions & 0 deletions arrow-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,10 @@ simd = ["packed_simd"]

[dev-dependencies]
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
criterion = { version = "0.5", default-features = false }

[build-dependencies]

[[bench]]
name = "occupancy"
harness = false
57 changes: 57 additions & 0 deletions arrow-array/benches/occupancy.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow_array::types::Int32Type;
use arrow_array::{DictionaryArray, Int32Array};
use arrow_buffer::NullBuffer;
use criterion::*;
use rand::{thread_rng, Rng};
use std::sync::Arc;

/// Builds a random `DictionaryArray<Int32Type>` for benchmarking.
///
/// * `len` - number of keys to generate
/// * `values_len` - number of entries in the dictionary's values array
/// * `occupancy` - fraction of `values_len` that keys may point at; keys are
///   drawn from `0..(values_len * occupancy)`, truncated to an integer
/// * `null_percent` - despite the name this is handed straight to `gen_bool`,
///   i.e. it is the probability (not a 0-100 percentage) that a key is null
fn gen_dict(
    len: usize,
    values_len: usize,
    occupancy: f64,
    null_percent: f64,
) -> DictionaryArray<Int32Type> {
    let mut rng = thread_rng();
    let values = Int32Array::from(vec![0; values_len]);
    // `gen_range` panics on an empty range, which would happen whenever
    // `values_len * occupancy` truncates to 0. Always allow at least key 0.
    let max_key = ((values_len as f64 * occupancy) as i32).max(1);
    let keys = (0..len).map(|_| rng.gen_range(0..max_key)).collect();
    // Validity bit is `true` (non-null) with probability `1 - null_percent`
    let nulls = (0..len).map(|_| !rng.gen_bool(null_percent)).collect();

    let keys = Int32Array::new(keys, Some(NullBuffer::new(nulls)));
    DictionaryArray::new(keys, Arc::new(values))
}

/// Benchmarks `DictionaryArray::occupancy` over a grid of dictionary sizes,
/// key occupancies and null fractions, using 1024 keys per dictionary.
fn criterion_benchmark(c: &mut Criterion) {
    const KEY_COUNT: usize = 1024;
    const VALUE_COUNTS: [usize; 3] = [10, 100, 512];
    const OCCUPANCIES: [f64; 3] = [1., 0.5, 0.1];
    const NULL_FRACTIONS: [f64; 4] = [0.0, 0.1, 0.5, 0.9];

    for values in VALUE_COUNTS {
        for occupancy in OCCUPANCIES {
            for null_percent in NULL_FRACTIONS {
                // Input is generated once, outside of the measured closure
                let dict = gen_dict(KEY_COUNT, values, occupancy, null_percent);
                let id = format!("occupancy(values: {values}, occupancy: {occupancy}, null_percent: {null_percent})");
                c.bench_function(&id, |bencher| {
                    bencher.iter(|| black_box(&dict).occupancy());
                });
            }
        }
    }
}

// Register `criterion_benchmark` under the `benches` group and pass that group
// to `criterion_main!`. The bench target sets `harness = false` in Cargo.toml,
// so presumably `criterion_main!` supplies the entry point - confirm against
// the criterion docs.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
48 changes: 47 additions & 1 deletion arrow-array/src/array/dictionary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ use crate::{
make_array, Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType,
PrimitiveArray, StringArray,
};
use arrow_buffer::bit_util::set_bit;
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::ArrowNativeType;
use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
Expand Down Expand Up @@ -549,6 +550,29 @@ impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
.for_each(|v| *v = op(*v));
Ok(builder.finish())
}

/// Returns a bitmask describing which dictionary values are in use
///
/// The mask has one bit per entry of [`Self::values`]. A bit is set when at
/// least one non-null key of this [`DictionaryArray`] refers to that entry
pub fn occupancy(&self) -> BooleanBuffer {
    let values_len = self.values.len();
    let mut mask = BooleanBufferBuilder::new(values_len);
    // Size the mask to one bit per dictionary value before setting any bits
    mask.resize(values_len);
    let bits = mask.as_slice_mut();
    let keys = self.keys.values();

    match self.keys.nulls() {
        // Some keys are null: only visit the positions marked valid
        Some(nulls) if nulls.null_count() > 0 => {
            for idx in nulls.valid_indices() {
                set_bit(bits, keys[idx].as_usize());
            }
        }
        // No null buffer, or one without any nulls: every key counts
        _ => {
            for key in keys.iter() {
                set_bit(bits, key.as_usize());
            }
        }
    }

    mask.finish()
}
}

/// Constructs a `DictionaryArray` from an array data reference.
Expand Down Expand Up @@ -1207,4 +1231,26 @@ mod tests {
let expected = DictionaryArray::new(keys, Arc::new(values));
assert_eq!(expected, returned);
}

#[test]
fn test_occupancy() {
    // Case 1: no null buffer, keys reference exactly values 100..200
    let keys = Int32Array::new((100..200).collect(), None);
    let values = Int32Array::from(vec![0; 1024]);
    let dict = DictionaryArray::new(keys, Arc::new(values));
    let mask = dict.occupancy();
    // Without this the loop below would pass vacuously on an empty/short mask
    assert_eq!(mask.len(), 1024);
    for (idx, v) in mask.iter().enumerate() {
        let expected = (100..200).contains(&idx);
        assert_eq!(v, expected, "{idx}");
    }

    // Case 2: keys 0..100 where only every fourth key is valid, so only
    // values at multiples of 4 below 100 should be marked as occupied
    let keys = Int32Array::new(
        (0..100).collect(),
        Some((0..100).map(|x| x % 4 == 0).collect()),
    );
    let values = Int32Array::from(vec![0; 1024]);
    let dict = DictionaryArray::new(keys, Arc::new(values));
    let mask = dict.occupancy();
    assert_eq!(mask.len(), 1024);
    for (idx, v) in mask.iter().enumerate() {
        let expected = idx % 4 == 0 && idx < 100;
        assert_eq!(v, expected, "{idx}");
    }
}
}

0 comments on commit 3354a4c

Please sign in to comment.