Skip to content

Commit

Permalink
Support list in the fuzzer (only for implemented actions) (#1735)
Browse files Browse the repository at this point in the history
This allows the fuzzer to support arrays for which only some of the compute
functions are implemented.
  • Loading branch information
joseph-isaacs authored Jan 3, 2025
1 parent 29da540 commit 8d2fcd9
Show file tree
Hide file tree
Showing 13 changed files with 195 additions and 41 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ cargo-fuzz = true

[dependencies]
libfuzzer-sys = { workspace = true }
arrow-buffer = { workspace = true }
vortex-array = { workspace = true, features = ["arbitrary"] }
vortex-buffer = { workspace = true }
vortex-dtype = { workspace = true, features = ["arbitrary"] }
Expand Down
4 changes: 3 additions & 1 deletion fuzz/fuzz_targets/array_ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
use libfuzzer_sys::{fuzz_target, Corpus};
use vortex_array::aliases::hash_set::HashSet;
use vortex_array::array::{
BoolEncoding, PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
BoolEncoding, ListEncoding, PrimitiveEncoding, StructEncoding, VarBinEncoding,
VarBinViewEncoding,
};
use vortex_array::compute::{
filter, scalar_at, search_sorted, slice, take, SearchResult, SearchSortedSide,
Expand Down Expand Up @@ -48,6 +49,7 @@ fuzz_target!(|fuzz_action: FuzzArrayAction| -> Corpus {
&VarBinViewEncoding,
&BoolEncoding,
&StructEncoding,
&ListEncoding,
])
.contains(&current_array.encoding())
{
Expand Down
11 changes: 11 additions & 0 deletions fuzz/src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ use vortex_buffer::Buffer;
use vortex_dtype::{match_each_native_ptype, DType};
use vortex_error::VortexExpect;

use crate::take::take_canonical_array;

pub fn filter_canonical_array(array: &ArrayData, filter: &[bool]) -> ArrayData {
let validity = if array.dtype().is_nullable() {
let validity_buff = array
Expand Down Expand Up @@ -83,6 +85,15 @@ pub fn filter_canonical_array(array: &ArrayData, filter: &[bool]) -> ArrayData {
.unwrap()
.into_array()
}
DType::List(..) => {
let mut indices = Vec::new();
for (idx, bool) in filter.iter().enumerate() {
if *bool {
indices.push(idx);
}
}
take_canonical_array(array, &indices)
}
_ => unreachable!("Not a canonical array"),
}
}
35 changes: 33 additions & 2 deletions fuzz/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@ mod take;

use std::fmt::Debug;
use std::iter;
use std::ops::Range;
use std::ops::{Range, RangeInclusive};

use libfuzzer_sys::arbitrary::Error::EmptyChoose;
use libfuzzer_sys::arbitrary::{Arbitrary, Result, Unstructured};
pub use sort::sort_canonical_array;
use vortex_array::aliases::hash_set::HashSet;
use vortex_array::array::ListEncoding;
use vortex_array::compute::{scalar_at, FilterMask, SearchResult, SearchSortedSide};
use vortex_array::encoding::{Encoding, EncodingRef};
use vortex_array::{ArrayDType, ArrayData, IntoArrayData};
use vortex_buffer::Buffer;
use vortex_sampling_compressor::SamplingCompressor;
Expand Down Expand Up @@ -64,10 +67,13 @@ impl<'a> Arbitrary<'a> for FuzzArrayAction {
fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
let array = ArrayData::arbitrary(u)?;
let mut current_array = array.clone();

let valid_actions = actions_for_array(&current_array);

let mut actions = Vec::new();
let action_count = u.int_in_range(1..=4)?;
for _ in 0..action_count {
actions.push(match u.int_in_range(0..=4)? {
actions.push(match random_value_from_list(u, valid_actions.as_slice())? {
0 => {
if actions
.last()
Expand Down Expand Up @@ -164,3 +170,28 @@ fn random_vec_in_range(u: &mut Unstructured<'_>, min: usize, max: usize) -> Resu
})
.collect::<Result<Vec<_>>>()
}

/// Picks one entry of `vec` using entropy drawn from `u`.
///
/// Any error reported by `Unstructured::choose_iter` is propagated unchanged;
/// on success the chosen element is returned by value.
fn random_value_from_list(u: &mut Unstructured<'_>, vec: &[usize]) -> Result<usize> {
    let picked = u.choose_iter(vec)?;
    Ok(*picked)
}

/// Inclusive range of every fuzzer action index (`0` through `4`).
///
/// Per-encoding restrictions (see `actions_for_encoding`) are subsets of this range.
const ALL_ACTIONS: RangeInclusive<usize> = 0..=4;

/// Returns the set of fuzzer action indices that may be run against `encoding`.
///
/// `ListEncoding` is restricted to a fixed subset of actions; every other
/// encoding is allowed the whole of `ALL_ACTIONS`.
fn actions_for_encoding(encoding: EncodingRef) -> HashSet<usize> {
    if encoding.id() != ListEncoding::ID {
        return ALL_ACTIONS.collect();
    }
    // compress, slice and filter
    [0usize, 1, 4].iter().copied().collect()
}

/// Computes the action indices supported by every array that
/// `depth_first_traversal` yields for `array`.
///
/// Starts from all of `ALL_ACTIONS` and, for each visited array, drops any
/// action that `actions_for_encoding` does not allow for its encoding. The
/// surviving indices keep the ascending order of `ALL_ACTIONS`.
fn actions_for_array(array: &ArrayData) -> Vec<usize> {
    let mut permitted: Vec<usize> = ALL_ACTIONS.collect();
    for node in array.depth_first_traversal() {
        let supported = actions_for_encoding(node.encoding());
        permitted.retain(|action| supported.contains(action));
    }
    permitted
}
6 changes: 6 additions & 0 deletions fuzz/src/search_sorted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ pub fn search_sorted_canonical_array(
.collect::<Vec<_>>();
scalar_vals.search_sorted(&scalar.cast(array.dtype()).unwrap(), side)
}
DType::List(..) => {
let scalar_vals = (0..array.len())
.map(|i| scalar_at(array, i).unwrap())
.collect::<Vec<_>>();
scalar_vals.search_sorted(&scalar.cast(array.dtype()).unwrap(), side)
}
_ => unreachable!("Not a canonical array"),
}
}
45 changes: 38 additions & 7 deletions fuzz/src/slice.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use arrow_buffer::ArrowNativeType;
use vortex_array::accessor::ArrayAccessor;
use vortex_array::array::{BoolArray, PrimitiveArray, StructArray, VarBinViewArray};
use vortex_array::array::{BoolArray, ListArray, PrimitiveArray, StructArray, VarBinViewArray};
use vortex_array::validity::{ArrayValidity, Validity};
use vortex_array::variants::StructArrayTrait;
use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant};
use vortex_dtype::{match_each_native_ptype, DType};
use vortex_array::variants::{PrimitiveArrayTrait, StructArrayTrait};
use vortex_array::{ArrayDType, ArrayData, ArrayLen, IntoArrayData, IntoArrayVariant};
use vortex_dtype::{match_each_native_ptype, DType, NativePType};
use vortex_error::VortexExpect;

pub fn slice_canonical_array(array: &ArrayData, start: usize, stop: usize) -> ArrayData {
Expand All @@ -28,10 +29,12 @@ pub fn slice_canonical_array(array: &ArrayData, start: usize, stop: usize) -> Ar
.vortex_expect("Validity length cannot mismatch")
.into_array()
}
DType::Primitive(p, _) => match_each_native_ptype!(p, |$P| {
DType::Primitive(p, _) => {
let primitive_array = array.clone().into_primitive().unwrap();
PrimitiveArray::new(primitive_array.buffer::<$P>().slice(start..stop), validity).into_array()
}),
match_each_native_ptype!(p, |$P| {
PrimitiveArray::new(primitive_array.buffer::<$P>().slice(start..stop), validity).into_array()
})
}
DType::Utf8(_) | DType::Binary(_) => {
let utf8 = array.clone().into_varbinview().unwrap();
let values = utf8
Expand All @@ -55,6 +58,34 @@ pub fn slice_canonical_array(array: &ArrayData, start: usize, stop: usize) -> Ar
.unwrap()
.into_array()
}
DType::List(..) => {
let list_array = array.clone().into_list().unwrap();
let offsets = slice_canonical_array(&list_array.offsets(), start, stop + 1)
.into_primitive()
.unwrap();

let elements = slice_canonical_array(
&list_array.elements(),
offsets.get_as_cast::<u64>(0) as usize,
offsets.get_as_cast::<u64>(offsets.len() - 1) as usize,
);
let offsets = match_each_native_ptype!(offsets.ptype(), |$P| {
shift_offsets::<$P>(offsets)
})
.into_array();
ListArray::try_new(elements, offsets, validity)
.unwrap()
.into_array()
}
_ => unreachable!("Not a canonical array"),
}
}

/// Rebases `offsets` so that the first entry becomes zero.
///
/// An empty input is handed back untouched. Otherwise the first value is
/// subtracted from every entry and a new `PrimitiveArray` is built from the
/// shifted values.
fn shift_offsets<O: NativePType + ArrowNativeType>(offsets: PrimitiveArray) -> PrimitiveArray {
    if offsets.is_empty() {
        return offsets;
    }
    let values: &[O] = offsets.as_slice();
    let base = values[0];
    let rebased: Vec<O> = values.iter().map(|&offset| offset - base).collect();
    PrimitiveArray::from_iter(rebased)
}
12 changes: 11 additions & 1 deletion fuzz/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,17 @@ pub fn sort_canonical_array(array: &ArrayData) -> ArrayData {
});
take_canonical_array(array, &sort_indices)
}
_ => unreachable!("Not a canonical array"),
DType::List(..) => {
let mut sort_indices = (0..array.len()).collect::<Vec<_>>();
sort_indices.sort_by(|a, b| {
scalar_at(array, *a)
.unwrap()
.partial_cmp(&scalar_at(array, *b).unwrap())
.unwrap()
});
take_canonical_array(array, &sort_indices)
}
a => unreachable!("Not a canonical array {:?}", a),
}
}

Expand Down
44 changes: 34 additions & 10 deletions fuzz/src/take.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
use arrow_buffer::ArrowNativeType;
use vortex_array::accessor::ArrayAccessor;
use vortex_array::array::{BoolArray, PrimitiveArray, StructArray, VarBinViewArray};
use vortex_array::builders::{builder_with_capacity, ArrayBuilderExt};
use vortex_array::compute::scalar_at;
use vortex_array::validity::{ArrayValidity, Validity};
use vortex_array::variants::StructArrayTrait;
use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant};
use vortex_buffer::Buffer;
use vortex_dtype::{match_each_native_ptype, DType};
use vortex_dtype::{match_each_native_ptype, DType, NativePType};
use vortex_error::VortexExpect;

pub fn take_canonical_array(array: &ArrayData, indices: &[usize]) -> ArrayData {
Expand All @@ -31,16 +34,12 @@ pub fn take_canonical_array(array: &ArrayData, indices: &[usize]) -> ArrayData {
.vortex_expect("Validity length cannot mismatch")
.into_array()
}
DType::Primitive(p, _) => match_each_native_ptype!(p, |$P| {
DType::Primitive(p, _) => {
let primitive_array = array.clone().into_primitive().unwrap();
let vec_values = primitive_array
.as_slice::<$P>()
.iter()
.copied()
.collect::<Vec<_>>();
PrimitiveArray::new(indices.iter().map(|i| vec_values[*i]).collect::<Buffer<$P>>(), validity)
.into_array()
}),
match_each_native_ptype!(p, |$P| {
take_primitive::<$P>(primitive_array, validity, indices)
})
}
DType::Utf8(_) | DType::Binary(_) => {
let utf8 = array.clone().into_varbinview().unwrap();
let values = utf8
Expand Down Expand Up @@ -68,6 +67,31 @@ pub fn take_canonical_array(array: &ArrayData, indices: &[usize]) -> ArrayData {
.unwrap()
.into_array()
}
DType::List(..) => {
let mut builder = builder_with_capacity(array.dtype(), indices.len());
for idx in indices {
builder
.append_scalar(&scalar_at(array, *idx).unwrap())
.unwrap();
}
builder.finish().unwrap()
}
_ => unreachable!("Not a canonical array"),
}
}

/// Gathers the elements of `primitive_array` at `indices` into a new primitive array.
///
/// * `primitive_array` - source array, viewed as a slice of `T`.
/// * `validity` - validity attached to the result as-is; it is not re-indexed here.
/// * `indices` - positions to read, in output order.
///
/// # Panics
///
/// Panics if any entry of `indices` is out of bounds for the source slice.
fn take_primitive<T: NativePType + ArrowNativeType>(
    primitive_array: PrimitiveArray,
    validity: Validity,
    indices: &[usize],
) -> ArrayData {
    // Index the borrowed slice directly; cloning the whole buffer into a `Vec`
    // first only adds an allocation and a copy.
    let values = primitive_array.as_slice::<T>();
    let taken = indices
        .iter()
        .map(|&idx| values[idx])
        .collect::<Buffer<T>>();
    PrimitiveArray::new(taken, validity).into_array()
}
26 changes: 16 additions & 10 deletions vortex-array/src/array/arbitrary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Resu
.vortex_unwrap()
.into_array())
}
DType::List(ldt, n) => random_list(u, ldt, n),
DType::List(ldt, n) => random_list(u, ldt, n, chunk_len),
DType::Extension(..) => {
todo!("Extension arrays are not implemented")
}
Expand All @@ -106,14 +106,19 @@ fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Resu
}
}

fn random_list(u: &mut Unstructured, ldt: &Arc<DType>, n: &Nullability) -> Result<ArrayData> {
fn random_list(
u: &mut Unstructured,
ldt: &Arc<DType>,
n: &Nullability,
chunk_len: Option<usize>,
) -> Result<ArrayData> {
match u.int_in_range(0..=5)? {
0 => random_list_offset::<i16>(u, ldt, n),
1 => random_list_offset::<i32>(u, ldt, n),
2 => random_list_offset::<i64>(u, ldt, n),
3 => random_list_offset::<u16>(u, ldt, n),
4 => random_list_offset::<u32>(u, ldt, n),
5 => random_list_offset::<u64>(u, ldt, n),
0 => random_list_offset::<i16>(u, ldt, n, chunk_len),
1 => random_list_offset::<i32>(u, ldt, n, chunk_len),
2 => random_list_offset::<i64>(u, ldt, n, chunk_len),
3 => random_list_offset::<u16>(u, ldt, n, chunk_len),
4 => random_list_offset::<u32>(u, ldt, n, chunk_len),
5 => random_list_offset::<u64>(u, ldt, n, chunk_len),
_ => unreachable!("int_in_range returns a value in the above range"),
}
}
Expand All @@ -122,14 +127,15 @@ fn random_list_offset<O>(
u: &mut Unstructured,
ldt: &Arc<DType>,
n: &Nullability,
chunk_len: Option<usize>,
) -> Result<ArrayData>
where
O: PrimInt + NativePType,
Scalar: From<O>,
usize: AsPrimitive<O>,
{
let list_len = u.int_in_range(0..=20)?;
let mut builder = ListBuilder::<O>::with_capacity(ldt.clone(), *n, 1);
let list_len = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
let mut builder = ListBuilder::<O>::with_capacity(ldt.clone(), *n, 10);
for _ in 0..list_len {
if matches!(n, Nullability::Nullable) || u.arbitrary::<bool>()? {
let elem_len = u.int_in_range(0..=20)?;
Expand Down
21 changes: 20 additions & 1 deletion vortex-array/src/array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,14 +225,15 @@ impl ListArray {
mod test {
use std::sync::Arc;

use arrow_buffer::BooleanBuffer;
use vortex_dtype::Nullability;
use vortex_dtype::Nullability::NonNullable;
use vortex_dtype::PType::I32;
use vortex_scalar::Scalar;

use crate::array::list::ListArray;
use crate::array::PrimitiveArray;
use crate::compute::scalar_at;
use crate::compute::{filter, scalar_at, FilterMask};
use crate::validity::Validity;
use crate::{ArrayLen, IntoArrayData};

Expand Down Expand Up @@ -301,4 +302,22 @@ mod test {
scalar_at(&list_from_iter, 1).unwrap()
);
}

/// Filtering a three-element list array with a mask that keeps the last two
/// entries must succeed and yield exactly two lists.
#[test]
fn test_simple_list_filter() {
    // Offsets [0, 2, 4, 5] split the five elements into three lists:
    // [null, 2], [3, 4] and [5].
    let elements = PrimitiveArray::from_option_iter([None, Some(2), Some(3), Some(4), Some(5)]);
    let offsets = PrimitiveArray::from_iter([0, 2, 4, 5]);
    let validity = Validity::AllValid;

    let list = ListArray::try_new(elements.into_array(), offsets.into_array(), validity)
        .unwrap()
        .into_array();

    // Unwrap rather than `is_ok()` so a failure reports the underlying error.
    let filtered = filter(
        &list,
        FilterMask::from(BooleanBuffer::from(vec![false, true, true])),
    )
    .unwrap();

    // The mask selects two of the three lists.
    assert_eq!(filtered.len(), 2);
}
}
8 changes: 7 additions & 1 deletion vortex-array/src/arrow/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,14 @@ impl FromArrowArray<&ArrowStructArray> for ArrayData {

impl<O: OffsetSizeTrait + NativePType> FromArrowArray<&GenericListArray<O>> for ArrayData {
fn from_arrow(value: &GenericListArray<O>, nullable: bool) -> Self {
// Extract the nullability of the list's element field from the Arrow data type
let elem_nullable = match value.data_type() {
DataType::List(field) => field.is_nullable(),
DataType::LargeList(field) => field.is_nullable(),
dt => vortex_panic!("Invalid data type for ListArray: {dt}"),
};
ListArray::try_new(
Self::from_arrow(value.values().clone(), value.values().is_nullable()),
Self::from_arrow(value.values().clone(), elem_nullable),
// offsets are always non-nullable
ArrayData::from(value.offsets().clone()),
nulls(value.nulls(), nullable),
Expand Down
Loading

0 comments on commit 8d2fcd9

Please sign in to comment.