Skip to content

Commit

Permalink
Increase fuzz testing of streaming group by / low cardinality columns (
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb authored Oct 30, 2024
1 parent 2d7892b commit 7d34ccc
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 56 deletions.
24 changes: 13 additions & 11 deletions datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ use crate::fuzz_cases::aggregation_fuzzer::{
//
// TODO: test other aggregate functions
// - AVG (unstable given the wide range of inputs)
//
// TODO: specific test for ordering (ensure all group by columns are ordered)
// Currently the data is sorted by random columns, so there are almost no
// repeated runs. To improve coverage we should also sort by lower cardinality columns
#[tokio::test(flavor = "multi_thread")]
async fn test_min() {
let data_gen_config = baseline_config();
Expand All @@ -79,7 +75,7 @@ async fn test_min() {
.with_aggregate_function("min")
// min works on all column types
.with_aggregate_arguments(data_gen_config.all_columns())
.with_group_by_columns(data_gen_config.all_columns());
.set_group_by_columns(data_gen_config.all_columns());

AggregationFuzzerBuilder::from(data_gen_config)
.add_query_builder(query_builder)
Expand All @@ -98,7 +94,7 @@ async fn test_max() {
.with_aggregate_function("max")
// max works on all column types
.with_aggregate_arguments(data_gen_config.all_columns())
.with_group_by_columns(data_gen_config.all_columns());
.set_group_by_columns(data_gen_config.all_columns());

AggregationFuzzerBuilder::from(data_gen_config)
.add_query_builder(query_builder)
Expand All @@ -118,7 +114,7 @@ async fn test_sum() {
.with_distinct_aggregate_function("sum")
// sum only works on numeric columns
.with_aggregate_arguments(data_gen_config.numeric_columns())
.with_group_by_columns(data_gen_config.all_columns());
.set_group_by_columns(data_gen_config.all_columns());

AggregationFuzzerBuilder::from(data_gen_config)
.add_query_builder(query_builder)
Expand All @@ -138,7 +134,7 @@ async fn test_count() {
.with_distinct_aggregate_function("count")
// count work for all arguments
.with_aggregate_arguments(data_gen_config.all_columns())
.with_group_by_columns(data_gen_config.all_columns());
.set_group_by_columns(data_gen_config.all_columns());

AggregationFuzzerBuilder::from(data_gen_config)
.add_query_builder(query_builder)
Expand Down Expand Up @@ -174,15 +170,21 @@ fn baseline_config() -> DatasetGeneratorConfig {
// TODO add support for utf8view in data generator
// ColumnDescr::new("utf8view", DataType::Utf8View),
// todo binary
// low cardinality columns
ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),
ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10),
];

let min_num_rows = 512;
let max_num_rows = 1024;

DatasetGeneratorConfig {
columns,
rows_num_range: (512, 1024),
rows_num_range: (min_num_rows, max_num_rows),
sort_keys_set: vec![
// low cardinality to try and get many repeated runs
vec![String::from("u8")],
vec![String::from("utf8"), String::from("u8")],
vec![String::from("u8_low")],
vec![String::from("utf8_low"), String::from("u8_low")],
],
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,16 @@ impl Dataset {

#[derive(Debug, Clone)]
pub struct ColumnDescr {
// Column name
/// Column name
name: String,

// Data type of this column
/// Data type of this column
column_type: DataType,

/// The maximum number of distinct values in this column.
///
/// See [`ColumnDescr::with_max_num_distinct`] for more information
max_num_distinct: Option<usize>,
}

impl ColumnDescr {
Expand All @@ -187,8 +192,18 @@ impl ColumnDescr {
Self {
name: name.to_string(),
column_type,
max_num_distinct: None,
}
}

/// set the maximum number of distinct values in this column
///
/// If `None`, the number of distinct values is randomly selected between 1
/// and the number of rows.
pub fn with_max_num_distinct(mut self, num_distinct: usize) -> Self {
self.max_num_distinct = Some(num_distinct);
self
}
}

/// Record batch generator
Expand All @@ -203,20 +218,15 @@ struct RecordBatchGenerator {
}

macro_rules! generate_string_array {
($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
let max_len = $BATCH_GEN_RNG.gen_range(1..50);
let num_distinct_strings = if $NUM_ROWS > 1 {
$BATCH_GEN_RNG.gen_range(1..$NUM_ROWS)
} else {
$NUM_ROWS
};

let mut generator = StringArrayGenerator {
max_len,
num_strings: $NUM_ROWS,
num_distinct_strings,
num_distinct_strings: $MAX_NUM_DISTINCT,
null_pct,
rng: $ARRAY_GEN_RNG,
};
Expand All @@ -226,19 +236,14 @@ macro_rules! generate_string_array {
}

macro_rules! generate_primitive_array {
($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {
paste::paste! {{
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
let num_distinct_primitives = if $NUM_ROWS > 1 {
$BATCH_GEN_RNG.gen_range(1..$NUM_ROWS)
} else {
$NUM_ROWS
};

let mut generator = PrimitiveArrayGenerator {
num_primitives: $NUM_ROWS,
num_distinct_primitives,
num_distinct_primitives: $MAX_NUM_DISTINCT,
null_pct,
rng: $ARRAY_GEN_RNG,
};
Expand Down Expand Up @@ -268,7 +273,7 @@ impl RecordBatchGenerator {
let mut arrays = Vec::with_capacity(self.columns.len());
for col in self.columns.iter() {
let array = self.generate_array_of_type(
col.column_type.clone(),
col,
num_rows,
&mut rng,
array_gen_rng.clone(),
Expand All @@ -289,16 +294,28 @@ impl RecordBatchGenerator {

fn generate_array_of_type(
&self,
data_type: DataType,
col: &ColumnDescr,
num_rows: usize,
batch_gen_rng: &mut ThreadRng,
array_gen_rng: StdRng,
) -> ArrayRef {
match data_type {
let num_distinct = if num_rows > 1 {
batch_gen_rng.gen_range(1..num_rows)
} else {
num_rows
};
// cap to at most the num_distinct values
let max_num_distinct = col
.max_num_distinct
.map(|max| num_distinct.min(max))
.unwrap_or(num_distinct);

match col.column_type {
DataType::Int8 => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Int8Type
Expand All @@ -308,6 +325,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Int16Type
Expand All @@ -317,6 +335,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Int32Type
Expand All @@ -326,6 +345,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Int64Type
Expand All @@ -335,6 +355,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
UInt8Type
Expand All @@ -344,6 +365,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
UInt16Type
Expand All @@ -353,6 +375,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
UInt32Type
Expand All @@ -362,6 +385,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
UInt64Type
Expand All @@ -371,6 +395,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Float32Type
Expand All @@ -380,6 +405,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Float64Type
Expand All @@ -389,6 +415,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Date32Type
Expand All @@ -398,19 +425,34 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Date64Type
)
}
DataType::Utf8 => {
generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i32)
generate_string_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
i32
)
}
DataType::LargeUtf8 => {
generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i64)
generate_string_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
i64
)
}
_ => {
panic!("Unsupported data generator type: {data_type}")
panic!("Unsupported data generator type: {}", col.column_type)
}
}
}
Expand All @@ -435,14 +477,8 @@ mod test {
// - Their rows num should be same and between [16, 32]
let config = DatasetGeneratorConfig {
columns: vec![
ColumnDescr {
name: "a".to_string(),
column_type: DataType::Utf8,
},
ColumnDescr {
name: "b".to_string(),
column_type: DataType::UInt32,
},
ColumnDescr::new("a", DataType::Utf8),
ColumnDescr::new("b", DataType::UInt32),
],
rows_num_range: (16, 32),
sort_keys_set: vec![vec!["b".to_string()]],
Expand Down
Loading

0 comments on commit 7d34ccc

Please sign in to comment.