Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions datafusion/common/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3525,6 +3525,12 @@ impl ScalarValue {
}
}
}

/// Compacts ([ScalarValue::compact]) the current [ScalarValue] and returns it.
pub fn compacted(mut self) -> Self {
Comment on lines +3529 to +3530
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would there be any benefit in adding #[inline] to this, since it's a small function?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤔 I don't have enough evidence to justify that #[inline] is better here, the function is not really in the hot path of any operation, if you ask me I'd just trust the compiler to do what's right.

self.compact();
self
}
}

pub fn copy_array_data(data: &ArrayData) -> ArrayData {
Expand Down
83 changes: 78 additions & 5 deletions datafusion/functions-aggregate/src/array_agg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ use std::mem::{size_of, size_of_val};
use std::sync::Arc;

use arrow::array::{
new_empty_array, Array, ArrayRef, AsArray, BooleanArray, ListArray, StructArray,
make_array, new_empty_array, Array, ArrayRef, AsArray, BooleanArray, ListArray,
StructArray,
};
use arrow::compute::{filter, SortOptions};
use arrow::datatypes::{DataType, Field, FieldRef, Fields};

use datafusion_common::cast::as_list_array;
use datafusion_common::scalar::copy_array_data;
use datafusion_common::utils::{get_row_at_idx, SingleRowListArrayBuilder};
use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
Expand Down Expand Up @@ -313,7 +315,11 @@ impl Accumulator for ArrayAggAccumulator {
};

if !val.is_empty() {
self.values.push(val);
// The ArrayRef might be holding a reference to its original input buffer, so
// storing it here directly copied/compacted avoids over accounting memory
// not used here.
self.values
.push(make_array(copy_array_data(&val.to_data())));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found this code confusing at first too so I tried to add some additional documentation

Another thing that might make this code easier to understand would be to refactor it into a function, so it looks more like

Suggested change
.push(make_array(copy_array_data(&val.to_data())));
.push(copy_array(val))

Or something like that

/// Copies an array to a new array with minimal memory overhead
fn copy_array(array: &dyn Array) -> ArrayRef {
..
}

Or something like that .

This is definitely not required — just something that occurred to me while reviewing

}

Ok(())
Expand Down Expand Up @@ -423,7 +429,8 @@ impl Accumulator for DistinctArrayAggAccumulator {
if nulls.is_none_or(|nulls| nulls.null_count() < val.len()) {
for i in 0..val.len() {
if nulls.is_none_or(|nulls| nulls.is_valid(i)) {
self.values.insert(ScalarValue::try_from_array(val, i)?);
self.values
.insert(ScalarValue::try_from_array(val, i)?.compacted());
}
}
}
Expand Down Expand Up @@ -577,8 +584,14 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
if nulls.is_none_or(|nulls| nulls.null_count() < val.len()) {
for i in 0..val.len() {
if nulls.is_none_or(|nulls| nulls.is_valid(i)) {
self.values.push(ScalarValue::try_from_array(val, i)?);
self.ordering_values.push(get_row_at_idx(ord, i)?)
self.values
.push(ScalarValue::try_from_array(val, i)?.compacted());
self.ordering_values.push(
get_row_at_idx(ord, i)?
.into_iter()
.map(|v| v.compacted())
.collect(),
)
}
}
}
Expand Down Expand Up @@ -714,6 +727,7 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::{ListBuilder, StringBuilder};
use arrow::datatypes::{FieldRef, Schema};
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::internal_err;
Expand Down Expand Up @@ -980,6 +994,56 @@ mod tests {
Ok(())
}

#[test]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I verified these tests cover the code in this PR -- they fail without the changes in the PR


assertion `left == right` failed
  left: 2652
 right: 732

fn does_not_over_account_memory() -> Result<()> {
    // Build two plain (non-distinct, unordered) string accumulators,
    // feed each one a batch, then merge the second into the first.
    let (mut primary, mut secondary) = ArrayAggAccumulatorBuilder::string().build_two()?;

    primary.update_batch(&[data(["a", "c", "b"])])?;
    secondary.update_batch(&[data(["b", "c", "a"])])?;
    let merged = merge(primary, secondary)?;

    // Without compaction the accumulator keeps references to the original
    // input buffers and reports 2652 bytes; compacted, it only accounts
    // for the memory it actually retains.
    assert_eq!(merged.size(), 732);

    Ok(())
}
#[test]
fn does_not_over_account_memory_distinct() -> Result<()> {
    // Two DISTINCT string accumulators over list-of-string input.
    let (mut left, mut right) = ArrayAggAccumulatorBuilder::string()
        .distinct()
        .build_two()?;

    left.update_batch(&[string_list_data([
        vec!["a", "b", "c"],
        vec!["d", "e", "f"],
    ])])?;
    right.update_batch(&[string_list_data([vec!["e", "f", "g"]])])?;
    let merged = merge(left, right)?;

    // Without compacting each inserted ScalarValue the reported size is
    // 16660 bytes, because every value pins its source buffer.
    assert_eq!(merged.size(), 1660);

    Ok(())
}

#[test]
fn does_not_over_account_memory_ordered() -> Result<()> {
    // An order-sensitive accumulator sorting descending on "col".
    let mut accumulator = ArrayAggAccumulatorBuilder::string()
        .order_by_col("col", SortOptions::new(false, false))
        .build()?;

    let batch = string_list_data([
        vec!["a", "b", "c"],
        vec!["c", "d", "e"],
        vec!["b", "c", "d"],
    ]);
    accumulator.update_batch(&[batch])?;

    // Without compacting the stored values and ordering values the
    // reported size is 17112 bytes.
    assert_eq!(accumulator.size(), 2112);

    Ok(())
}

struct ArrayAggAccumulatorBuilder {
return_field: FieldRef,
distinct: bool,
Expand Down Expand Up @@ -1059,6 +1123,15 @@ mod tests {
.collect()
}

/// Builds a `List<Utf8>` array with one list entry per inner `Vec` of strings.
fn string_list_data<'a>(data: impl IntoIterator<Item = Vec<&'a str>>) -> ArrayRef {
    let mut builder = ListBuilder::new(StringBuilder::new());
    for string_list in data {
        // `append_value` accepts any `IntoIterator<Item = Option<_>>`, so
        // there is no need to collect into an intermediate `Vec` first.
        builder.append_value(string_list.iter().map(Some));
    }

    Arc::new(builder.finish())
}

fn data<T, const N: usize>(list: [T; N]) -> ArrayRef
where
ScalarValue: From<T>,
Expand Down