diff --git a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs index 63a4c85f9e80..8b478111c1f9 100644 --- a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs +++ b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs @@ -20,6 +20,12 @@ use crate::aggregate::tdigest::{TDigest, DEFAULT_MAX_SIZE}; use crate::aggregate::utils::down_cast_any_ref; use crate::expressions::format_state_name; use crate::{AggregateExpr, PhysicalExpr}; +use std::any::Any; +use std::fmt::Debug; +use std::sync::Arc; + +use arrow::array::{Array, RecordBatch}; +use arrow::compute::{filter, is_not_null}; use arrow::{ array::{ ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, @@ -27,14 +33,12 @@ use arrow::{ }, datatypes::{DataType, Field}, }; -use arrow_array::RecordBatch; use arrow_schema::Schema; use datafusion_common::{ downcast_value, internal_err, not_impl_err, plan_err, DataFusionError, Result, ScalarValue, }; use datafusion_expr::{Accumulator, ColumnarValue}; -use std::{any::Any, sync::Arc}; /// APPROX_PERCENTILE_CONT aggregate expression #[derive(Debug)] @@ -383,8 +387,11 @@ impl Accumulator for ApproxPercentileAccumulator { } fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - let values = &values[0]; - let sorted_values = &arrow::compute::sort(values, None)?; + let mut values = Arc::clone(&values[0]); + if values.nulls().is_some() { + values = filter(&values, &is_not_null(&values)?)?; + } + let sorted_values = &arrow::compute::sort(&values, None)?; let sorted_values = ApproxPercentileAccumulator::convert_to_float(sorted_values)?; self.digest = self.digest.merge_sorted_f64(&sorted_values); Ok(()) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 29d21487662c..9b3d17e87e29 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -1218,6 +1218,12 @@ SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.9) AS DOUBLE) / 0.834) < 0.05 ---- true +# percentile_cont_with_nulls +query I +SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (1), (2), (3), (NULL), (NULL), (NULL)) as t (v); +---- +2 + # csv_query_cube_avg query TIR SELECT c1, c2, AVG(c3) FROM aggregate_test_100 GROUP BY CUBE (c1, c2) ORDER BY c1, c2