diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 1851ba604ff..08efa9eec66 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -44,11 +44,21 @@ Status AggregateFinalize(KernelContext* ctx, Datum* out) { void AddAggKernel(std::shared_ptr sig, KernelInit init, ScalarAggregateFunction* func, SimdLevel::type simd_level) { - ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge, - AggregateFinalize); + ScalarAggregateKernel kernel(std::move(sig), std::move(init), AggregateConsume, + AggregateMerge, AggregateFinalize); + // Set the simd level + kernel.simd_level = simd_level; + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddAggKernel(std::shared_ptr sig, KernelInit init, + ScalarAggregateFinalize finalize, ScalarAggregateFunction* func, + SimdLevel::type simd_level) { + ScalarAggregateKernel kernel(std::move(sig), std::move(init), AggregateConsume, + AggregateMerge, std::move(finalize)); // Set the simd level kernel.simd_level = simd_level; - DCHECK_OK(func->AddKernel(kernel)); + DCHECK_OK(func->AddKernel(std::move(kernel))); } namespace aggregate { @@ -314,9 +324,7 @@ void AddMinOrMaxAggKernel(ScalarAggregateFunction* func, // Note SIMD level is always NONE, but the convenience kernel will // dispatch to an appropriate implementation - ScalarAggregateKernel kernel(std::move(sig), std::move(init), AggregateConsume, - AggregateMerge, std::move(finalize)); - DCHECK_OK(func->AddKernel(kernel)); + AddAggKernel(std::move(sig), std::move(init), std::move(finalize), func); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/aggregate_internal.h b/cpp/src/arrow/compute/kernels/aggregate_internal.h index b0aced3e346..33ccefd4cbd 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_internal.h @@ -106,6 +106,10 @@ void AddAggKernel(std::shared_ptr sig, KernelInit init, ScalarAggregateFunction* func, SimdLevel::type simd_level = SimdLevel::NONE); +void AddAggKernel(std::shared_ptr sig, KernelInit init, + ScalarAggregateFinalize finalize, ScalarAggregateFunction* func, + SimdLevel::type simd_level = SimdLevel::NONE); + namespace detail { using arrow::internal::VisitSetBitRunsVoid; diff --git a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc index 52ddb3674b4..0fddf38f575 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc @@ -167,6 +167,13 @@ const FunctionDoc tdigest_doc{ {"array"}, "TDigestOptions"}; +const FunctionDoc approximate_median_doc{ + "Approximate median of a numeric array with T-Digest algorithm", + ("Nulls and NaNs are ignored.\n" + "A null scalar is returned if there is no valid data point."), + {"array"}, + "ScalarAggregateOptions"}; + std::shared_ptr AddTDigestAggKernels() { static auto default_tdigest_options = TDigestOptions::Defaults(); auto func = std::make_shared( @@ -175,10 +182,52 @@ std::shared_ptr AddTDigestAggKernels() { return func; } +std::shared_ptr AddApproximateMedianAggKernels( + const ScalarAggregateFunction* tdigest_func) { + static ScalarAggregateOptions default_scalar_aggregate_options; + + auto median = std::make_shared( + "approximate_median", Arity::Unary(), &approximate_median_doc, + &default_scalar_aggregate_options); + + auto sig = + KernelSignature::Make({InputType(ValueDescr::ANY)}, ValueDescr::Scalar(float64())); + + auto init = [tdigest_func]( + KernelContext* ctx, + const KernelInitArgs& args) -> Result> { + std::vector inputs = args.inputs; + ARROW_ASSIGN_OR_RAISE(auto kernel, tdigest_func->DispatchBest(&inputs)); + const auto& scalar_options = + checked_cast(*args.options); + TDigestOptions options; + // Default q = 0.5 + options.min_count = scalar_options.min_count; + options.skip_nulls = scalar_options.skip_nulls; + KernelInitArgs new_args{kernel, inputs, &options}; + return kernel->init(ctx, new_args); + }; + + auto finalize = [](KernelContext* ctx, Datum* out) -> Status { + Datum temp; + RETURN_NOT_OK(checked_cast(ctx->state())->Finalize(ctx, &temp)); + const auto arr = temp.make_array(); + DCHECK_EQ(arr->length(), 1); + return arr->GetScalar(0).Value(out); + }; + + AddAggKernel(std::move(sig), std::move(init), std::move(finalize), median.get()); + return median; +} + } // namespace void RegisterScalarAggregateTDigest(FunctionRegistry* registry) { - DCHECK_OK(registry->AddFunction(AddTDigestAggKernels())); + auto tdigest = AddTDigestAggKernels(); + DCHECK_OK(registry->AddFunction(tdigest)); + + auto approx_median = AddApproximateMedianAggKernels(tdigest.get()); + DCHECK_OK(registry->AddFunction(approx_median)); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 98bf156195f..7db3f292269 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -3447,5 +3447,79 @@ TEST(TestTDigestKernel, Options) { ResultWith(ArrayFromJSON(ty, "[null]"))); } +TEST(TestTDigestKernel, ApproximateMedian) { + // This is a wrapper for TDigest + for (const auto& ty : {float64(), int64(), uint16()}) { + ScalarAggregateOptions keep_nulls(/*skip_nulls=*/false, /*min_count=*/0); + ScalarAggregateOptions min_count(/*skip_nulls=*/true, /*min_count=*/3); + ScalarAggregateOptions keep_nulls_min_count(/*skip_nulls=*/false, /*min_count=*/3); + + EXPECT_THAT( + CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 3]")}, &keep_nulls), + ResultWith(ScalarFromJSON(float64(), "2.0"))); + EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 3, null]")}, + &keep_nulls), + ResultWith(ScalarFromJSON(float64(), "null"))); + EXPECT_THAT( + CallFunction("approximate_median", + {ChunkedArrayFromJSON(ty, {"[1, 2]", "[]", "[3]"})}, &keep_nulls), + ResultWith(ScalarFromJSON(float64(), "2.0"))); + EXPECT_THAT(CallFunction("approximate_median", + {ChunkedArrayFromJSON(ty, {"[1, 2]", "[null]", "[3]"})}, + &keep_nulls), + ResultWith(ScalarFromJSON(float64(), "null"))); + EXPECT_THAT( + CallFunction("approximate_median", {ScalarFromJSON(ty, "1")}, &keep_nulls), + ResultWith(ScalarFromJSON(float64(), "1.0"))); + EXPECT_THAT( + CallFunction("approximate_median", {ScalarFromJSON(ty, "null")}, &keep_nulls), + ResultWith(ScalarFromJSON(float64(), "null"))); + + EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 3, null]")}, + &min_count), + ResultWith(ScalarFromJSON(float64(), "2.0"))); + EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, null]")}, + &min_count), + ResultWith(ScalarFromJSON(float64(), "null"))); + EXPECT_THAT( + CallFunction("approximate_median", + {ChunkedArrayFromJSON(ty, {"[1, 2]", "[]", "[3]"})}, &keep_nulls), + ResultWith(ScalarFromJSON(float64(), "2.0"))); + EXPECT_THAT(CallFunction("approximate_median", + {ChunkedArrayFromJSON(ty, {"[1, 2]", "[null]", "[3]"})}, + &keep_nulls), + ResultWith(ScalarFromJSON(float64(), "null"))); + EXPECT_THAT(CallFunction("approximate_median", {ScalarFromJSON(ty, "1")}, &min_count), + ResultWith(ScalarFromJSON(float64(), "null"))); + EXPECT_THAT( + CallFunction("approximate_median", {ScalarFromJSON(ty, "null")}, &min_count), + ResultWith(ScalarFromJSON(float64(), "null"))); + + EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 3]")}, + &keep_nulls_min_count), + ResultWith(ScalarFromJSON(float64(), "2.0"))); + EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2]")}, + &keep_nulls_min_count), + ResultWith(ScalarFromJSON(float64(), "null"))); + EXPECT_THAT(CallFunction("approximate_median", + {ChunkedArrayFromJSON(ty, {"[1, 2]", "[]", "[3]"})}, + &keep_nulls_min_count), + ResultWith(ScalarFromJSON(float64(), "2.0"))); + EXPECT_THAT(CallFunction("approximate_median", + {ChunkedArrayFromJSON(ty, {"[1, 2]", "[null]", "[3]"})}, + &keep_nulls_min_count), + ResultWith(ScalarFromJSON(float64(), "null"))); + EXPECT_THAT(CallFunction("approximate_median", {ArrayFromJSON(ty, "[1, 2, 3, null]")}, + &keep_nulls_min_count), + ResultWith(ScalarFromJSON(float64(), "null"))); + EXPECT_THAT(CallFunction("approximate_median", {ScalarFromJSON(ty, "1")}, + &keep_nulls_min_count), + ResultWith(ScalarFromJSON(float64(), "null"))); + EXPECT_THAT(CallFunction("approximate_median", {ScalarFromJSON(ty, "null")}, + &keep_nulls_min_count), + ResultWith(ScalarFromJSON(float64(), "null"))); + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 843d62911a7..38dba0410ee 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -1734,6 +1734,37 @@ struct GroupedTDigestFactory { InputType argument_type; }; +HashAggregateKernel MakeApproximateMedianKernel(HashAggregateFunction* tdigest_func) { + HashAggregateKernel kernel; + kernel.init = [tdigest_func]( + KernelContext* ctx, + const KernelInitArgs& args) -> Result> { + std::vector inputs = args.inputs; + ARROW_ASSIGN_OR_RAISE(auto kernel, tdigest_func->DispatchBest(&inputs)); + const auto& scalar_options = + checked_cast(*args.options); + TDigestOptions options; + // Default q = 0.5 + options.min_count = scalar_options.min_count; + options.skip_nulls = scalar_options.skip_nulls; + KernelInitArgs new_args{kernel, inputs, &options}; + return kernel->init(ctx, new_args); + }; + kernel.signature = + KernelSignature::Make({InputType(ValueDescr::ANY), InputType::Array(Type::UINT32)}, + ValueDescr::Array(float64())); + kernel.resize = HashAggregateResize; + kernel.consume = HashAggregateConsume; + kernel.merge = HashAggregateMerge; + kernel.finalize = [](KernelContext* ctx, Datum* out) { + ARROW_ASSIGN_OR_RAISE(Datum temp, + checked_cast(ctx->state())->Finalize()); + *out = temp.array_as()->values(); + return Status::OK(); + }; + return kernel; +} + // ---------------------------------------------------------------------- // MinMax implementation @@ -2636,6 +2667,13 @@ const FunctionDoc hash_tdigest_doc{ {"array", "group_id_array"}, "TDigestOptions"}; +const FunctionDoc hash_approximate_median_doc{ + "Calculate approximate medians of a numeric array with the T-Digest algorithm", + ("Nulls and NaNs are ignored.\n" + "Null is emitted for a group if there are no valid data points."), + {"array", "group_id_array"}, + "ScalarAggregateOptions"}; + const FunctionDoc hash_min_max_doc{ "Compute the minimum and maximum values of a numeric array", ("Null values are ignored by default.\n" @@ -2760,6 +2798,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } + HashAggregateFunction* tdigest_func = nullptr; { auto func = std::make_shared( "hash_tdigest", Arity::Binary(), &hash_tdigest_doc, &default_tdigest_options); @@ -2769,6 +2808,15 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { AddHashAggKernels(UnsignedIntTypes(), GroupedTDigestFactory::Make, func.get())); DCHECK_OK( AddHashAggKernels(FloatingPointTypes(), GroupedTDigestFactory::Make, func.get())); + tdigest_func = func.get(); + DCHECK_OK(registry->AddFunction(std::move(func))); + } + + { + auto func = std::make_shared( + "hash_approximate_median", Arity::Binary(), &hash_approximate_median_doc, + &default_scalar_aggregate_options); + DCHECK_OK(func->AddKernel(MakeApproximateMedianKernel(tdigest_func))); DCHECK_OK(registry->AddFunction(std::move(func))); } diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index f90e71bf670..a43a96767f2 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -1154,6 +1154,70 @@ TEST(GroupBy, TDigest) { /*verbose=*/true); } +TEST(GroupBy, ApproximateMedian) { + for (const auto& type : {float64(), int8()}) { + auto batch = + RecordBatchFromJSON(schema({field("argument", type), field("key", int64())}), R"([ + [1, 1], + [null, 1], + [0, 2], + [null, 3], + [1, 4], + [4, null], + [3, 1], + [0, 2], + [-1, 2], + [1, null], + [null, 3], + [1, 4], + [1, 4], + [null, 4] + ])"); + + ScalarAggregateOptions options; + ScalarAggregateOptions keep_nulls( + /*skip_nulls=*/false, /*min_count=*/0); + ScalarAggregateOptions min_count( + /*skip_nulls=*/true, /*min_count=*/3); + ScalarAggregateOptions keep_nulls_min_count( + /*skip_nulls=*/false, /*min_count=*/3); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + internal::GroupBy( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_approximate_median", &options}, + {"hash_approximate_median", &keep_nulls}, + {"hash_approximate_median", &min_count}, + {"hash_approximate_median", &keep_nulls_min_count}, + })); + + AssertDatumsApproxEqual(ArrayFromJSON(struct_({ + field("hash_approximate_median", float64()), + field("hash_approximate_median", float64()), + field("hash_approximate_median", float64()), + field("hash_approximate_median", float64()), + field("key_0", int64()), + }), + R"([ + [1.0, null, null, null, 1], + [0.0, 0.0, 0.0, 0.0, 2], + [null, null, null, null, 3], + [1.0, null, 1.0, null, 4], + [1.0, 1.0, null, null, null] + ])"), + aggregated_and_grouped, + /*verbose=*/true); + } +} + TEST(GroupBy, StddevVarianceTDigestScalar) { BatchesWithSchema input; input.batches = { diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index b10c7a120b2..acf0f1f0727 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -188,39 +188,41 @@ Aggregations Scalar aggregations operate on a (chunked) array or scalar value and reduce the input to a single output value. -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+===============+=======+==================+========================+==================================+=======+ -| all | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| any | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| count | Unary | Any | Scalar Int64 | :struct:`CountOptions` | \(2) | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| index | Unary | Any | Scalar Int64 | :struct:`IndexOptions` | | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| max | Unary | Non-nested types | Scalar Input type | :struct:`ScalarAggregateOptions` | | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| mean | Unary | Numeric | Scalar Decimal/Float64 | :struct:`ScalarAggregateOptions` | | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| min | Unary | Non-nested types | Scalar Input type | :struct:`ScalarAggregateOptions` | | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| min_max | Unary | Non-nested types | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| mode | Unary | Numeric | Struct | :struct:`ModeOptions` | \(4) | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| product | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| quantile | Unary | Numeric | Scalar Numeric | :struct:`QuantileOptions` | \(6) | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(7) | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ -| variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | -+---------------+-------+------------------+------------------------+----------------------------------+-------+ ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++====================+=======+==================+========================+==================================+=======+ +| all | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| any | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| approximate_median | Unary | Numeric | Scalar Float64 | :struct:`ScalarAggregateOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| count | Unary | Any | Scalar Int64 | :struct:`CountOptions` | \(2) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| index | Unary | Any | Scalar Int64 | :struct:`IndexOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| max | Unary | Non-nested types | Scalar Input type | :struct:`ScalarAggregateOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| mean | Unary | Numeric | Scalar Decimal/Float64 | :struct:`ScalarAggregateOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| min | Unary | Non-nested types | Scalar Input type | :struct:`ScalarAggregateOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| min_max | Unary | Non-nested types | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| mode | Unary | Numeric | Struct | :struct:`ModeOptions` | \(4) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| product | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| quantile | Unary | Numeric | Scalar Numeric | :struct:`QuantileOptions` | \(6) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| tdigest | Unary | Numeric | Float64 | :struct:`TDigestOptions` | \(7) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ Notes: @@ -298,37 +300,39 @@ The supported aggregation functions are as follows. All function names are prefixed with ``hash_``, which differentiates them from their scalar equivalents above and reflects how they are implemented internally. -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=====================+=======+====================================+=================+==================================+=======+ -| hash_all | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_any | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_count | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_count_distinct | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_distinct | Unary | Any | Input type | :struct:`CountOptions` | \(2) | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_max | Unary | Non-nested, non-binary/string-like | Input type | :struct:`ScalarAggregateOptions` | | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_mean | Unary | Numeric | Decimal/Float64 | :struct:`ScalarAggregateOptions` | | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_min | Unary | Non-nested, non-binary/string-like | Input type | :struct:`ScalarAggregateOptions` | | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_min_max | Unary | Non-nested, non-binary/string-like | Struct | :struct:`ScalarAggregateOptions` | \(3) | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_product | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_stddev | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_sum | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_tdigest | Unary | Numeric | Float64 | :struct:`TDigestOptions` | \(5) | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ -| hash_variance | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | -+---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=========================+=======+====================================+========================+==================================+=======+ +| hash_all | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_any | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_approximate_median | Unary | Numeric | Float64 | :struct:`ScalarAggregateOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_count | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_count_distinct | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_distinct | Unary | Any | Input type | :struct:`CountOptions` | \(2) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_max | Unary | Non-nested, non-binary/string-like | Input type | :struct:`ScalarAggregateOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_mean | Unary | Numeric | Decimal/Float64 | :struct:`ScalarAggregateOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_min | Unary | Non-nested, non-binary/string-like | Input type | :struct:`ScalarAggregateOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_min_max | Unary | Non-nested, non-binary/string-like | Struct | :struct:`ScalarAggregateOptions` | \(3) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_product | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_stddev | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_sum | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_tdigest | Unary | Numeric | FixedSizeList[Float64] | :struct:`TDigestOptions` | \(5) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_variance | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ * \(1) If null values are taken into account, by setting the :member:`ScalarAggregateOptions::skip_nulls` to false, then `Kleene logic`_ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 00a59b8eef9..c3dffc836c1 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -27,14 +27,19 @@ Aggregations .. autosummary:: :toctree: ../generated/ + approximate_median count index + max mean + min min_max mode product + quantile stddev sum + tdigest variance Arithmetic Functions