Skip to content

Commit

Permalink
Compute whole column variance using numerically stable approach (#16448)
Browse files Browse the repository at this point in the history
We use the pairwise approach of Chan, Golub, and LeVeque (1983).

- Closes #16444

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: #16448
  • Loading branch information
wence- authored Oct 8, 2024
1 parent 219ec0e commit bcf9425
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 123 deletions.
48 changes: 32 additions & 16 deletions cpp/include/cudf/reduction/detail/reduction_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,41 @@ namespace detail {
// intermediate data structure to compute `var`, `std`
template <typename ResultType>
struct var_std {
ResultType value; /// the value
ResultType value_squared; /// the value of squared

CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0)
: value(_value), value_squared(_value_squared){};
// Uses the pairwise approach of Chan, Golub, and LeVeque,
// _Algorithms for computing the sample variance: analysis and
// recommendations_ (1983)
// https://doi.org/10.1080/00031305.1983.10483115
// Also http://www.cs.yale.edu/publications/techreports/tr222.pdf
// This is a modification of Youngs and Cramer's online approach.
ResultType running_sum;
ResultType running_square_deviations;
size_type count;

CUDF_HOST_DEVICE inline var_std(ResultType t = 0, ResultType s = 0, size_type n = 0)
: running_sum(t), running_square_deviations(s), count(n){};

using this_t = var_std<ResultType>;

CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const
{
return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared));
// Updates as per equations 1.5a and 1.5b in the paper
// T_{1,m+n} = T_{1,m} + T_{m+1,n+1}
// S_{1,m+n} = S_{1,m} + S_{m+1,n+1} + m/(n(m+n)) * (n/m T_{1,m} - T_{m+1,n+1})**2
// Here the first m samples are in this, the remaining n samples are in rhs.
auto const m = this->count;
auto const n = rhs.count;
// Avoid division by zero.
if (m == 0) { return rhs; }
if (n == 0) { return *this; }
auto const tm = this->running_sum;
auto const tn = rhs.running_sum;
auto const sm = this->running_square_deviations;
auto const sn = rhs.running_square_deviations;
auto const tmn = tm + tn;
auto const diff = ((static_cast<ResultType>(n) / m) * tm) - tn;
// Computing m/n(m+n) as m/n/(m+n) to avoid integer overflow
auto const smn = sm + sn + ((static_cast<ResultType>(m) / n) / (m + n)) * diff * diff;
return {tmn, smn, m + n};
};
};

Expand All @@ -50,10 +74,7 @@ template <typename ResultType>
struct transformer_var_std {
using OutputType = var_std<ResultType>;

CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value)
{
return OutputType(value, value * value);
};
CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) { return {value, 0, 1}; };
};

// ------------------------------------------------------------------------
Expand Down Expand Up @@ -257,12 +278,7 @@ struct variance : public compound_op<variance> {
cudf::size_type const& count,
cudf::size_type const& ddof)
{
ResultType mean = input.value / count;
ResultType asum = input.value_squared;
cudf::size_type div = count - ddof;
ResultType var = asum / div - ((mean * mean) * count) / div;

return var;
return input.running_square_deviations / (count - ddof);
};
};
};
Expand Down
26 changes: 18 additions & 8 deletions cpp/src/reductions/compound.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,18 @@

#include <cudf/dictionary/detail/iterator.cuh>
#include <cudf/reduction/detail/reduction.cuh>
#include <cudf/reduction/detail/reduction_operators.cuh>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <thrust/iterator/transform_iterator.h>

#include <stdexcept>
#include <type_traits>

namespace cudf {
namespace reduction {
namespace compound {
Expand Down Expand Up @@ -53,35 +58,39 @@ std::unique_ptr<scalar> compound_reduction(column_view const& col,
{
auto const valid_count = col.size() - col.null_count();

// All null input produces all null output
if (valid_count == 0 ||
// Only care about ddof for standard deviation and variance right now
valid_count <= ddof && (std::is_same_v<Op, cudf::reduction::detail::op::standard_deviation> ||
std::is_same_v<Op, cudf::reduction::detail::op::variance>)) {
auto result = cudf::make_fixed_width_scalar(output_dtype, stream, mr);
result->set_valid_async(false, stream);
return result;
}
// reduction by iterator
auto dcol = cudf::column_device_view::create(col, stream);
std::unique_ptr<scalar> result;
Op compound_op{};

if (!cudf::is_dictionary(col.type())) {
if (col.has_nulls()) {
auto it = thrust::make_transform_iterator(
dcol->pair_begin<ElementType, true>(),
compound_op.template get_null_replacing_element_transformer<ResultType>());
result = cudf::reduction::detail::reduce<Op, decltype(it), ResultType>(
return cudf::reduction::detail::reduce<Op, decltype(it), ResultType>(
it, col.size(), compound_op, valid_count, ddof, stream, mr);
} else {
auto it = thrust::make_transform_iterator(
dcol->begin<ElementType>(), compound_op.template get_element_transformer<ResultType>());
result = cudf::reduction::detail::reduce<Op, decltype(it), ResultType>(
return cudf::reduction::detail::reduce<Op, decltype(it), ResultType>(
it, col.size(), compound_op, valid_count, ddof, stream, mr);
}
} else {
auto it = thrust::make_transform_iterator(
cudf::dictionary::detail::make_dictionary_pair_iterator<ElementType>(*dcol, col.has_nulls()),
compound_op.template get_null_replacing_element_transformer<ResultType>());
result = cudf::reduction::detail::reduce<Op, decltype(it), ResultType>(
return cudf::reduction::detail::reduce<Op, decltype(it), ResultType>(
it, col.size(), compound_op, valid_count, ddof, stream, mr);
}

// set scalar is valid
result->set_valid_async(col.null_count() < col.size(), stream);
return result;
};

// @brief result type dispatcher for compound reduction (a.k.a. mean, var, std)
Expand Down Expand Up @@ -137,6 +146,7 @@ struct element_type_dispatcher {
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(ddof >= 0, "ddof must be non-negative", std::domain_error);
return cudf::type_dispatcher(
output_dtype, result_type_dispatcher<ElementType, Op>(), col, output_dtype, ddof, stream, mr);
}
Expand Down
131 changes: 54 additions & 77 deletions cpp/tests/reductions/reduction_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@
#include <cudf/types.hpp>
#include <cudf/wrappers/timestamps.hpp>

#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>

#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

using aggregation = cudf::aggregation;
Expand Down Expand Up @@ -765,6 +769,25 @@ TYPED_TEST(MultiStepReductionTest, Mean)
expected_value_nulls);
}

template <typename T>
double calc_var(std::vector<T> const& v, int ddof, std::vector<bool> const& mask = {})
{
auto const values = [&]() {
if (mask.empty()) { return v; }
std::vector<T> masked{};
thrust::copy_if(
v.begin(), v.end(), mask.begin(), std::back_inserter(masked), [](auto m) { return m; });
return masked;
}();
auto const valid_count = values.size();
double const mean = std::accumulate(values.cbegin(), values.cend(), double{0}) / valid_count;
double const sq_sum_of_differences =
std::accumulate(values.cbegin(), values.cend(), double{0}, [mean](double acc, auto const v) {
return acc + std::pow(v - mean, 2);
});
return sq_sum_of_differences / (valid_count - ddof);
}

// This test is disabled for only a Debug build because a compiler error
// documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu
#ifdef NDEBUG
Expand All @@ -777,25 +800,12 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std)
std::vector<int> int_values({-3, 2, 1, 0, 5, -3, -2, 28});
std::vector<bool> host_bools({true, true, false, true, true, true, false, true});

auto calc_var = [](std::vector<T>& v, cudf::size_type valid_count, int ddof) {
double mean = std::accumulate(v.begin(), v.end(), double{0});
mean /= valid_count;

double sum_of_sq = std::accumulate(
v.begin(), v.end(), double{0}, [](double acc, TypeParam i) { return acc + i * i; });

cudf::size_type div = valid_count - ddof;

double var = sum_of_sq / div - ((mean * mean) * valid_count) / div;
return var;
};

// test without nulls
std::vector<T> v = convert_values<T>(int_values);
cudf::test::fixed_width_column_wrapper<T> col(v.begin(), v.end());

auto const ddof = 1;
double var = calc_var(v, v.size(), ddof);
double var = calc_var(v, ddof);
double std = std::sqrt(var);
auto var_agg = cudf::make_variance_aggregation<reduce_aggregation>(ddof);
auto std_agg = cudf::make_std_aggregation<reduce_aggregation>(ddof);
Expand All @@ -811,23 +821,19 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std)

// test with nulls
cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(v, host_bools);
cudf::size_type valid_count =
cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count();
auto replaced_array = replace_nulls(v, host_bools, T{0});

double var_nulls = calc_var(replaced_array, valid_count, ddof);
double std_nulls = std::sqrt(var_nulls);
double var_nulls = calc_var(v, ddof, host_bools);
double std_nulls = std::sqrt(var_nulls);

EXPECT_EQ(this
->template reduction_test<double>(
col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
var_nulls);
EXPECT_EQ(this
->template reduction_test<double>(
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
std_nulls);
EXPECT_DOUBLE_EQ(this
->template reduction_test<double>(
col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
var_nulls);
EXPECT_DOUBLE_EQ(this
->template reduction_test<double>(
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
std_nulls);
}

// ----------------------------------------------------------------------------
Expand Down Expand Up @@ -1139,23 +1145,10 @@ TEST_P(ReductionParamTest, DISABLED_std_var)
std::vector<double> int_values({-3, 2, 1, 0, 5, -3, -2, 28});
std::vector<bool> host_bools({true, true, false, true, true, true, false, true});

auto calc_var = [ddof](std::vector<double>& v, cudf::size_type valid_count) {
double mean = std::accumulate(v.begin(), v.end(), double{0});
mean /= valid_count;

double sum_of_sq = std::accumulate(
v.begin(), v.end(), double{0}, [](double acc, double i) { return acc + i * i; });

cudf::size_type div = valid_count - ddof;

double var = sum_of_sq / div - ((mean * mean) * valid_count) / div;
return var;
};

// test without nulls
cudf::test::fixed_width_column_wrapper<double> col(int_values.begin(), int_values.end());

double var = calc_var(int_values, int_values.size());
double var = calc_var(int_values, ddof);
double std = std::sqrt(var);
auto var_agg = cudf::make_variance_aggregation<reduce_aggregation>(ddof);
auto std_agg = cudf::make_std_aggregation<reduce_aggregation>(ddof);
Expand All @@ -1172,23 +1165,19 @@ TEST_P(ReductionParamTest, DISABLED_std_var)
// test with nulls
cudf::test::fixed_width_column_wrapper<double> col_nulls =
construct_null_column(int_values, host_bools);
cudf::size_type valid_count =
cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count();
auto replaced_array = replace_nulls<double>(int_values, host_bools, int{0});

double var_nulls = calc_var(replaced_array, valid_count);
double var_nulls = calc_var(int_values, ddof, host_bools);
double std_nulls = std::sqrt(var_nulls);

EXPECT_EQ(this
->template reduction_test<double>(
col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
var_nulls);
EXPECT_EQ(this
->template reduction_test<double>(
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
std_nulls);
EXPECT_DOUBLE_EQ(this
->template reduction_test<double>(
col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
var_nulls);
EXPECT_DOUBLE_EQ(this
->template reduction_test<double>(
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
std_nulls);
}

//-------------------------------------------------------------------
Expand Down Expand Up @@ -2471,21 +2460,11 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd)
std::vector<T> v = convert_values<T>(int_values);
cudf::data_type output_type{cudf::type_to_id<double>()};

auto calc_var = [](std::vector<T> const& v, cudf::size_type valid_count, cudf::size_type ddof) {
double mean = std::accumulate(v.cbegin(), v.cend(), double{0});
mean /= valid_count;
double sum_of_sq = std::accumulate(
v.cbegin(), v.cend(), double{0}, [](double acc, TypeParam i) { return acc + i * i; });
auto const div = valid_count - ddof;
double var = sum_of_sq / div - ((mean * mean) * valid_count) / div;
return var;
};

// test without nulls
cudf::test::dictionary_column_wrapper<T> col(v.begin(), v.end());

cudf::size_type const ddof = 1;
double var = calc_var(v, v.size(), ddof);
double var = calc_var(v, ddof);
double std = std::sqrt(var);
auto var_agg = cudf::make_variance_aggregation<reduce_aggregation>(ddof);
auto std_agg = cudf::make_std_aggregation<reduce_aggregation>(ddof);
Expand All @@ -2497,15 +2476,13 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd)
std::vector<bool> validity({true, true, false, true, true, true, false, true});
cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());

cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true);

double var_nulls = calc_var(replace_nulls(v, validity, T{0}), valid_count, ddof);
double var_nulls = calc_var(v, ddof, validity);
double std_nulls = std::sqrt(var_nulls);

EXPECT_EQ(this->template reduction_test<double>(col_nulls, *var_agg, output_type).first,
var_nulls);
EXPECT_EQ(this->template reduction_test<double>(col_nulls, *std_agg, output_type).first,
std_nulls);
EXPECT_DOUBLE_EQ(this->template reduction_test<double>(col_nulls, *var_agg, output_type).first,
var_nulls);
EXPECT_DOUBLE_EQ(this->template reduction_test<double>(col_nulls, *std_agg, output_type).first,
std_nulls);
}

TYPED_TEST(DictionaryReductionTest, NthElement)
Expand Down
4 changes: 2 additions & 2 deletions java/src/test/java/ai/rapids/cudf/ReductionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -612,13 +612,13 @@ void testWithSetOutputType() {
assertEquals(expected, result);
}

try (Scalar expected = Scalar.fromFloat(1.666667f);
try (Scalar expected = Scalar.fromFloat(1.6666666f);
ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4});
Scalar result = cv.variance(DType.FLOAT32)) {
assertEquals(expected, result);
}

try (Scalar expected = Scalar.fromFloat(1.2909945f);
try (Scalar expected = Scalar.fromFloat(1.2909944f);
ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4});
Scalar result = cv.standardDeviation(DType.FLOAT32)) {
assertEquals(expected, result);
Expand Down
10 changes: 8 additions & 2 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,19 +180,25 @@ def var(
min_count: int = 0,
ddof=1,
):
return self._reduce(
result = self._reduce(
"var", skipna=skipna, min_count=min_count, ddof=ddof
)
if result is NA:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
return result

def std(
self,
skipna: bool | None = None,
min_count: int = 0,
ddof=1,
):
return self._reduce(
result = self._reduce(
"std", skipna=skipna, min_count=min_count, ddof=ddof
)
if result is NA:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
return result

def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
skipna = True if skipna is None else skipna
Expand Down
Loading

0 comments on commit bcf9425

Please sign in to comment.