Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3897,6 +3897,7 @@ class TestArrayDataStatistics : public ::testing::Test {
void SetUp() {
valids_ = {1, 0, 1, 1};
null_count_ = std::count(valids_.begin(), valids_.end(), 0);
average_byte_width_ = 4.0;
null_buffer_ = *internal::BytesToBits(valids_);
values_ = {1, 0, 3, -4};
min_ = *std::min_element(values_.begin(), values_.end());
Expand All @@ -3906,6 +3907,8 @@ class TestArrayDataStatistics : public ::testing::Test {
null_count_);
data_->statistics = std::make_shared<ArrayStatistics>();
data_->statistics->null_count = null_count_;
data_->statistics->average_byte_width = average_byte_width_;
data_->statistics->is_average_byte_width_exact = true;
data_->statistics->min = min_;
data_->statistics->is_min_exact = true;
data_->statistics->max = max_;
Expand All @@ -3915,6 +3918,7 @@ class TestArrayDataStatistics : public ::testing::Test {
protected:
std::vector<uint8_t> valids_;
size_t null_count_;
double average_byte_width_;
std::shared_ptr<Buffer> null_buffer_;
std::vector<int32_t> values_;
int64_t min_;
Expand All @@ -3930,6 +3934,11 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());

ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
ASSERT_DOUBLE_EQ(average_byte_width_,
moved_data.statistics->average_byte_width.value());
ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact);

ASSERT_TRUE(moved_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
Expand All @@ -3947,6 +3956,11 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) {
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());

ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
ASSERT_DOUBLE_EQ(average_byte_width_,
copied_data.statistics->average_byte_width.value());
ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact);

ASSERT_TRUE(copied_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
Expand All @@ -3966,6 +3980,11 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());

ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
ASSERT_DOUBLE_EQ(average_byte_width_,
moved_data.statistics->average_byte_width.value());
ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact);

ASSERT_TRUE(moved_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
Expand All @@ -3984,6 +4003,11 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());

ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
ASSERT_DOUBLE_EQ(average_byte_width_,
copied_data.statistics->average_byte_width.value());
ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact);

ASSERT_TRUE(copied_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/array/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ struct ARROW_EXPORT ArrayStatistics {
/// \brief The number of distinct values, may not be set
std::optional<int64_t> distinct_count = std::nullopt;

/// \brief The average size in bytes of a row in an array, may not be set.
std::optional<double> average_byte_width = std::nullopt;

/// \brief Whether the average size in bytes is exact or not.
bool is_average_byte_width_exact = false;

/// \brief The minimum value, may not be set
std::optional<ValueType> min = std::nullopt;

Expand Down
23 changes: 22 additions & 1 deletion cpp/src/arrow/array/statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,17 @@ TEST(TestArrayStatistics, DistinctCount) {
ASSERT_EQ(29, statistics.distinct_count.value());
}

TEST(TestArrayStatistics, AverageByteWidth) {
  // Freshly constructed statistics carry no average byte width, and the
  // exactness flag starts out false.
  ArrayStatistics stats;
  ASSERT_FALSE(stats.average_byte_width.has_value());
  ASSERT_FALSE(stats.is_average_byte_width_exact);

  // Assigning a value populates the optional and the value reads back.
  constexpr double kAverageByteWidth = 4.2;
  stats.average_byte_width = kAverageByteWidth;
  ASSERT_TRUE(stats.average_byte_width.has_value());
  ASSERT_DOUBLE_EQ(kAverageByteWidth, stats.average_byte_width.value());

  // The exactness flag is a separate field and is set on its own.
  stats.is_average_byte_width_exact = true;
  ASSERT_TRUE(stats.is_average_byte_width_exact);
}

TEST(TestArrayStatistics, Min) {
ArrayStatistics statistics;
ASSERT_FALSE(statistics.min.has_value());
Expand All @@ -65,7 +76,7 @@ TEST(TestArrayStatistics, Max) {
ASSERT_FALSE(statistics.is_max_exact);
}

TEST(TestArrayStatistics, EqualityNonDoulbeValue) {
TEST(TestArrayStatistics, Equals) {
ArrayStatistics statistics1;
ArrayStatistics statistics2;

Expand All @@ -81,6 +92,16 @@ TEST(TestArrayStatistics, EqualityNonDoulbeValue) {
statistics2.distinct_count = 2929;
ASSERT_EQ(statistics1, statistics2);

statistics1.average_byte_width = 2.9;
ASSERT_NE(statistics1, statistics2);
statistics2.average_byte_width = 2.9;
ASSERT_EQ(statistics1, statistics2);

statistics1.is_average_byte_width_exact = true;
ASSERT_NE(statistics1, statistics2);
statistics2.is_average_byte_width_exact = true;
ASSERT_EQ(statistics1, statistics2);

statistics1.min = std::string("world");
ASSERT_NE(statistics1, statistics2);
statistics2.min = std::string("world");
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/compare.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1561,8 +1561,11 @@ bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistic
const EqualOptions& equal_options) {
return left.null_count == right.null_count &&
left.distinct_count == right.distinct_count &&
left.is_average_byte_width_exact == right.is_average_byte_width_exact &&
left.is_min_exact == right.is_min_exact &&
left.is_max_exact == right.is_max_exact &&
ArrayStatisticsValueTypeEquals(left.average_byte_width, right.average_byte_width,
equal_options) &&
ArrayStatisticsValueTypeEquals(left.min, right.min, equal_options) &&
ArrayStatisticsValueTypeEquals(left.max, right.max, equal_options);
}
Expand Down
15 changes: 15 additions & 0 deletions cpp/src/arrow/record_batch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,19 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
statistics.start_new_column = false;
}

if (column_statistics->average_byte_width.has_value()) {
statistics.nth_statistics++;
if (column_statistics->is_average_byte_width_exact) {
statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT;
} else {
statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE;
}
statistics.type = float64();
statistics.value = column_statistics->average_byte_width.value();
RETURN_NOT_OK(on_statistics(statistics));
statistics.start_new_column = false;
}

if (column_statistics->min.has_value()) {
statistics.nth_statistics++;
if (column_statistics->is_min_exact) {
Expand Down Expand Up @@ -671,8 +684,10 @@ Result<std::shared_ptr<Array>> RecordBatch::MakeStatisticsArray(
if (statistics.start_new_column) {
RETURN_NOT_OK(builder.Append());
if (statistics.nth_column.has_value()) {
// Column-level statistics: record the index of the column they describe.
RETURN_NOT_OK(columns_builder->Append(statistics.nth_column.value()));
} else {
// Record-batch-level statistics: there is no column index, so append null.
RETURN_NOT_OK(columns_builder->AppendNull());
}
RETURN_NOT_OK(values_builder->Append());
Expand Down
62 changes: 62 additions & 0 deletions cpp/src/arrow/record_batch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,68 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCount) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}

TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthApproximate) {
// Checks that an average_byte_width set without marking it exact is reported
// by RecordBatch::MakeStatisticsArray() under the
// ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE key.
auto schema =
::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())});
// Column 0 carries no statistics at all; only column 1 (utf8) does.
auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
auto string_array = ArrayFromJSON(utf8(), R"(["aa", "bb", "ccc"])");
string_array->data()->statistics = std::make_shared<ArrayStatistics>();
// is_average_byte_width_exact is not touched in this test, so the
// approximate key is the one expected below.
string_array->data()->statistics->average_byte_width = 2.3;
auto batch = RecordBatch::Make(schema, string_array->length(),
{no_statistics_array, string_array});

ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());

// Expected layout: one entry with a null column holding the exact row count
// (3), and one entry for column 1 holding the approximate average byte width.
// NOTE(review): the first argument is presumably the JSON list of column
// indexes for the test helper MakeStatisticsArray() -- confirm against the
// helper's definition.
ASSERT_OK_AND_ASSIGN(
auto expected_statistics_array,
MakeStatisticsArray("[null, 1]",
{{
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
},
{
ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE,
}},
{{
ArrayStatistics::ValueType{int64_t{3}},
},
{
ArrayStatistics::ValueType{2.3},
}}));
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}

TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthExact) {
// Checks that an average_byte_width flagged as exact is reported by
// RecordBatch::MakeStatisticsArray() under the
// ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT key.
auto schema =
::arrow::schema({field("no-statistics", boolean()), field("float64", float64())});
// Column 0 carries no statistics at all; only column 1 (float64) does.
auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
auto float_array = ArrayFromJSON(float64(), R"([1.0, 2.0, 3.0])");
float_array->data()->statistics = std::make_shared<ArrayStatistics>();
float_array->data()->statistics->average_byte_width = 8.0;
// Setting the flag is what selects the exact key over the approximate one.
float_array->data()->statistics->is_average_byte_width_exact = true;

auto batch = RecordBatch::Make(schema, float_array->length(),
{no_statistics_array, float_array});

ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());

// Expected layout: one entry with a null column holding the exact row count
// (3), and one entry for column 1 holding the exact average byte width.
// NOTE(review): the first argument is presumably the JSON list of column
// indexes for the test helper MakeStatisticsArray() -- confirm against the
// helper's definition.
ASSERT_OK_AND_ASSIGN(
auto expected_statistics_array,
MakeStatisticsArray("[null, 1]",
{{
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
},
{
ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT,
}},
{{
ArrayStatistics::ValueType{int64_t{3}},
},
{
ArrayStatistics::ValueType{8.0},
}}));
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}

TEST_F(TestRecordBatch, MakeStatisticsArrayMinExact) {
auto schema =
::arrow::schema({field("no-statistics", boolean()), field("uint32", uint32())});
Expand Down
Loading