diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index e5a27d18d00..0dd75b01f6f 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3897,6 +3897,7 @@ class TestArrayDataStatistics : public ::testing::Test { void SetUp() { valids_ = {1, 0, 1, 1}; null_count_ = std::count(valids_.begin(), valids_.end(), 0); + average_byte_width_ = 4.0; null_buffer_ = *internal::BytesToBits(valids_); values_ = {1, 0, 3, -4}; min_ = *std::min_element(values_.begin(), values_.end()); @@ -3906,6 +3907,8 @@ class TestArrayDataStatistics : public ::testing::Test { null_count_); data_->statistics = std::make_shared(); data_->statistics->null_count = null_count_; + data_->statistics->average_byte_width = average_byte_width_; + data_->statistics->is_average_byte_width_exact = true; data_->statistics->min = min_; data_->statistics->is_min_exact = true; data_->statistics->max = max_; @@ -3915,6 +3918,7 @@ class TestArrayDataStatistics : public ::testing::Test { protected: std::vector valids_; size_t null_count_; + double average_byte_width_; std::shared_ptr null_buffer_; std::vector values_; int64_t min_; @@ -3930,6 +3934,11 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) { ASSERT_TRUE(moved_data.statistics->null_count.has_value()); ASSERT_EQ(null_count_, moved_data.statistics->null_count.value()); + ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(average_byte_width_, + moved_data.statistics->average_byte_width.value()); + ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact); + ASSERT_TRUE(moved_data.statistics->min.has_value()); ASSERT_TRUE(std::holds_alternative(moved_data.statistics->min.value())); ASSERT_EQ(min_, std::get(moved_data.statistics->min.value())); @@ -3947,6 +3956,11 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) { ASSERT_TRUE(copied_data.statistics->null_count.has_value()); ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); + 
ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(average_byte_width_, + copied_data.statistics->average_byte_width.value()); + ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact); + ASSERT_TRUE(copied_data.statistics->min.has_value()); ASSERT_TRUE(std::holds_alternative(copied_data.statistics->min.value())); ASSERT_EQ(min_, std::get(copied_data.statistics->min.value())); @@ -3966,6 +3980,11 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) { ASSERT_TRUE(moved_data.statistics->null_count.has_value()); ASSERT_EQ(null_count_, moved_data.statistics->null_count.value()); + ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(average_byte_width_, + moved_data.statistics->average_byte_width.value()); + ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact); + ASSERT_TRUE(moved_data.statistics->min.has_value()); ASSERT_TRUE(std::holds_alternative(moved_data.statistics->min.value())); ASSERT_EQ(min_, std::get(moved_data.statistics->min.value())); @@ -3984,6 +4003,11 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) { ASSERT_TRUE(copied_data.statistics->null_count.has_value()); ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); + ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(average_byte_width_, + copied_data.statistics->average_byte_width.value()); + ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact); + ASSERT_TRUE(copied_data.statistics->min.has_value()); ASSERT_TRUE(std::holds_alternative(copied_data.statistics->min.value())); ASSERT_EQ(min_, std::get(copied_data.statistics->min.value())); diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 6accd48af77..435c38e861a 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -78,6 +78,12 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief The number of distinct values, may not be set 
std::optional distinct_count = std::nullopt; + /// \brief The average size in bytes of a row (element) in the array, may not be set. + std::optional average_byte_width = std::nullopt; + + /// \brief Whether average_byte_width is exact (true) or approximate (false). + bool is_average_byte_width_exact = false; + /// \brief The minimum value, may not be set std::optional min = std::nullopt; diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index 250c4bb437a..d7dbea7c0f5 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -41,6 +41,17 @@ TEST(TestArrayStatistics, DistinctCount) { ASSERT_EQ(29, statistics.distinct_count.value()); } +TEST(TestArrayStatistics, AverageByteWidth) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.average_byte_width.has_value()); + ASSERT_FALSE(statistics.is_average_byte_width_exact); + statistics.average_byte_width = 4.2; + ASSERT_TRUE(statistics.average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(4.2, statistics.average_byte_width.value()); + statistics.is_average_byte_width_exact = true; + ASSERT_TRUE(statistics.is_average_byte_width_exact); +} + TEST(TestArrayStatistics, Min) { ArrayStatistics statistics; ASSERT_FALSE(statistics.min.has_value()); @@ -65,7 +76,7 @@ TEST(TestArrayStatistics, Max) { ASSERT_FALSE(statistics.is_max_exact); } -TEST(TestArrayStatistics, EqualityNonDoulbeValue) { +TEST(TestArrayStatistics, Equals) { ArrayStatistics statistics1; ArrayStatistics statistics2; @@ -81,6 +92,16 @@ TEST(TestArrayStatistics, EqualityNonDoulbeValue) { statistics2.distinct_count = 2929; ASSERT_EQ(statistics1, statistics2); + statistics1.average_byte_width = 2.9; + ASSERT_NE(statistics1, statistics2); + statistics2.average_byte_width = 2.9; + ASSERT_EQ(statistics1, statistics2); + + statistics1.is_average_byte_width_exact = true; + ASSERT_NE(statistics1, statistics2); + statistics2.is_average_byte_width_exact = true; + ASSERT_EQ(statistics1, statistics2); + 
statistics1.min = std::string("world"); ASSERT_NE(statistics1, statistics2); statistics2.min = std::string("world"); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 6ece1cb444c..aa041a5bd54 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1561,8 +1561,11 @@ bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistic const EqualOptions& equal_options) { return left.null_count == right.null_count && left.distinct_count == right.distinct_count && + left.is_average_byte_width_exact == right.is_average_byte_width_exact && left.is_min_exact == right.is_min_exact && left.is_max_exact == right.is_max_exact && + ArrayStatisticsValueTypeEquals(left.average_byte_width, right.average_byte_width, + equal_options) && ArrayStatisticsValueTypeEquals(left.min, right.min, equal_options) && ArrayStatisticsValueTypeEquals(left.max, right.max, equal_options); } diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 700e1bb2c9a..04d6890d393 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -530,6 +530,19 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat statistics.start_new_column = false; } + if (column_statistics->average_byte_width.has_value()) { + statistics.nth_statistics++; + if (column_statistics->is_average_byte_width_exact) { + statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT; + } else { + statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE; + } + statistics.type = float64(); + statistics.value = column_statistics->average_byte_width.value(); + RETURN_NOT_OK(on_statistics(statistics)); + statistics.start_new_column = false; + } + if (column_statistics->min.has_value()) { statistics.nth_statistics++; if (column_statistics->is_min_exact) { @@ -671,8 +684,10 @@ Result> RecordBatch::MakeStatisticsArray( if (statistics.start_new_column) { RETURN_NOT_OK(builder.Append()); if 
(statistics.nth_column.has_value()) { + // Column statistics: append the column index RETURN_NOT_OK(columns_builder->Append(statistics.nth_column.value())); } else { + // RecordBatch statistics: no column index, append null RETURN_NOT_OK(columns_builder->AppendNull()); } RETURN_NOT_OK(values_builder->Append()); diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 0572883441f..fab81371718 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -1345,6 +1345,68 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCount) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } +TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthApproximate) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array = ArrayFromJSON(utf8(), R"(["aa", "bb", "ccc"])"); + string_array->data()->statistics = std::make_shared(); + string_array->data()->statistics->average_byte_width = 2.3; + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{2.3}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthExact) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("float64", float64())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto float_array = ArrayFromJSON(float64(), R"([1.0, 2.0, 3.0])"); + float_array->data()->statistics = std::make_shared(); + 
float_array->data()->statistics->average_byte_width = 8.0; + float_array->data()->statistics->is_average_byte_width_exact = true; + + auto batch = RecordBatch::Make(schema, float_array->length(), + {no_statistics_array, float_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{8.0}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + TEST_F(TestRecordBatch, MakeStatisticsArrayMinExact) { auto schema = ::arrow::schema({field("no-statistics", boolean()), field("uint32", uint32())});