From 903e3f4bb7a93639c9655151452d28816e58ddd3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 30 Sep 2024 17:02:35 +0900 Subject: [PATCH] GH-44010: [C++] Add `RecordBatch::MakeStatisticsArray()` It's a convenience function that converts `arrow::ArrayStatistics` in an `arrow::RecordBatch` to an `arrow::Array` for the Arrow C data interface. --- cpp/src/arrow/array/array_base.h | 2 +- cpp/src/arrow/array/statistics.h | 21 ++ cpp/src/arrow/c/abi.h | 18 ++ cpp/src/arrow/compare.cc | 6 +- cpp/src/arrow/record_batch.cc | 162 +++++++++++ cpp/src/arrow/record_batch.h | 12 + cpp/src/arrow/record_batch_test.cc | 436 +++++++++++++++++++++++++++++ 7 files changed, 654 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index e4af67d7e5f0b..50dbca5c34edd 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -237,7 +237,7 @@ class ARROW_EXPORT Array { /// This just delegates to calling statistics on the underlying ArrayData /// object which backs this Array. 
/// - /// \return const ArrayStatistics& + /// \return std::shared_ptr std::shared_ptr statistics() const { return data_->statistics; } protected: diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 523f877bbe429..3ac0d6a4055a0 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -22,6 +22,7 @@ #include #include +#include "arrow/type_fwd.h" #include "arrow/util/visibility.h" namespace arrow { @@ -34,6 +35,22 @@ namespace arrow { struct ARROW_EXPORT ArrayStatistics { using ValueType = std::variant; + static const std::shared_ptr& ValueToArrowType( + const std::optional& value) { + if (!value.has_value()) { + return null(); + } + + struct Visitor { + const std::shared_ptr& operator()(const bool&) { return boolean(); } + const std::shared_ptr& operator()(const int64_t&) { return int64(); } + const std::shared_ptr& operator()(const uint64_t&) { return uint64(); } + const std::shared_ptr& operator()(const double&) { return float64(); } + const std::shared_ptr& operator()(const std::string&) { return utf8(); } + } visitor; + return std::visit(visitor, value.value()); + } + /// \brief The number of null values, may not be set std::optional null_count = std::nullopt; @@ -43,12 +60,16 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief The minimum value, may not be set std::optional min = std::nullopt; + const std::shared_ptr& MinArrowType() { return ValueToArrowType(min); } + /// \brief Whether the minimum value is exact or not bool is_min_exact = false; /// \brief The maximum value, may not be set std::optional max = std::nullopt; + const std::shared_ptr& MaxArrowType() { return ValueToArrowType(max); } + /// \brief Whether the maximum value is exact or not bool is_max_exact = false; diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h index db051fff5ff05..1ab83c04d0f78 100644 --- a/cpp/src/arrow/c/abi.h +++ b/cpp/src/arrow/c/abi.h @@ -80,6 +80,24 @@ struct ArrowArray { void* private_data; }; +# 
define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT "ARROW:average_byte_width:exact" +# define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE \ + "ARROW:average_byte_width:approximate" +# define ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT "ARROW:distinct_count:exact" +# define ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE \ + "ARROW:distinct_count:approximate" +# define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT "ARROW:max_byte_width:exact" +# define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE \ + "ARROW:max_byte_width:approximate" +# define ARROW_STATISTICS_KEY_MAX_VALUE_EXACT "ARROW:max_value:exact" +# define ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE "ARROW:max_value:approximate" +# define ARROW_STATISTICS_KEY_MIN_VALUE_EXACT "ARROW:min_value:exact" +# define ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE "ARROW:min_value:approximate" +# define ARROW_STATISTICS_KEY_NULL_COUNT_EXACT "ARROW:null_count:exact" +# define ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE "ARROW:null_count:approximate" +# define ARROW_STATISTICS_KEY_ROW_COUNT_EXACT "ARROW:row_count:exact" +# define ARROW_STATISTICS_KEY_ROW_COUNT_APPROXIMATE "ARROW:row_count:approximate" + #endif // ARROW_C_DATA_INTERFACE #ifndef ARROW_C_DEVICE_DATA_INTERFACE diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index e983b47e39dc4..fbe9869b7c8ec 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1030,8 +1030,10 @@ Status PrintDiff(const Array& left, const Array& right, int64_t left_offset, } if (!left.type()->Equals(right.type())) { - *os << "# Array types differed: " << *left.type() << " vs " << *right.type() - << std::endl; + *os << "# Array types differed:" << std::endl + << *left.type() << std::endl + << "vs" << std::endl + << *right.type() << std::endl; return Status::OK(); } diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index e3a8c0d710cb8..9673a57b36211 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ 
-26,8 +26,13 @@ #include #include "arrow/array.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_dict.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/builder_union.h" #include "arrow/array/concatenate.h" #include "arrow/array/validate.h" +#include "arrow/c/abi.h" #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/table.h" @@ -465,6 +470,163 @@ Result> RecordBatch::ViewOrCopyTo( return Make(schema_, num_rows(), std::move(copied_columns)); } +Result> RecordBatch::MakeStatisticsArray( + MemoryPool* memory_pool) const { + auto enumerate_statistics = + [&](std::function nth_column, const char* key, + const std::shared_ptr& type, + const ArrayStatistics::ValueType& value)> + yield) { + int nth_statistics = 0; + RETURN_NOT_OK(yield(nth_statistics++, true, std::nullopt, + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, int64(), + ArrayStatistics::ValueType{num_rows_})); + + int num_fields = schema_->num_fields(); + for (int nth_column = 0; nth_column < num_fields; ++nth_column) { + auto statistics = column(nth_column)->statistics(); + if (!statistics) { + continue; + } + + bool start_new_column = true; + if (statistics->null_count.has_value()) { + RETURN_NOT_OK(yield( + nth_statistics++, start_new_column, std::optional(nth_column), + ARROW_STATISTICS_KEY_NULL_COUNT_EXACT, int64(), + ArrayStatistics::ValueType{statistics->null_count.value()})); + start_new_column = false; + } + + if (statistics->distinct_count.has_value()) { + RETURN_NOT_OK(yield( + nth_statistics++, start_new_column, std::optional(nth_column), + ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT, int64(), + ArrayStatistics::ValueType{statistics->distinct_count.value()})); + start_new_column = false; + } + + if (statistics->min.has_value()) { + if (statistics->is_min_exact) { + RETURN_NOT_OK(yield(nth_statistics++, start_new_column, + std::optional(nth_column), + ARROW_STATISTICS_KEY_MIN_VALUE_EXACT, + statistics->MinArrowType(), statistics->min.value())); + } 
else { + RETURN_NOT_OK(yield(nth_statistics++, start_new_column, + std::optional(nth_column), + ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE, + statistics->MinArrowType(), statistics->min.value())); + } + start_new_column = false; + } + + if (statistics->max.has_value()) { + if (statistics->is_max_exact) { + RETURN_NOT_OK(yield(nth_statistics++, start_new_column, + std::optional(nth_column), + ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, + statistics->MaxArrowType(), statistics->max.value())); + } else { + RETURN_NOT_OK(yield(nth_statistics++, start_new_column, + std::optional(nth_column), + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + statistics->MaxArrowType(), statistics->max.value())); + } + start_new_column = false; + } + } + return Status::OK(); + }; + + std::vector> value_types; + std::vector value_type_indexes; + RETURN_NOT_OK(enumerate_statistics( + [&](int nth_statistics, bool start_new_column, std::optional nth_column, + const char* key, const std::shared_ptr& type, + const ArrayStatistics::ValueType& value) { + int8_t i = 0; + for (const auto& field : value_types) { + if (field->type()->id() == type->id()) { + break; + } + i++; + } + if (i == static_cast(value_types.size())) { + value_types.push_back(field(type->name(), type)); + } + value_type_indexes.push_back(i); + return Status::OK(); + })); + + auto key_type = dictionary(int32(), utf8(), false); + auto value_type = dense_union(value_types); + auto statistics_type = struct_( + {field("column", int32()), field("statistics", map(key_type, value_type, false))}); + + std::vector> field_builders; + auto column_builder = std::make_shared(memory_pool); + field_builders.push_back(std::static_pointer_cast(column_builder)); + auto key_builder = std::make_shared(); + std::vector> value_builders; + for (const auto& value_type : value_types) { + std::unique_ptr value_builder; + RETURN_NOT_OK(MakeBuilder(memory_pool, value_type->type(), &value_builder)); + 
value_builders.push_back(std::shared_ptr(std::move(value_builder))); + } + auto item_builder = std::make_shared( + memory_pool, std::move(value_builders), value_type); + auto values_builder = std::make_shared( + memory_pool, std::static_pointer_cast(key_builder), + std::static_pointer_cast(item_builder)); + field_builders.push_back(std::static_pointer_cast(values_builder)); + StructBuilder builder(statistics_type, memory_pool, std::move(field_builders)); + + RETURN_NOT_OK(enumerate_statistics( + [&](int nth_statistics, bool start_new_column, std::optional nth_column, + const char* key, const std::shared_ptr& type, + const ArrayStatistics::ValueType& value) { + if (start_new_column) { + RETURN_NOT_OK(builder.Append()); + if (nth_column.has_value()) { + RETURN_NOT_OK(column_builder->Append(nth_column.value())); + } else { + RETURN_NOT_OK(column_builder->AppendNull()); + } + RETURN_NOT_OK(values_builder->Append()); + } + RETURN_NOT_OK(key_builder->Append(key, strlen(key))); + const auto value_type_index = value_type_indexes[nth_statistics]; + RETURN_NOT_OK(item_builder->Append(value_type_index)); + struct Visitor { + ArrayBuilder* builder; + + Status operator()(const bool& value) { + return static_cast(builder)->Append(value); + } + Status operator()(const int64_t& value) { + return static_cast(builder)->Append(value); + } + Status operator()(const uint64_t& value) { + return static_cast(builder)->Append(value); + } + Status operator()(const double& value) { + return static_cast(builder)->Append(value); + } + Status operator()(const std::string& value) { + return static_cast(builder)->Append(value.data(), + value.size()); + } + } visitor; + visitor.builder = value_builders[value_type_index].get(); + RETURN_NOT_OK(std::visit(visitor, value)); + return Status::OK(); + })); + + return builder.Finish(); +} + Status RecordBatch::Validate() const { return ValidateBatch(*this, /*full_validation=*/false); } diff --git a/cpp/src/arrow/record_batch.h 
b/cpp/src/arrow/record_batch.h index 95596e9c15594..edbefc1c77c13 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -282,6 +282,18 @@ class ARROW_EXPORT RecordBatch { virtual DeviceAllocationType device_type() const = 0; + /// \brief Create a statistics array of this record batch + /// + /// The created array follows the C data interface statistics + /// specification. See + /// https://arrow.apache.org/docs/format/CDataInterfaceStatistics.html + /// for details. + /// + /// \param[in] pool the memory pool to allocate memory from + /// \return the statistics array of this record batch + Result> MakeStatisticsArray( + MemoryPool* pool = default_memory_pool()) const; + protected: RecordBatch(const std::shared_ptr& schema, int64_t num_rows); diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index daf7109075eab..21202c6acb05a 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -25,9 +25,11 @@ #include #include "arrow/array/array_base.h" +#include "arrow/array/array_dict.h" #include "arrow/array/array_nested.h" #include "arrow/array/data.h" #include "arrow/array/util.h" +#include "arrow/c/abi.h" #include "arrow/chunked_array.h" #include "arrow/status.h" #include "arrow/table.h" @@ -980,6 +982,440 @@ TEST_F(TestRecordBatch, ToTensorUnsupportedMixedFloat16) { batch1->ToTensor()); } +namespace { +template ::value || + is_number_type::value>> +Result> BuildArray( + const std::vector::CType>& values) { + using BuilderType = typename TypeTraits::BuilderType; + BuilderType builder; + for (const auto& value : values) { + ARROW_RETURN_NOT_OK(builder.Append(value)); + } + return builder.Finish(); +} + +template > +Result> BuildArray(const std::vector& values) { + using BuilderType = typename TypeTraits::BuilderType; + BuilderType builder; + for (const auto& value : values) { + ARROW_RETURN_NOT_OK(builder.Append(value)); + } + return builder.Finish(); +} + +template 
+std::vector StatisticsValuesToRawValues( + const std::vector& values) { + std::vector raw_values; + for (const auto& value : values) { + raw_values.push_back(std::get(value)); + } + return raw_values; +} + +template ::value>> +Result> BuildArray(const std::vector& values) { + struct Builder { + const std::vector& values_; + explicit Builder(const std::vector& values) + : values_(values) {} + + Result> operator()(const bool&) { + auto values = StatisticsValuesToRawValues(values_); + return BuildArray(values); + } + Result> operator()(const int64_t&) { + auto values = StatisticsValuesToRawValues(values_); + return BuildArray(values); + } + Result> operator()(const uint64_t&) { + auto values = StatisticsValuesToRawValues(values_); + return BuildArray(values); + } + Result> operator()(const double&) { + auto values = StatisticsValuesToRawValues(values_); + return BuildArray(values); + } + Result> operator()(const std::string&) { + auto values = StatisticsValuesToRawValues(values_); + return BuildArray(values); + } + } builder(values); + return std::visit(builder, values[0]); +} + +Result> MakeStatisticsArray( + const std::string& columns_json, + const std::vector>& nested_statistics_keys, + const std::vector>& + nested_statistics_values) { + auto columns_type = int32(); + auto columns_array = ArrayFromJSON(columns_type, columns_json); + const auto n_columns = columns_array->length(); + + // nested_statistics_keys: + // { + // {"ARROW:row_count:exact", "ARROW:null_count:exact"}, + // {"ARROW:max_value:exact"}, + // {"ARROW:max_value:exact", "ARROW:distinct_count:exact"}, + // } + // nested_statistics_values: + // { + // {int64_t{29}, int64_t{1}}, + // {double{2.9}}, + // {double{-2.9}, int64_t{2}}, + // } + // -> + // keys_dictionary: + // { + // "ARROW:row_count:exact", + // "ARROW:null_count:exact", + // "ARROW:max_value:exact", + // "ARROW:distinct_count:exact", + // } + // keys_indices: {0, 1, 2, 2, 3} + // values_types: {int64(), float64()} + // values_type_codes: 
{0, 1} + // values_values[0]: {int64_t{29}, int64_t{1}, int64_t{2}} + // values_values[1]: {double{2.9}, double{-2.9}} + // values_value_type_ids: {0, 0, 1, 1, 0} + // values_value_offsets: {0, 1, 0, 1, 2} + // statistics_offsets: {0, 2, 3, 5, 5} + std::vector keys_dictionary; + std::vector keys_indices; + std::vector> values_types; + std::vector values_type_codes; + std::vector> values_values; + std::vector values_value_type_ids; + std::vector values_value_offsets; + std::vector statistics_offsets; + + int32_t offset = 0; + std::vector values_value_offset_counters; + for (size_t i = 0; i < nested_statistics_keys.size(); ++i) { + const auto& statistics_keys = nested_statistics_keys[i]; + const auto& statistics_values = nested_statistics_values[i]; + statistics_offsets.push_back(offset); + for (size_t j = 0; j < statistics_keys.size(); ++j) { + const auto& key = statistics_keys[j]; + const auto& value = statistics_values[j]; + ++offset; + + int32_t key_index = 0; + for (; key_index < static_cast(keys_dictionary.size()); ++key_index) { + if (keys_dictionary[key_index] == key) { + break; + } + } + if (key_index == static_cast(keys_dictionary.size())) { + keys_dictionary.push_back(key); + } + keys_indices.push_back(key_index); + + auto values_type = ArrayStatistics::ValueToArrowType(value); + int8_t values_type_code = 0; + for (; values_type_code < static_cast(values_types.size()); + ++values_type_code) { + if (values_types[values_type_code] == values_type) { + break; + } + } + if (values_type_code == static_cast(values_types.size())) { + values_types.push_back(values_type); + values_type_codes.push_back(values_type_code); + values_values.emplace_back(); + values_value_offset_counters.push_back(0); + } + values_values[values_type_code].push_back(value); + values_value_type_ids.push_back(values_type_code); + values_value_offsets.push_back(values_value_offset_counters[values_type_code]++); + } + } + statistics_offsets.push_back(offset); + + auto keys_type = 
dictionary(int32(), utf8(), false); + std::vector> values_fields; + for (const auto& type : values_types) { + values_fields.push_back(field(type->name(), type)); + } + auto values_type = dense_union(values_fields); + auto statistics_type = map(keys_type, values_type, false); + auto struct_type = + struct_({field("column", columns_type), field("statistics", statistics_type)}); + + ARROW_ASSIGN_OR_RAISE(auto keys_indices_array, BuildArray(keys_indices)); + ARROW_ASSIGN_OR_RAISE(auto keys_dictionary_array, + BuildArray(keys_dictionary)); + ARROW_ASSIGN_OR_RAISE( + auto keys_array, + DictionaryArray::FromArrays(keys_type, keys_indices_array, keys_dictionary_array)); + + std::vector> values_arrays; + for (const auto& values : values_values) { + ARROW_ASSIGN_OR_RAISE(auto values_array, + BuildArray(values)); + values_arrays.push_back(values_array); + } + ARROW_ASSIGN_OR_RAISE(auto values_value_type_ids_array, + BuildArray(values_value_type_ids)); + ARROW_ASSIGN_OR_RAISE(auto values_value_offsets_array, + BuildArray(values_value_offsets)); + auto values_array = std::make_shared( + values_type, values_value_offsets_array->length(), values_arrays, + values_value_type_ids_array->data()->buffers[1], + values_value_offsets_array->data()->buffers[1]); + ARROW_ASSIGN_OR_RAISE(auto statistics_offsets_array, + BuildArray(statistics_offsets)); + ARROW_ASSIGN_OR_RAISE(auto statistics_array, + MapArray::FromArrays(statistics_type, statistics_offsets_array, + keys_array, values_array)); + std::vector> struct_arrays = {std::move(columns_array), + std::move(statistics_array)}; + return std::make_shared(struct_type, n_columns, struct_arrays); +} +}; // namespace + +TEST_F(TestRecordBatch, MakeStatisticsArrayRowCount) { + auto schema = ::arrow::schema({field("int32", int32())}); + auto int32_array = ArrayFromJSON(int32(), "[1, null, -1]"); + auto batch = RecordBatch::Make(schema, int32_array->length(), {int32_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, 
batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("int32", int32())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto int32_array_data = ArrayFromJSON(int32(), "[1, null, -1]")->data()->Copy(); + int32_array_data->statistics = std::make_shared(); + int32_array_data->statistics->null_count = 1; + auto int32_array = MakeArray(std::move(int32_array_data)); + auto batch = RecordBatch::Make(schema, int32_array->length(), + {no_statistics_array, int32_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_NULL_COUNT_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{int64_t{1}}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCount) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("int32", int32())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto int32_array_data = ArrayFromJSON(int32(), "[1, null, -1]")->data()->Copy(); + int32_array_data->statistics = std::make_shared(); + int32_array_data->statistics->null_count = 1; + int32_array_data->statistics->distinct_count = 2; + auto int32_array = MakeArray(std::move(int32_array_data)); + auto batch = RecordBatch::Make(schema, int32_array->length(), + {no_statistics_array, int32_array}); + + 
ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_NULL_COUNT_EXACT, + ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{int64_t{1}}, + ArrayStatistics::ValueType{int64_t{2}}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayMinExact) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("uint32", uint32())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto uint32_array_data = ArrayFromJSON(uint32(), "[100, null, 1]")->data()->Copy(); + uint32_array_data->statistics = std::make_shared(); + uint32_array_data->statistics->is_min_exact = true; + uint32_array_data->statistics->min = uint64_t{1}; + auto uint32_array = MakeArray(std::move(uint32_array_data)); + auto batch = RecordBatch::Make(schema, uint32_array->length(), + {no_statistics_array, uint32_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MIN_VALUE_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{uint64_t{1}}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayMinApproximate) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("int32", int32())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto int32_array_data = ArrayFromJSON(int32(), "[1, null, -1]")->data()->Copy(); + int32_array_data->statistics = 
std::make_shared(); + int32_array_data->statistics->min = -1.0; + auto int32_array = MakeArray(std::move(int32_array_data)); + auto batch = RecordBatch::Make(schema, int32_array->length(), + {no_statistics_array, int32_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{-1.0}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayMaxExact) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("boolean", boolean())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto boolean_array_data = + ArrayFromJSON(boolean(), "[true, null, false]")->data()->Copy(); + boolean_array_data->statistics = std::make_shared(); + boolean_array_data->statistics->is_max_exact = true; + boolean_array_data->statistics->max = true; + auto boolean_array = MakeArray(std::move(boolean_array_data)); + auto batch = RecordBatch::Make(schema, boolean_array->length(), + {no_statistics_array, boolean_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{true}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("float64", float64())}); + auto no_statistics_array = 
ArrayFromJSON(boolean(), "[true, false, true]"); + auto float64_array_data = ArrayFromJSON(float64(), "[1.0, null, -1.0]")->data()->Copy(); + float64_array_data->statistics = std::make_shared(); + float64_array_data->statistics->max = 1.0; + auto float64_array = MakeArray(std::move(float64_array_data)); + auto batch = RecordBatch::Make(schema, float64_array->length(), + {no_statistics_array, float64_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{1.0}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayString) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); + string_array_data->statistics = std::make_shared(); + string_array_data->statistics->is_max_exact = true; + string_array_data->statistics->max = std::string("c"); + auto string_array = MakeArray(std::move(string_array_data)); + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{std::string("c")}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + template class 
TestBatchToTensorColumnMajor : public ::testing::Test {};