diff --git a/cpp/src/parquet/column/statistics-test.cc b/cpp/src/parquet/column/statistics-test.cc index d1e1eebd6f4c2..3ed7aab0785b6 100644 --- a/cpp/src/parquet/column/statistics-test.cc +++ b/cpp/src/parquet/column/statistics-test.cc @@ -66,7 +66,8 @@ class TestRowGroupStatistics : public PrimitiveTypedTest { std::string encoded_max = statistics1.EncodeMax(); TypedStats statistics2( - this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), 0, 0); + this->schema_.Column(0), encoded_min, encoded_max, + this->values_.size(), 0, 0, true); ASSERT_EQ(encoded_min, statistics2.EncodeMin()); ASSERT_EQ(encoded_max, statistics2.EncodeMax()); @@ -235,6 +236,30 @@ void TestRowGroupStatistics::DeepFree(std::vector& val } } +template<> +void TestRowGroupStatistics::TestMinMaxEncode() { + this->GenerateData(1000); + // Test that we encode min max strings correctly + TypedRowGroupStatistics statistics1(this->schema_.Column(0)); + statistics1.Update(this->values_ptr_, this->values_.size(), 0); + std::string encoded_min = statistics1.EncodeMin(); + std::string encoded_max = statistics1.EncodeMax(); + + // encoded is same as unencoded + ASSERT_EQ(encoded_min, std::string((const char*)statistics1.min().ptr, + statistics1.min().len)); + ASSERT_EQ(encoded_max, std::string((const char*)statistics1.max().ptr, + statistics1.max().len)); + + TypedRowGroupStatistics statistics2( + this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), 0, 0, true); + + ASSERT_EQ(encoded_min, statistics2.EncodeMin()); + ASSERT_EQ(encoded_max, statistics2.EncodeMax()); + ASSERT_EQ(statistics1.min(), statistics2.min()); + ASSERT_EQ(statistics1.max(), statistics2.max()); +} + using TestTypes = ::testing::Types; diff --git a/cpp/src/parquet/column/statistics.cc b/cpp/src/parquet/column/statistics.cc index d8f87859ecd42..d761571c09ea1 100644 --- a/cpp/src/parquet/column/statistics.cc +++ b/cpp/src/parquet/column/statistics.cc @@ -54,7 +54,8 @@ TypedRowGroupStatistics::TypedRowGroupStatistics(const typename DType::c_ template TypedRowGroupStatistics::TypedRowGroupStatistics(const ColumnDescriptor* schema, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, - int64_t null_count, int64_t distinct_count, MemoryAllocator* allocator) + int64_t null_count, int64_t distinct_count, bool has_min_max, + MemoryAllocator* allocator) : allocator_(allocator), min_buffer_(0, allocator_), max_buffer_(0, allocator_) { IncrementNumValues(num_values); IncrementNullCount(null_count); @@ -64,7 +65,7 @@ TypedRowGroupStatistics::TypedRowGroupStatistics(const ColumnDescriptor* if (!encoded_min.empty()) { PlainDecode(encoded_min, &min_); } if (!encoded_max.empty()) { PlainDecode(encoded_max, &max_); } - has_min_max_ = !encoded_min.empty() && !encoded_max.empty(); + has_min_max_ = has_min_max; } template @@ -160,6 +161,19 @@ void TypedRowGroupStatistics::PlainDecode(const std::string& src, T* dst) decoder.Decode(dst, 1); } +template <> +void TypedRowGroupStatistics::PlainEncode( + const T& src, std::string* dst) { + dst->assign(reinterpret_cast(src.ptr), src.len); +} + +template <> +void TypedRowGroupStatistics::PlainDecode( + const std::string& src, T* dst) { + dst->len = src.size(); + dst->ptr = reinterpret_cast(src.c_str()); +} + template class TypedRowGroupStatistics; template class TypedRowGroupStatistics; template class TypedRowGroupStatistics; diff --git a/cpp/src/parquet/column/statistics.h b/cpp/src/parquet/column/statistics.h index dc576c838c546..a3b28213155c4 100644 --- a/cpp/src/parquet/column/statistics.h +++ b/cpp/src/parquet/column/statistics.h @@ -142,7 +142,8 @@ class TypedRowGroupStatistics : public RowGroupStatistics { TypedRowGroupStatistics(const ColumnDescriptor* schema, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, MemoryAllocator* allocator = default_allocator()); + int64_t distinct_count, bool has_min_max, + MemoryAllocator* allocator = default_allocator()); bool HasMinMax() const override; void Reset() override; @@ -195,6 +196,14 @@ inline void TypedRowGroupStatistics::Copy( *dst = ByteArray(src.len, buffer.data()); } +template <> +void TypedRowGroupStatistics::PlainEncode( + const T& src, std::string* dst); + +template <> +void TypedRowGroupStatistics::PlainDecode( + const std::string& src, T* dst); + using BoolStatistics = TypedRowGroupStatistics; using Int32Statistics = TypedRowGroupStatistics; using Int64Statistics = TypedRowGroupStatistics; diff --git a/cpp/src/parquet/file/metadata.cc b/cpp/src/parquet/file/metadata.cc index d50c9155afcbb..25807061e92b1 100644 --- a/cpp/src/parquet/file/metadata.cc +++ b/cpp/src/parquet/file/metadata.cc @@ -29,7 +29,8 @@ static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { return std::make_shared>(descr, metadata.statistics.min, metadata.statistics.max, metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count); + metadata.statistics.null_count, metadata.statistics.distinct_count, + metadata.statistics.__isset.max || metadata.statistics.__isset.min); } std::shared_ptr MakeColumnStats( diff --git a/cpp/src/parquet/file/reader-internal.cc b/cpp/src/parquet/file/reader-internal.cc index fa193904e64af..5eda79b649932 100644 --- a/cpp/src/parquet/file/reader-internal.cc +++ b/cpp/src/parquet/file/reader-internal.cc @@ -175,14 +175,6 @@ std::unique_ptr SerializedRowGroup::GetColumnPageReader(int i) { std::move(stream), col->compression(), properties_.allocator())); } -template -static std::shared_ptr MakeColumnStats( - const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { - return std::make_shared>(descr, metadata.statistics.min, - metadata.statistics.max, metadata.num_values, metadata.statistics.null_count, - metadata.statistics.distinct_count); -} - // ---------------------------------------------------------------------- // SerializedFile: Parquet on-disk layout