Skip to content
This repository was archived by the owner on May 10, 2024. It is now read-only.

Commit cf6992a

Browse files
Deepak Majetiwesm
authored andcommitted
PARQUET-1002: Compute statistics based on Sort Order
@lomereiter, You might also want to take a look since you previously implemented the Statistics API. Author: Deepak Majeti <deepak.majeti@hpe.com> Closes #383 from majetideepak/PARQUET-1002 and squashes the following commits: 5a93fe3 [Deepak Majeti] fix error 48dd22d [Deepak Majeti] change fix version from 1.2.1 to 1.3.0 712ea90 [Deepak Majeti] Move test to statistics-test 75ea475 [Deepak Majeti] Fix formatting c5d9610 [Deepak Majeti] Rename compare to comparator ba18ae6 [Deepak Majeti] Review comments f87bda6 [Deepak Majeti] Avoid UTF-8 file encoding mismatch between windows and linux 3cb8a3e [Deepak Majeti] Review comments 17a79f3 [Deepak Majeti] fix failures 85a9052 [Deepak Majeti] fix Clang failure and improve test Fix Warnings on Windows bdd8d37 [Deepak Majeti] Add another test 6a985de [Deepak Majeti] Comments 808e764 [Deepak Majeti] fix failure 55960fe [Deepak Majeti] format b4fba8b [Deepak Majeti] Add test for Unknown sort order d58658d [Deepak Majeti] make format c1abb69 [Deepak Majeti] rename to Make 3df2686 [Deepak Majeti] Add Reader Writer Statistics Test c4b1827 [Deepak Majeti] extend testing fix tests 4287565 [Deepak Majeti] Fix reader read new max and min values 046f8d3 [Deepak Majeti] Move SortOrder to types.h add int32 comparison INT96 fix statistics in metadata Use templates
1 parent 15f9839 commit cf6992a

17 files changed

+871
-170
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,7 @@ set(LIBPARQUET_SRCS
646646

647647
src/parquet/parquet_constants.cpp
648648
src/parquet/parquet_types.cpp
649+
src/parquet/util/comparison.cc
649650
src/parquet/util/memory.cc
650651
)
651652

src/parquet/column_writer-test.cc

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -151,14 +151,16 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
151151
void ReadAndCompare(Compression::type compression, int64_t num_rows) {
152152
this->SetupValuesOut(num_rows);
153153
this->ReadColumnFully(compression);
154-
Compare<T> compare(this->descr_);
154+
std::shared_ptr<CompareDefault<TestType>> compare;
155+
compare = std::static_pointer_cast<CompareDefault<TestType>>(
156+
Comparator::Make(this->descr_));
155157
for (size_t i = 0; i < this->values_.size(); i++) {
156-
if (compare(this->values_[i], this->values_out_[i]) ||
157-
compare(this->values_out_[i], this->values_[i])) {
158+
if ((*compare)(this->values_[i], this->values_out_[i]) ||
159+
(*compare)(this->values_out_[i], this->values_[i])) {
158160
std::cout << "Failed at " << i << std::endl;
159161
}
160-
ASSERT_FALSE(compare(this->values_[i], this->values_out_[i]));
161-
ASSERT_FALSE(compare(this->values_out_[i], this->values_[i]));
162+
ASSERT_FALSE((*compare)(this->values_[i], this->values_out_[i]));
163+
ASSERT_FALSE((*compare)(this->values_out_[i], this->values_[i]));
162164
}
163165
ASSERT_EQ(this->values_, this->values_out_);
164166
}

src/parquet/column_writer.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,10 @@ int64_t ColumnWriter::Close() {
267267
FlushBufferedDataPages();
268268

269269
EncodedStatistics chunk_statistics = GetChunkStatistics();
270-
if (chunk_statistics.is_set()) metadata_->SetStatistics(chunk_statistics);
270+
if (chunk_statistics.is_set()) {
271+
metadata_->SetStatistics(SortOrder::SIGNED == descr_->sort_order(),
272+
chunk_statistics);
273+
}
271274
pager_->Close(has_dictionary_, fallback_);
272275
}
273276

@@ -317,7 +320,8 @@ TypedColumnWriter<Type>::TypedColumnWriter(ColumnChunkMetaDataBuilder* metadata,
317320
ParquetException::NYI("Selected encoding is not supported");
318321
}
319322

320-
if (properties->statistics_enabled(descr_->path())) {
323+
if (properties->statistics_enabled(descr_->path()) &&
324+
(SortOrder::UNKNOWN != descr_->sort_order())) {
321325
page_statistics_ = std::unique_ptr<TypedStats>(new TypedStats(descr_, allocator_));
322326
chunk_statistics_ = std::unique_ptr<TypedStats>(new TypedStats(descr_, allocator_));
323327
}

src/parquet/file/file-metadata-test.cc

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ TEST(Metadata, TestBuildAccess) {
6363
auto col1_builder = rg1_builder->NextColumnChunk();
6464
auto col2_builder = rg1_builder->NextColumnChunk();
6565
// column metadata
66-
col1_builder->SetStatistics(stats_int);
67-
col2_builder->SetStatistics(stats_float);
66+
col1_builder->SetStatistics(true, stats_int);
67+
col2_builder->SetStatistics(true, stats_float);
6868
col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false);
6969
col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false);
7070
rg1_builder->Finish(1024);
@@ -73,8 +73,8 @@ TEST(Metadata, TestBuildAccess) {
7373
col1_builder = rg2_builder->NextColumnChunk();
7474
col2_builder = rg2_builder->NextColumnChunk();
7575
// column metadata
76-
col1_builder->SetStatistics(stats_int);
77-
col2_builder->SetStatistics(stats_float);
76+
col1_builder->SetStatistics(true, stats_int);
77+
col2_builder->SetStatistics(true, stats_float);
7878
col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false);
7979
col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false);
8080
rg2_builder->Finish(1024);
@@ -215,11 +215,12 @@ TEST(ApplicationVersion, Basics) {
215215

216216
ASSERT_EQ(true, version.VersionLt(version1));
217217

218-
ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96));
219-
ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32));
220-
ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY));
221-
ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY));
222-
ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY));
218+
ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, SortOrder::SIGNED));
219+
ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, SortOrder::SIGNED));
220+
ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
221+
ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
222+
ASSERT_TRUE(
223+
version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED));
223224
}
224225

225226
} // namespace metadata

src/parquet/file/metadata.cc

Lines changed: 41 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -35,62 +35,20 @@ const ApplicationVersion ApplicationVersion::PARQUET_251_FIXED_VERSION =
3535
ApplicationVersion("parquet-mr version 1.8.0");
3636
const ApplicationVersion ApplicationVersion::PARQUET_816_FIXED_VERSION =
3737
ApplicationVersion("parquet-mr version 1.2.9");
38-
39-
// Return the Sort Order of the Parquet Physical Types
40-
SortOrder default_sort_order(Type::type primitive) {
41-
switch (primitive) {
42-
case Type::BOOLEAN:
43-
case Type::INT32:
44-
case Type::INT64:
45-
case Type::FLOAT:
46-
case Type::DOUBLE:
47-
return SortOrder::SIGNED;
48-
case Type::BYTE_ARRAY:
49-
case Type::FIXED_LEN_BYTE_ARRAY:
50-
case Type::INT96: // only used for timestamp, which uses unsigned values
51-
return SortOrder::UNSIGNED;
52-
}
53-
return SortOrder::UNKNOWN;
54-
}
55-
56-
// Return the SortOrder of the Parquet Types using Logical or Physical Types
57-
SortOrder get_sort_order(LogicalType::type converted, Type::type primitive) {
58-
if (converted == LogicalType::NONE) return default_sort_order(primitive);
59-
switch (converted) {
60-
case LogicalType::INT_8:
61-
case LogicalType::INT_16:
62-
case LogicalType::INT_32:
63-
case LogicalType::INT_64:
64-
case LogicalType::DATE:
65-
case LogicalType::TIME_MICROS:
66-
case LogicalType::TIME_MILLIS:
67-
case LogicalType::TIMESTAMP_MICROS:
68-
case LogicalType::TIMESTAMP_MILLIS:
69-
return SortOrder::SIGNED;
70-
case LogicalType::UINT_8:
71-
case LogicalType::UINT_16:
72-
case LogicalType::UINT_32:
73-
case LogicalType::UINT_64:
74-
case LogicalType::ENUM:
75-
case LogicalType::UTF8:
76-
case LogicalType::BSON:
77-
case LogicalType::JSON:
78-
return SortOrder::UNSIGNED;
79-
case LogicalType::NA:
80-
case LogicalType::DECIMAL:
81-
case LogicalType::LIST:
82-
case LogicalType::MAP:
83-
case LogicalType::MAP_KEY_VALUE:
84-
case LogicalType::INTERVAL:
85-
case LogicalType::NONE: // required instead of default
86-
return SortOrder::UNKNOWN;
87-
}
88-
return SortOrder::UNKNOWN;
89-
}
38+
const ApplicationVersion ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION =
39+
ApplicationVersion("parquet-cpp version 1.3.0");
9040

9141
template <typename DType>
9242
static std::shared_ptr<RowGroupStatistics> MakeTypedColumnStats(
9343
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
44+
// If new fields max_value/min_value are set, then return them.
45+
if (metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value) {
46+
return std::make_shared<TypedRowGroupStatistics<DType>>(
47+
descr, metadata.statistics.min_value, metadata.statistics.max_value,
48+
metadata.num_values - metadata.statistics.null_count,
49+
metadata.statistics.null_count, metadata.statistics.distinct_count, true);
50+
}
51+
// Default behavior
9452
return std::make_shared<TypedRowGroupStatistics<DType>>(
9553
descr, metadata.statistics.min, metadata.statistics.max,
9654
metadata.num_values - metadata.statistics.null_count,
@@ -159,9 +117,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
159117
inline bool is_stats_set() const {
160118
DCHECK(writer_version_ != nullptr);
161119
return column_->meta_data.__isset.statistics &&
162-
writer_version_->HasCorrectStatistics(type()) &&
163-
SortOrder::SIGNED ==
164-
get_sort_order(descr_->logical_type(), descr_->physical_type());
120+
writer_version_->HasCorrectStatistics(type(), descr_->sort_order());
165121
}
166122

167123
inline std::shared_ptr<RowGroupStatistics> statistics() const {
@@ -534,15 +490,21 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons
534490
// Reference:
535491
// parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java
536492
// PARQUET-686 has more disussion on statistics
537-
bool ApplicationVersion::HasCorrectStatistics(Type::type col_type) const {
538-
// None of the current tools write INT96 Statistics correctly
539-
if (col_type == Type::INT96) return false;
540-
541-
// Statistics of other types are OK
542-
if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) {
543-
return true;
493+
bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
494+
SortOrder::type sort_order) const {
495+
// Parquet cpp version 1.3.0 onwards stats are computed correctly for all types
496+
if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION))) {
497+
// Only SIGNED are valid
498+
if (SortOrder::SIGNED != sort_order) return false;
499+
500+
// None of the current tools write INT96 Statistics correctly
501+
if (col_type == Type::INT96) return false;
502+
503+
// Statistics of other types are OK
504+
if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) {
505+
return true;
506+
}
544507
}
545-
546508
// created_by is not populated, which could have been caused by
547509
// parquet-mr during the same time as PARQUET-251, see PARQUET-297
548510
if (application_ == "unknown") {
@@ -577,16 +539,24 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
577539
void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); }
578540

579541
// column metadata
580-
void SetStatistics(const EncodedStatistics& val) {
542+
void SetStatistics(bool is_signed, const EncodedStatistics& val) {
581543
format::Statistics stats;
582544
stats.null_count = val.null_count;
583545
stats.distinct_count = val.distinct_count;
584-
stats.max = val.max();
585-
stats.min = val.min();
586-
stats.__isset.min = val.has_min;
587-
stats.__isset.max = val.has_max;
546+
stats.max_value = val.max();
547+
stats.min_value = val.min();
548+
stats.__isset.min_value = val.has_min;
549+
stats.__isset.max_value = val.has_max;
588550
stats.__isset.null_count = val.has_null_count;
589551
stats.__isset.distinct_count = val.has_distinct_count;
552+
// If the order is SIGNED, then the old min/max values must be set too.
553+
// This for backward compatibility
554+
if (is_signed) {
555+
stats.max = val.max();
556+
stats.min = val.min();
557+
stats.__isset.min = val.has_min;
558+
stats.__isset.max = val.has_max;
559+
}
590560

591561
column_chunk_->meta_data.__set_statistics(stats);
592562
}
@@ -674,8 +644,9 @@ const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const {
674644
return impl_->descr();
675645
}
676646

677-
void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) {
678-
impl_->SetStatistics(result);
647+
void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed,
648+
const EncodedStatistics& result) {
649+
impl_->SetStatistics(is_signed, result);
679650
}
680651

681652
class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {

src/parquet/file/metadata.h

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -35,22 +35,12 @@ namespace parquet {
3535

3636
using KeyValueMetadata = ::arrow::KeyValueMetadata;
3737

38-
// Reference:
39-
// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
40-
// format/converter/ParquetMetadataConverter.java
41-
// Sort order for page and column statistics. Types are associated with sort
42-
// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
43-
// aggregated using a sort order. As of parquet-format version 2.3.1, the
44-
// order used to aggregate stats is always SIGNED and is not stored in the
45-
// Parquet file. These stats are discarded for types that need unsigned.
46-
// See PARQUET-686.
47-
enum SortOrder { SIGNED, UNSIGNED, UNKNOWN };
48-
4938
class ApplicationVersion {
5039
public:
5140
// Known Versions with Issues
5241
static const ApplicationVersion PARQUET_251_FIXED_VERSION;
5342
static const ApplicationVersion PARQUET_816_FIXED_VERSION;
43+
static const ApplicationVersion PARQUET_CPP_FIXED_STATS_VERSION;
5444
// Regular expression for the version format
5545
// major . minor . patch unknown - prerelease.x + build info
5646
// Eg: 1.5.0ab-cdh5.5.0+cd
@@ -92,7 +82,8 @@ class ApplicationVersion {
9282
bool VersionEq(const ApplicationVersion& other_version) const;
9383

9484
// Checks if the Version has the correct statistics for a given column
95-
bool HasCorrectStatistics(Type::type primitive) const;
85+
bool HasCorrectStatistics(Type::type primitive,
86+
SortOrder::type sort_order = SortOrder::SIGNED) const;
9687
};
9788

9889
class PARQUET_EXPORT ColumnChunkMetaData {
@@ -209,7 +200,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
209200
// Used when a dataset is spread across multiple files
210201
void set_file_path(const std::string& path);
211202
// column metadata
212-
void SetStatistics(const EncodedStatistics& stats);
203+
void SetStatistics(bool is_signed, const EncodedStatistics& stats);
213204
// get the column descriptor
214205
const ColumnDescriptor* descr() const;
215206
// commit the metadata

src/parquet/parquet_version.h.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@
2121
// define the parquet created by version
2222
#define CREATED_BY_VERSION "parquet-cpp version @PARQUET_VERSION@"
2323

24-
#endif // PARQUET_VERSION_H
24+
#endif // PARQUET_VERSION_H

src/parquet/schema.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,10 @@ class PARQUET_EXPORT ColumnDescriptor {
332332

333333
LogicalType::type logical_type() const { return primitive_node_->logical_type(); }
334334

335+
SortOrder::type sort_order() const {
336+
return GetSortOrder(logical_type(), physical_type());
337+
}
338+
335339
const std::string& name() const { return primitive_node_->name(); }
336340

337341
const std::shared_ptr<schema::ColumnPath> path() const;

0 commit comments

Comments
 (0)