From 046f8d3d39dcd76c931ecd30cec2e0d97d3a7edd Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Thu, 25 May 2017 15:43:00 -0400 Subject: [PATCH 01/21] Move SortOrder to types.h add int32 comparison INT96 fix statistics in metadata Use templates --- CMakeLists.txt | 1 + src/parquet/column_writer-test.cc | 11 +- src/parquet/column_writer.cc | 8 +- src/parquet/file/file-metadata-test.cc | 8 +- src/parquet/file/metadata.cc | 74 +++--------- src/parquet/file/metadata.h | 13 +- src/parquet/statistics.cc | 24 ++-- src/parquet/statistics.h | 2 + src/parquet/types.cc | 52 ++++++++ src/parquet/types.h | 17 +++ src/parquet/util/comparison-test.cc | 6 +- src/parquet/util/comparison.cc | 79 +++++++++++++ src/parquet/util/comparison.h | 158 +++++++++++++++++++++---- 13 files changed, 330 insertions(+), 123 deletions(-) create mode 100644 src/parquet/util/comparison.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index b7a41d87..1ecf8773 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -646,6 +646,7 @@ set(LIBPARQUET_SRCS src/parquet/parquet_constants.cpp src/parquet/parquet_types.cpp + src/parquet/util/comparison.cc src/parquet/util/memory.cc ) diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc index 3ec36635..2c3288a3 100644 --- a/src/parquet/column_writer-test.cc +++ b/src/parquet/column_writer-test.cc @@ -151,14 +151,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { void ReadAndCompare(Compression::type compression, int64_t num_rows) { this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); - Compare compare(this->descr_); + std::shared_ptr >compare; + compare = std::static_pointer_cast >(Compare::getComparator(this->descr_)); for (size_t i = 0; i < this->values_.size(); i++) { - if (compare(this->values_[i], this->values_out_[i]) || - compare(this->values_out_[i], this->values_[i])) { + if ((*compare)(this->values_[i], this->values_out_[i]) || + (*compare)(this->values_out_[i], this->values_[i])) { std::cout << "Failed at " << i << std::endl; } - ASSERT_FALSE(compare(this->values_[i], this->values_out_[i])); - ASSERT_FALSE(compare(this->values_out_[i], this->values_[i])); + ASSERT_FALSE((*compare)(this->values_[i], this->values_out_[i])); + ASSERT_FALSE((*compare)(this->values_out_[i], this->values_[i])); } ASSERT_EQ(this->values_, this->values_out_); } diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index b36f3958..f0e85391 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -267,7 +267,9 @@ int64_t ColumnWriter::Close() { FlushBufferedDataPages(); EncodedStatistics chunk_statistics = GetChunkStatistics(); - if (chunk_statistics.is_set()) metadata_->SetStatistics(chunk_statistics); + if (chunk_statistics.is_set()) metadata_->SetStatistics( + SortOrder::SIGNED == GetSortOrder(descr_->logical_type(), + descr_->physical_type()), chunk_statistics); pager_->Close(has_dictionary_, fallback_); } @@ -317,7 +319,9 @@ TypedColumnWriter::TypedColumnWriter(ColumnChunkMetaDataBuilder* metadata, ParquetException::NYI("Selected encoding is not supported"); } - if (properties->statistics_enabled(descr_->path())) { + if (properties->statistics_enabled(descr_->path()) && + (SortOrder::UNKNOWN != GetSortOrder(descr_->logical_type(), + descr_->physical_type())) ) { page_statistics_ = std::unique_ptr(new TypedStats(descr_, allocator_)); chunk_statistics_ = std::unique_ptr(new TypedStats(descr_, allocator_)); } diff --git a/src/parquet/file/file-metadata-test.cc b/src/parquet/file/file-metadata-test.cc index a7c438c8..5291dbf8 100644 --- a/src/parquet/file/file-metadata-test.cc +++ b/src/parquet/file/file-metadata-test.cc @@ -63,8 +63,8 @@ TEST(Metadata, TestBuildAccess) { auto col1_builder = rg1_builder->NextColumnChunk(); auto col2_builder = rg1_builder->NextColumnChunk(); // column metadata - col1_builder->SetStatistics(stats_int); - col2_builder->SetStatistics(stats_float); + col1_builder->SetStatistics(true, stats_int); + col2_builder->SetStatistics(true, stats_float); col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); rg1_builder->Finish(1024); @@ -73,8 +73,8 @@ TEST(Metadata, TestBuildAccess) { col1_builder = rg2_builder->NextColumnChunk(); col2_builder = rg2_builder->NextColumnChunk(); // column metadata - col1_builder->SetStatistics(stats_int); - col2_builder->SetStatistics(stats_float); + col1_builder->SetStatistics(true, stats_int); + col2_builder->SetStatistics(true, stats_float); col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); rg2_builder->Finish(1024); diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc index d5a96f31..1aed6d60 100644 --- a/src/parquet/file/metadata.cc +++ b/src/parquet/file/metadata.cc @@ -36,58 +36,6 @@ const ApplicationVersion ApplicationVersion::PARQUET_251_FIXED_VERSION = const ApplicationVersion ApplicationVersion::PARQUET_816_FIXED_VERSION = ApplicationVersion("parquet-mr version 1.2.9"); -// Return the Sort Order of the Parquet Physical Types -SortOrder default_sort_order(Type::type primitive) { - switch (primitive) { - case Type::BOOLEAN: - case Type::INT32: - case Type::INT64: - case Type::FLOAT: - case Type::DOUBLE: - return SortOrder::SIGNED; - case Type::BYTE_ARRAY: - case Type::FIXED_LEN_BYTE_ARRAY: - case Type::INT96: // only used for timestamp, which uses unsigned values - return SortOrder::UNSIGNED; - } - return SortOrder::UNKNOWN; -} - -// Return the SortOrder of the Parquet Types using Logical or Physical Types -SortOrder get_sort_order(LogicalType::type converted, Type::type primitive) { - if (converted == LogicalType::NONE) return default_sort_order(primitive); - switch (converted) { - case LogicalType::INT_8: - case LogicalType::INT_16: - case LogicalType::INT_32: - case LogicalType::INT_64: - case LogicalType::DATE: - case LogicalType::TIME_MICROS: - case LogicalType::TIME_MILLIS: - case LogicalType::TIMESTAMP_MICROS: - case LogicalType::TIMESTAMP_MILLIS: - return SortOrder::SIGNED; - case LogicalType::UINT_8: - case LogicalType::UINT_16: - case LogicalType::UINT_32: - case LogicalType::UINT_64: - case LogicalType::ENUM: - case LogicalType::UTF8: - case LogicalType::BSON: - case LogicalType::JSON: - return SortOrder::UNSIGNED; - case LogicalType::NA: - case LogicalType::DECIMAL: - case LogicalType::LIST: - case LogicalType::MAP: - case LogicalType::MAP_KEY_VALUE: - case LogicalType::INTERVAL: - case LogicalType::NONE: // required instead of default - return SortOrder::UNKNOWN; - } - return SortOrder::UNKNOWN; -} - template static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { @@ -161,7 +109,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return column_->meta_data.__isset.statistics && writer_version_->HasCorrectStatistics(type()) && SortOrder::SIGNED == - get_sort_order(descr_->logical_type(), descr_->physical_type()); + GetSortOrder(descr_->logical_type(), descr_->physical_type()); } inline std::shared_ptr statistics() const { @@ -577,16 +525,22 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); } // column metadata - void SetStatistics(const EncodedStatistics& val) { + void SetStatistics(bool is_signed, const EncodedStatistics& val) { format::Statistics stats; stats.null_count = val.null_count; stats.distinct_count = val.distinct_count; - stats.max = val.max(); - stats.min = val.min(); - stats.__isset.min = val.has_min; - stats.__isset.max = val.has_max; + stats.max_value = val.max(); + stats.min_value = val.min(); + stats.__isset.min_value = val.has_min; + stats.__isset.max_value = val.has_max; stats.__isset.null_count = val.has_null_count; stats.__isset.distinct_count = val.has_distinct_count; + if (is_signed) { + stats.max = val.max(); + stats.min = val.min(); + stats.__isset.min = val.has_min; + stats.__isset.max = val.has_max; + } column_chunk_->meta_data.__set_statistics(stats); } @@ -674,8 +628,8 @@ const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { return impl_->descr(); } -void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) { - impl_->SetStatistics(result); +void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed, const EncodedStatistics& result) { + impl_->SetStatistics(is_signed, result); } class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { diff --git a/src/parquet/file/metadata.h b/src/parquet/file/metadata.h index 4250f6b5..986e9c4d 100644 --- a/src/parquet/file/metadata.h +++ b/src/parquet/file/metadata.h @@ -35,17 +35,6 @@ namespace parquet { using KeyValueMetadata = ::arrow::KeyValueMetadata; -// Reference: -// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/ -// format/converter/ParquetMetadataConverter.java -// Sort order for page and column statistics. Types are associated with sort -// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are -// aggregated using a sort order. As of parquet-format version 2.3.1, the -// order used to aggregate stats is always SIGNED and is not stored in the -// Parquet file. These stats are discarded for types that need unsigned. -// See PARQUET-686. -enum SortOrder { SIGNED, UNSIGNED, UNKNOWN }; - class ApplicationVersion { public: // Known Versions with Issues @@ -209,7 +198,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // Used when a dataset is spread across multiple files void set_file_path(const std::string& path); // column metadata - void SetStatistics(const EncodedStatistics& stats); + void SetStatistics(bool is_signed, const EncodedStatistics& stats); // get the column descriptor const ColumnDescriptor* descr() const; // commit the metadata diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc index 12d1f5bb..43bddaa8 100644 --- a/src/parquet/statistics.cc +++ b/src/parquet/statistics.cc @@ -21,7 +21,6 @@ #include "parquet/encoding-internal.h" #include "parquet/exception.h" #include "parquet/statistics.h" -#include "parquet/util/comparison.h" #include "parquet/util/memory.h" using arrow::default_memory_pool; @@ -35,6 +34,7 @@ TypedRowGroupStatistics::TypedRowGroupStatistics(const ColumnDescriptor* : pool_(pool), min_buffer_(AllocateBuffer(pool_, 0)), max_buffer_(AllocateBuffer(pool_, 0)) { + comparator_ = std::static_pointer_cast >(Compare::getComparator(schema)); SetDescr(schema); Reset(); } @@ -69,6 +69,7 @@ TypedRowGroupStatistics::TypedRowGroupStatistics( IncrementNullCount(null_count); IncrementDistinctCount(distinct_count); + comparator_ = std::static_pointer_cast >(Compare::getComparator(schema)); SetDescr(schema); if (!encoded_min.empty()) { @@ -102,15 +103,14 @@ void TypedRowGroupStatistics::Update(const T* values, int64_t num_not_nul // TODO: support distinct count? if (num_not_null == 0) return; - Compare compare(descr_); - auto batch_minmax = std::minmax_element(values, values + num_not_null, compare); + auto batch_minmax = std::minmax_element(values, values + num_not_null, std::ref(*(this->comparator_))); if (!has_min_max_) { has_min_max_ = true; Copy(*batch_minmax.first, &min_, min_buffer_.get()); Copy(*batch_minmax.second, &max_, max_buffer_.get()); } else { - Copy(std::min(min_, *batch_minmax.first, compare), &min_, min_buffer_.get()); - Copy(std::max(max_, *batch_minmax.second, compare), &max_, max_buffer_.get()); + Copy(std::min(min_, *batch_minmax.first, std::ref(*(this->comparator_))), &min_, min_buffer_.get()); + Copy(std::max(max_, *batch_minmax.second, std::ref(*(this->comparator_))), &max_, max_buffer_.get()); } } @@ -128,7 +128,6 @@ void TypedRowGroupStatistics::UpdateSpaced(const T* values, // TODO: support distinct count? if (num_not_null == 0) return; - Compare compare(descr_); INIT_BITSET(valid_bits, static_cast(valid_bits_offset)); // Find first valid entry and use that for min/max // As (num_not_null != 0) there must be one @@ -144,9 +143,9 @@ void TypedRowGroupStatistics::UpdateSpaced(const T* values, T max = values[i]; for (; i < length; i++) { if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { - if (compare(values[i], min)) { + if ((std::ref(*(this->comparator_)))(values[i], min)) { min = values[i]; - } else if (compare(max, values[i])) { + } else if ((std::ref(*(this->comparator_)))(max, values[i])) { max = values[i]; } } @@ -157,8 +156,8 @@ void TypedRowGroupStatistics::UpdateSpaced(const T* values, Copy(min, &min_, min_buffer_.get()); Copy(max, &max_, max_buffer_.get()); } else { - Copy(std::min(min_, min, compare), &min_, min_buffer_.get()); - Copy(std::max(max_, max, compare), &max_, max_buffer_.get()); + Copy(std::min(min_, min, std::ref(*(this->comparator_))), &min_, min_buffer_.get()); + Copy(std::max(max_, max, std::ref(*(this->comparator_))), &max_, max_buffer_.get()); } } @@ -185,9 +184,8 @@ void TypedRowGroupStatistics::Merge(const TypedRowGroupStatistics& return; } - Compare compare(descr_); - Copy(std::min(this->min_, other.min_, compare), &this->min_, min_buffer_.get()); - Copy(std::max(this->max_, other.max_, compare), &this->max_, max_buffer_.get()); + Copy(std::min(this->min_, other.min_, std::ref(*(this->comparator_))), &this->min_, min_buffer_.get()); + Copy(std::max(this->max_, other.max_, std::ref(*(this->comparator_))), &this->max_, max_buffer_.get()); } template diff --git a/src/parquet/statistics.h b/src/parquet/statistics.h index 12d05557..13ebfd3e 100644 --- a/src/parquet/statistics.h +++ b/src/parquet/statistics.h @@ -24,6 +24,7 @@ #include "parquet/schema.h" #include "parquet/types.h" +#include "parquet/util/comparison.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -164,6 +165,7 @@ class TypedRowGroupStatistics : public RowGroupStatistics { T min_; T max_; ::arrow::MemoryPool* pool_; + std::shared_ptr > comparator_; void PlainEncode(const T& src, std::string* dst); void PlainDecode(const std::string& src, T* dst); diff --git a/src/parquet/types.cc b/src/parquet/types.cc index 97c769be..515026cd 100644 --- a/src/parquet/types.cc +++ b/src/parquet/types.cc @@ -221,4 +221,56 @@ int GetTypeByteSize(Type::type parquet_type) { return 0; } +// Return the Sort Order of the Parquet Physical Types +SortOrder::type DefaultSortOrder(Type::type primitive) { + switch (primitive) { + case Type::BOOLEAN: + case Type::INT32: + case Type::INT64: + case Type::FLOAT: + case Type::DOUBLE: + return SortOrder::SIGNED; + case Type::BYTE_ARRAY: + case Type::FIXED_LEN_BYTE_ARRAY: + case Type::INT96: // only used for timestamp, which uses unsigned values + return SortOrder::UNSIGNED; + } + return SortOrder::UNKNOWN; +} + +// Return the SortOrder of the Parquet Types using Logical or Physical Types +SortOrder::type GetSortOrder(LogicalType::type converted, Type::type primitive) { + if (converted == LogicalType::NONE) return DefaultSortOrder(primitive); + switch (converted) { + case LogicalType::INT_8: + case LogicalType::INT_16: + case LogicalType::INT_32: + case LogicalType::INT_64: + case LogicalType::DATE: + case LogicalType::TIME_MICROS: + case LogicalType::TIME_MILLIS: + case LogicalType::TIMESTAMP_MICROS: + case LogicalType::TIMESTAMP_MILLIS: + return SortOrder::SIGNED; + case LogicalType::UINT_8: + case LogicalType::UINT_16: + case LogicalType::UINT_32: + case LogicalType::UINT_64: + case LogicalType::ENUM: + case LogicalType::UTF8: + case LogicalType::BSON: + case LogicalType::JSON: + return SortOrder::UNSIGNED; + case LogicalType::DECIMAL: + case LogicalType::LIST: + case LogicalType::MAP: + case LogicalType::MAP_KEY_VALUE: + case LogicalType::INTERVAL: + case LogicalType::NONE: // required instead of default + case LogicalType::NA: // required instead of default + return SortOrder::UNKNOWN; + } + return SortOrder::UNKNOWN; +} + } // namespace parquet diff --git a/src/parquet/types.h b/src/parquet/types.h index 38015c4d..9a397163 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -116,6 +116,19 @@ struct PageType { enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; }; +// Reference: +// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/ +// format/converter/ParquetMetadataConverter.java +// Sort order for page and column statistics. Types are associated with sort +// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are +// aggregated using a sort order. As of parquet-format version 2.3.1, the +// order used to aggregate stats is always SIGNED and is not stored in the +// Parquet file. These stats are discarded for types that need unsigned. +// See PARQUET-686. +struct SortOrder { + enum type { SIGNED, UNSIGNED, UNKNOWN }; +}; + // ---------------------------------------------------------------------- struct ByteArray { @@ -283,6 +296,10 @@ PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* PARQUET_EXPORT int GetTypeByteSize(Type::type t); +SortOrder::type PARQUET_EXPORT DefaultSortOrder(Type::type primitive); + +SortOrder::type PARQUET_EXPORT GetSortOrder(LogicalType::type converted, Type::type primitive); + } // namespace parquet #endif // PARQUET_TYPES_H diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index 84019837..ac7ab141 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -45,7 +45,7 @@ static FLBA FLBAFromString(const std::string& s) { TEST(Comparison, ByteArray) { NodePtr node = PrimitiveNode::Make("bytearray", Repetition::REQUIRED, Type::BYTE_ARRAY); ColumnDescriptor descr(node, 0, 0); - Compare less(&descr); + CompareDefault less; std::string a = "arrange"; std::string b = "arrangement"; @@ -71,7 +71,7 @@ TEST(Comparison, FLBA) { PrimitiveNode::Make("FLBA", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, static_cast(a.size())); ColumnDescriptor descr(node, 0, 0); - Compare less(&descr); + CompareDefault less(descr.type_length()); ASSERT_TRUE(less(arr1, arr2)); } @@ -80,7 +80,7 @@ TEST(Comparison, Int96) { NodePtr node = PrimitiveNode::Make("int96", Repetition::REQUIRED, Type::INT96); ColumnDescriptor descr(node, 0, 0); - Compare less(&descr); + CompareDefault less; ASSERT_TRUE(less(a, b)); b.value[2] = 14; ASSERT_TRUE(!less(a, b) && !less(b, a)); diff --git a/src/parquet/util/comparison.cc b/src/parquet/util/comparison.cc new file mode 100644 index 00000000..5412d18e --- /dev/null +++ b/src/parquet/util/comparison.cc @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "parquet/util/comparison.h" + +namespace parquet { + + std::shared_ptr Compare::getComparator(const ColumnDescriptor* descr){ + if (SortOrder::SIGNED == GetSortOrder(descr->logical_type(), descr->physical_type())) { + switch (descr->physical_type()) { + case Type::BOOLEAN: + return std::make_shared(); + case Type::INT32: + return std::make_shared(); + case Type::INT64: + return std::make_shared(); + case Type::INT96: + return std::make_shared(); + case Type::FLOAT: + return std::make_shared(); + case Type::DOUBLE: + return std::make_shared(); + case Type::BYTE_ARRAY: + return std::make_shared(); + case Type::FIXED_LEN_BYTE_ARRAY: + return std::make_shared(descr->type_length()); + default: + ParquetException::NYI("Signed Compare not implemented"); + } + } else if (SortOrder::UNSIGNED == GetSortOrder(descr->logical_type(), descr->physical_type())) { + switch (descr->physical_type()) { + case Type::INT32: + return std::make_shared(); + case Type::INT64: + return std::make_shared(); + case Type::INT96: + return std::make_shared(); + case Type::BYTE_ARRAY: + return std::make_shared(); + case Type::FIXED_LEN_BYTE_ARRAY: + return std::make_shared(descr->type_length()); + default: + ParquetException::NYI("UnSigned Compare not implemented"); + } + } else { + throw ParquetException("UNKNOWN Sort Order"); + } + return nullptr; + } + +template class PARQUET_TEMPLATE_EXPORT CompareDefault; +template class PARQUET_TEMPLATE_EXPORT CompareDefault; +template class PARQUET_TEMPLATE_EXPORT CompareDefault; +template class PARQUET_TEMPLATE_EXPORT CompareDefault; +template class PARQUET_TEMPLATE_EXPORT CompareDefault; +template class PARQUET_TEMPLATE_EXPORT CompareDefault; +template class PARQUET_TEMPLATE_EXPORT CompareDefault; +template class PARQUET_TEMPLATE_EXPORT CompareDefault; + +} // namespace parquet diff --git a/src/parquet/util/comparison.h b/src/parquet/util/comparison.h index edd3df13..b7424f57 100644 --- a/src/parquet/util/comparison.h +++ b/src/parquet/util/comparison.h @@ -20,40 +20,150 @@ #include +#include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" namespace parquet { -template -struct Compare { - explicit Compare(const ColumnDescriptor* descr) : type_length_(descr->type_length()) {} +class PARQUET_EXPORT Compare { + public: + virtual ~Compare() {} + static std::shared_ptr getComparator(const ColumnDescriptor* descr); +}; + +// The default comparison is SIGNED +template +class PARQUET_EXPORT CompareDefault : public Compare { + public: + typedef typename DType::c_type T; + CompareDefault() {} + virtual ~CompareDefault() {} + virtual bool operator()(const T& a, const T& b) { + return a < b; + } +}; + +template<> +class PARQUET_EXPORT CompareDefault : public Compare { + public: + CompareDefault() {} + virtual ~CompareDefault() {} + virtual bool operator()(const Int96& a, const Int96& b) { + const int32_t* aptr = reinterpret_cast(&a.value[0]); + const int32_t* bptr = reinterpret_cast(&b.value[0]); + return std::lexicographical_compare(aptr, aptr + 3, bptr, bptr + 3); + } +}; - inline bool operator()(const T& a, const T& b) { return a < b; } +template<> +class PARQUET_EXPORT CompareDefault : public Compare { + public: + CompareDefault() {} + virtual ~CompareDefault() {} + virtual bool operator()(const ByteArray& a, const ByteArray& b) { + const int8_t* aptr = reinterpret_cast(a.ptr); + const int8_t* bptr = reinterpret_cast(b.ptr); + return std::lexicographical_compare( + aptr, aptr + a.len, bptr, bptr + b.len); + } +}; - private: +template<> +class PARQUET_EXPORT CompareDefault : public Compare { + public: + explicit CompareDefault(int length) : type_length_(length) {} + virtual ~CompareDefault() {} + virtual bool operator()(const FLBA& a, const FLBA& b) { + const int8_t* aptr = reinterpret_cast(a.ptr); + const int8_t* bptr = reinterpret_cast(b.ptr); + return std::lexicographical_compare( + aptr, aptr + type_length_, bptr, bptr + type_length_); + } int32_t type_length_; }; -template <> -inline bool Compare::operator()(const Int96& a, const Int96& b) { - return std::lexicographical_compare(a.value, a.value + 3, b.value, b.value + 3); -} - -template <> -inline bool Compare::operator()(const ByteArray& a, const ByteArray& b) { - auto aptr = reinterpret_cast(a.ptr); - auto bptr = reinterpret_cast(b.ptr); - return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len); -} - -template <> -inline bool Compare::operator()(const FLBA& a, const FLBA& b) { - auto aptr = reinterpret_cast(a.ptr); - auto bptr = reinterpret_cast(b.ptr); - return std::lexicographical_compare(aptr, aptr + type_length_, bptr, - bptr + type_length_); -} +typedef CompareDefault CompareDefaultBoolean; +typedef CompareDefault CompareDefaultInt32; +typedef CompareDefault CompareDefaultInt64; +typedef CompareDefault CompareDefaultInt96; +typedef CompareDefault CompareDefaultFloat; +typedef CompareDefault CompareDefaultDouble; +typedef CompareDefault CompareDefaultByteArray; +typedef CompareDefault CompareDefaultFLBA; + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + +// Define UnSigned Comparators +class PARQUET_EXPORT CompareUnSignedInt32 : public CompareDefaultInt32 { + public: + virtual ~CompareUnSignedInt32() {} + bool operator()(const int32_t& a, const int32_t& b) override { + const uint32_t ua = a; + const uint32_t ub = b; + return (ua < ub); + } +}; + +class PARQUET_EXPORT CompareUnSignedInt64 : public CompareDefaultInt64 { + public: + virtual ~CompareUnSignedInt64() {} + bool operator()(const int64_t& a, const int64_t& b) override { + const uint64_t ua = a; + const uint64_t ub = b; + return (ua < ub); + } +}; + + +class PARQUET_EXPORT CompareUnSignedInt96 : public CompareDefaultInt96 { + public: + virtual ~CompareUnSignedInt96() {} + bool operator()(const Int96& a, const Int96& b) override { + const uint32_t* aptr = reinterpret_cast(&a.value[0]); + const uint32_t* bptr = reinterpret_cast(&b.value[0]); + return std::lexicographical_compare(aptr, aptr + 3, bptr, bptr + 3); + } +}; + +class PARQUET_EXPORT CompareUnSignedByteArray : public CompareDefaultByteArray { + public: + virtual ~CompareUnSignedByteArray() {} + bool operator()(const ByteArray& a, const ByteArray& b) override { + const uint8_t* aptr = reinterpret_cast(a.ptr); + const uint8_t* bptr = reinterpret_cast(b.ptr); + return std::lexicographical_compare( + aptr, aptr + a.len, bptr, bptr + b.len); + } +}; + +class PARQUET_EXPORT CompareUnSignedFLBA : public CompareDefaultFLBA { + public: + explicit CompareUnSignedFLBA(int length) : CompareDefaultFLBA(length) {} + virtual ~CompareUnSignedFLBA() {} + bool operator()(const FLBA& a, const FLBA& b) override { + const uint8_t* aptr = reinterpret_cast(a.ptr); + const uint8_t* bptr = reinterpret_cast(b.ptr); + return std::lexicographical_compare( + aptr, aptr + type_length_, bptr, bptr + type_length_); + } +}; } // namespace parquet From 42875654dc53063f93a60b74bc9f98b3e7e57c59 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Tue, 8 Aug 2017 10:08:47 -0400 Subject: [PATCH 02/21] Fix reader read new max and min values --- src/parquet/file/metadata.cc | 35 ++++++++++++++++++++++++----------- src/parquet/file/metadata.h | 3 ++- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc index 1aed6d60..a31a361b 100644 --- a/src/parquet/file/metadata.cc +++ b/src/parquet/file/metadata.cc @@ -35,10 +35,19 @@ const ApplicationVersion ApplicationVersion::PARQUET_251_FIXED_VERSION = ApplicationVersion("parquet-mr version 1.8.0"); const ApplicationVersion ApplicationVersion::PARQUET_816_FIXED_VERSION = ApplicationVersion("parquet-mr version 1.2.9"); +const ApplicationVersion ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION = + ApplicationVersion("parquet-cpp version 1.2.1"); template static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { + if (metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value) { + return std::make_shared>( + descr, metadata.statistics.min_value, metadata.statistics.max_value, + metadata.num_values - metadata.statistics.null_count, + metadata.statistics.null_count, metadata.statistics.distinct_count, + true); + } return std::make_shared>( descr, metadata.statistics.min, metadata.statistics.max, metadata.num_values - metadata.statistics.null_count, @@ -107,9 +116,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline bool is_stats_set() const { DCHECK(writer_version_ != nullptr); return column_->meta_data.__isset.statistics && - writer_version_->HasCorrectStatistics(type()) && - SortOrder::SIGNED == - GetSortOrder(descr_->logical_type(), descr_->physical_type()); + writer_version_->HasCorrectStatistics(type(), + GetSortOrder(descr_->logical_type(), descr_->physical_type())); } inline std::shared_ptr statistics() const { @@ -482,15 +490,20 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons // Reference: // parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java // PARQUET-686 has more disussion on statistics -bool ApplicationVersion::HasCorrectStatistics(Type::type col_type) const { - // None of the current tools write INT96 Statistics correctly - if (col_type == Type::INT96) return false; - - // Statistics of other types are OK - if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) { - return true; +bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, SortOrder::type sort_order) const { + // Parquet cpp version 1.2.1 onwards stats are computed correctly for all types + if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION))) { + // Only SIGNED are valid + if (SortOrder::SIGNED != sort_order) return false; + + // None of the current tools write INT96 Statistics correctly + if (col_type == Type::INT96) return false; + + // Statistics of other types are OK + if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) { + return true; + } } - // created_by is not populated, which could have been caused by // parquet-mr during the same time as PARQUET-251, see PARQUET-297 if (application_ == "unknown") { diff --git a/src/parquet/file/metadata.h b/src/parquet/file/metadata.h index 986e9c4d..0844e467 100644 --- a/src/parquet/file/metadata.h +++ b/src/parquet/file/metadata.h @@ -40,6 +40,7 @@ class ApplicationVersion { // Known Versions with Issues static const ApplicationVersion PARQUET_251_FIXED_VERSION; static const ApplicationVersion PARQUET_816_FIXED_VERSION; + static const ApplicationVersion PARQUET_CPP_FIXED_STATS_VERSION; // Regular expression for the version format // major . minor . patch unknown - prerelease.x + build info // Eg: 1.5.0ab-cdh5.5.0+cd @@ -81,7 +82,7 @@ class ApplicationVersion { bool VersionEq(const ApplicationVersion& other_version) const; // Checks if the Version has the correct statistics for a given column - bool HasCorrectStatistics(Type::type primitive) const; + bool HasCorrectStatistics(Type::type primitive, SortOrder::type sort_order) const; }; class PARQUET_EXPORT ColumnChunkMetaData { From c4b18277a98c16196406c666ac67241874019c0f Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Tue, 8 Aug 2017 11:24:22 -0400 Subject: [PATCH 03/21] extend testing fix tests --- src/parquet/file/file-metadata-test.cc | 10 +- src/parquet/statistics-test.cc | 45 +++++++ src/parquet/util/comparison-test.cc | 173 ++++++++++++++++++++----- src/parquet/util/comparison.cc | 12 +- src/parquet/util/comparison.h | 24 ++-- 5 files changed, 211 insertions(+), 53 deletions(-) diff --git a/src/parquet/file/file-metadata-test.cc b/src/parquet/file/file-metadata-test.cc index 5291dbf8..318b282a 100644 --- a/src/parquet/file/file-metadata-test.cc +++ b/src/parquet/file/file-metadata-test.cc @@ -215,11 +215,11 @@ TEST(ApplicationVersion, Basics) { ASSERT_EQ(true, version.VersionLt(version1)); - ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96)); - ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32)); - ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY)); - ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY)); - ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY)); + ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, SortOrder::SIGNED)); + ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, SortOrder::SIGNED)); + ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED)); + ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED)); + ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED)); } } // namespace metadata diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc index 26352c1d..e36a33f1 100644 --- a/src/parquet/statistics-test.cc +++ b/src/parquet/statistics-test.cc @@ -356,5 +356,50 @@ TEST(CorruptStatistics, Basics) { ASSERT_FALSE(column_chunk6->is_stats_set()); } +TEST(CorrectStatistics, Basics) { + ApplicationVersion version("parquet-cpp version 1.2.1"); + SchemaDescriptor schema; + schema::NodePtr node; + std::vector fields; + // Test Physical Types + fields.push_back(schema::PrimitiveNode::Make("col1", Repetition::OPTIONAL, Type::INT32, + LogicalType::NONE)); + fields.push_back(schema::PrimitiveNode::Make("col2", Repetition::OPTIONAL, + Type::BYTE_ARRAY, LogicalType::NONE)); + // Test Logical Types + fields.push_back(schema::PrimitiveNode::Make("col3", Repetition::OPTIONAL, Type::INT32, + LogicalType::DATE)); + fields.push_back(schema::PrimitiveNode::Make("col4", Repetition::OPTIONAL, Type::INT32, + LogicalType::UINT_32)); + fields.push_back(schema::PrimitiveNode::Make("col5", Repetition::OPTIONAL, + Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::INTERVAL, 12)); + fields.push_back(schema::PrimitiveNode::Make("col6", Repetition::OPTIONAL, + Type::BYTE_ARRAY, LogicalType::UTF8)); + node = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields); + schema.Init(node); + + format::ColumnChunk col_chunk; + col_chunk.meta_data.__isset.statistics = true; + auto column_chunk1 = ColumnChunkMetaData::Make( + reinterpret_cast(&col_chunk), schema.Column(0), &version); + ASSERT_TRUE(column_chunk1->is_stats_set()); + auto column_chunk2 = ColumnChunkMetaData::Make( + reinterpret_cast(&col_chunk), schema.Column(1), &version); + ASSERT_TRUE(column_chunk2->is_stats_set()); + auto column_chunk3 = ColumnChunkMetaData::Make( + reinterpret_cast(&col_chunk), schema.Column(2), &version); + ASSERT_TRUE(column_chunk3->is_stats_set()); + auto column_chunk4 = ColumnChunkMetaData::Make( + reinterpret_cast(&col_chunk), schema.Column(3), &version); + ASSERT_TRUE(column_chunk4->is_stats_set()); + auto column_chunk5 = ColumnChunkMetaData::Make( + reinterpret_cast(&col_chunk), schema.Column(4), &version); + ASSERT_TRUE(column_chunk5->is_stats_set()); + auto column_chunk6 = ColumnChunkMetaData::Make( + reinterpret_cast(&col_chunk), schema.Column(5), &version); + ASSERT_TRUE(column_chunk6->is_stats_set()); +} + } // namespace test } // namespace parquet diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index ac7ab141..c0ebc088 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -42,50 +42,163 @@ static FLBA FLBAFromString(const std::string& s) { return FLBA(ptr); } -TEST(Comparison, ByteArray) { - NodePtr node = PrimitiveNode::Make("bytearray", Repetition::REQUIRED, Type::BYTE_ARRAY); +TEST(Comparison, signedByteArray) { + NodePtr node = PrimitiveNode::Make("SignedByteArray", Repetition::REQUIRED, Type::BYTE_ARRAY); ColumnDescriptor descr(node, 0, 0); - CompareDefault less; - - std::string a = "arrange"; - std::string b = "arrangement"; - auto arr1 = ByteArrayFromString(a); - auto arr2 = ByteArrayFromString(b); - ASSERT_TRUE(less(arr1, arr2)); - - a = u8"braten"; - b = u8"bügeln"; - auto arr3 = ByteArrayFromString(a); - auto arr4 = ByteArrayFromString(b); - // see PARQUET-686 discussion about binary comparison - ASSERT_TRUE(!less(arr3, arr4)); + + CompareDefaultByteArray less; + + std::string s1 = "12345"; + std::string s2 = "12345678"; + ByteArray s1ba = ByteArrayFromString(s1); + ByteArray s2ba = ByteArrayFromString(s2); + ASSERT_TRUE(less(s1ba, s2ba)); + + // This is case where signed comparision UTF-8 (PARQUET-686) is incorrect + // This example is to only check signed comparison and not UTF-8. + s1 = u8"bügeln"; + s2 = u8"braten"; + s1ba = ByteArrayFromString(s1); + s2ba = ByteArrayFromString(s2); + ASSERT_TRUE(less(s1ba, s2ba)); } -TEST(Comparison, FLBA) { - std::string a = "Antidisestablishmentarianism"; - std::string b = "Bundesgesundheitsministerium"; - auto arr1 = FLBAFromString(a); - auto arr2 = FLBAFromString(b); +TEST(Comparison, UnsignedByteArray) { + NodePtr node = PrimitiveNode::Make("UnsignedByteArray", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::UTF8); + ColumnDescriptor descr(node, 0, 0); + + // Check if UTF-8 is compared using unsigned correctly + CompareUnsignedByteArray uless; + + std::string s1 = "arrange"; + std::string s2 = "arrangement"; + ByteArray s1ba = ByteArrayFromString(s1); + ByteArray s2ba = ByteArrayFromString(s2); + ASSERT_TRUE(uless(s1ba, s2ba)); + + s1 = u8"braten"; + s2 = u8"bügeln"; + s1ba = ByteArrayFromString(s1); + s2ba = ByteArrayFromString(s2); + ASSERT_TRUE(uless(s1ba, s2ba)); +} + +TEST(Comparison, SignedFLBA) { + int size = 10; + NodePtr node = + PrimitiveNode::Make("SignedFLBA", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::NONE, size); + ColumnDescriptor descr(node, 0, 0); + + CompareDefaultFLBA less(descr.type_length()); + + std::string s1 = "Anti123456"; + std::string s2 = "Bunkd123456"; + FLBA s1flba = FLBAFromString(s1); + FLBA s2flba = FLBAFromString(s2); + ASSERT_TRUE(less(s1flba, s2flba)); + s1 = "Bünk123456"; + s2 = "Bunk123456"; + s1flba = FLBAFromString(s1); + s2flba = FLBAFromString(s2); + ASSERT_TRUE(less(s1flba, s2flba)); +} + +TEST(Comparison, UnsignedFLBA) { + int size = 10; NodePtr node = - PrimitiveNode::Make("FLBA", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::NONE, static_cast(a.size())); + PrimitiveNode::Make("UnsignedFLBA", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::NONE, size); ColumnDescriptor descr(node, 0, 0); - CompareDefault less(descr.type_length()); - ASSERT_TRUE(less(arr1, arr2)); + + CompareUnsignedFLBA uless(descr.type_length()); + + std::string s1 = "Anti123456"; + std::string s2 = "Bunkd123456"; + FLBA s1flba = FLBAFromString(s1); + FLBA s2flba = FLBAFromString(s2); + ASSERT_TRUE(uless(s1flba, s2flba)); + + s1 = "Bunk123456"; + s2 = "Bünk123456"; + s1flba = FLBAFromString(s1); + s2flba = FLBAFromString(s2); + ASSERT_TRUE(uless(s1flba, s2flba)); } -TEST(Comparison, Int96) { +TEST(Comparison, SignedInt96) { parquet::Int96 a{{1, 41, 14}}, b{{1, 41, 42}}; + parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, 14}}; + parquet::Int96 aaa{{static_cast(-1), 41, 14}}, bbb{{1, 41, 42}}; + + NodePtr node = PrimitiveNode::Make("SignedInt96", Repetition::REQUIRED, Type::INT96); + ColumnDescriptor descr(node, 0, 0); + + CompareDefaultInt96 less; + + ASSERT_TRUE(less(a, b)); + ASSERT_TRUE(!less(aa, bb) && !less(bb, aa)); + ASSERT_TRUE(less(aaa, bbb)); +} - NodePtr node = PrimitiveNode::Make("int96", Repetition::REQUIRED, Type::INT96); +TEST(Comparison, UnsignedInt96) { + parquet::Int96 a{{1, 41, 14}}, b{{1, static_cast(-41), 42}}; + parquet::Int96 aa{{1, 41, 14}}, bb{{static_cast(-1), 41, 14}}; + + NodePtr node = PrimitiveNode::Make("UnsignedInt96", Repetition::REQUIRED, Type::INT96); + ColumnDescriptor descr(node, 0, 0); + + CompareUnsignedInt96 uless; + + ASSERT_TRUE(uless(a, b)); + ASSERT_TRUE(uless(aa, bb)); +} + +TEST(Comparison, SignedInt64) { + int64_t a = 1, b = 4; + int64_t aa = 1, bb = 1; + int64_t aaa = -1, bbb = 1; + + NodePtr node = PrimitiveNode::Make("SignedInt64", Repetition::REQUIRED, Type::INT64); + ColumnDescriptor descr(node, 0, 0); + + CompareDefaultInt64 less; + + ASSERT_TRUE(less(a, b)); + ASSERT_TRUE(!less(aa, bb) && !less(bb, aa)); + ASSERT_TRUE(less(aaa, bbb)); +} + +TEST(Comparison, UnsignedInt64) { + uint64_t a = 1, b = 4; + uint64_t aa = 1, bb = 1; + uint64_t aaa = 1, bbb = -1; + + NodePtr node = PrimitiveNode::Make("UnsignedInt64", Repetition::REQUIRED, Type::INT64); ColumnDescriptor descr(node, 0, 0); - CompareDefault less; + + CompareUnsignedInt64 less; + ASSERT_TRUE(less(a, b)); - b.value[2] = 14; - ASSERT_TRUE(!less(a, b) && !less(b, a)); + ASSERT_TRUE(!less(aa, bb) && !less(bb, aa)); + ASSERT_TRUE(less(aaa, bbb)); } +TEST(Comparison, UnsignedInt32) { + uint32_t a = 1, b = 4; + uint32_t aa = 1, bb = 1; + uint32_t aaa = 1, bbb = -1; + + NodePtr node = PrimitiveNode::Make("UnsignedInt32", Repetition::REQUIRED, Type::INT32); + ColumnDescriptor descr(node, 0, 0); + + CompareUnsignedInt32 less; + + ASSERT_TRUE(less(a, b)); + ASSERT_TRUE(!less(aa, bb) && !less(bb, aa)); + ASSERT_TRUE(less(aaa, bbb)); +} } // namespace test } // namespace parquet diff --git a/src/parquet/util/comparison.cc b/src/parquet/util/comparison.cc index 5412d18e..164b101b 100644 --- a/src/parquet/util/comparison.cc +++ b/src/parquet/util/comparison.cc @@ -49,17 +49,17 @@ namespace parquet { } else if (SortOrder::UNSIGNED == GetSortOrder(descr->logical_type(), descr->physical_type())) { switch (descr->physical_type()) { case Type::INT32: - return std::make_shared(); + return std::make_shared(); case Type::INT64: - return std::make_shared(); + return std::make_shared(); case Type::INT96: - return std::make_shared(); + return std::make_shared(); case Type::BYTE_ARRAY: - return std::make_shared(); + return std::make_shared(); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared(descr->type_length()); + return std::make_shared(descr->type_length()); default: - ParquetException::NYI("UnSigned Compare not implemented"); + ParquetException::NYI("Unsigned Compare not implemented"); } } else { throw ParquetException("UNKNOWN Sort Order"); diff --git a/src/parquet/util/comparison.h b/src/parquet/util/comparison.h index b7424f57..17016da2 100644 --- a/src/parquet/util/comparison.h +++ b/src/parquet/util/comparison.h @@ -110,10 +110,10 @@ PARQUET_EXTERN_TEMPLATE CompareDefault; #pragma GCC diagnostic pop #endif -// Define UnSigned Comparators -class PARQUET_EXPORT CompareUnSignedInt32 : public CompareDefaultInt32 { +// Define Unsigned Comparators +class PARQUET_EXPORT CompareUnsignedInt32 : public CompareDefaultInt32 { public: - virtual ~CompareUnSignedInt32() {} + virtual ~CompareUnsignedInt32() {} bool operator()(const int32_t& a, const int32_t& b) override { const uint32_t ua = a; const uint32_t ub = b; @@ -121,9 +121,9 @@ class PARQUET_EXPORT CompareUnSignedInt32 : public CompareDefaultInt32 { } }; -class PARQUET_EXPORT CompareUnSignedInt64 : public CompareDefaultInt64 { +class PARQUET_EXPORT CompareUnsignedInt64 : public CompareDefaultInt64 { public: - virtual ~CompareUnSignedInt64() {} + virtual ~CompareUnsignedInt64() {} bool operator()(const int64_t& a, const int64_t& b) override { const uint64_t ua = a; const uint64_t ub = b; @@ -132,9 +132,9 @@ class PARQUET_EXPORT CompareUnSignedInt64 : public CompareDefaultInt64 { }; -class PARQUET_EXPORT CompareUnSignedInt96 : public CompareDefaultInt96 { +class PARQUET_EXPORT CompareUnsignedInt96 : public CompareDefaultInt96 { public: - virtual ~CompareUnSignedInt96() {} + virtual ~CompareUnsignedInt96() {} bool operator()(const Int96& a, const Int96& b) override { const uint32_t* aptr = reinterpret_cast(&a.value[0]); const uint32_t* bptr = reinterpret_cast(&b.value[0]); @@ -142,9 +142,9 @@ class PARQUET_EXPORT CompareUnSignedInt96 : public CompareDefaultInt96 { } }; -class PARQUET_EXPORT CompareUnSignedByteArray : public CompareDefaultByteArray { +class PARQUET_EXPORT CompareUnsignedByteArray : public CompareDefaultByteArray { public: - virtual ~CompareUnSignedByteArray() {} + virtual ~CompareUnsignedByteArray() {} bool operator()(const ByteArray& a, const ByteArray& b) override { const uint8_t* aptr = reinterpret_cast(a.ptr); const uint8_t* bptr = reinterpret_cast(b.ptr); @@ -153,10 +153,10 @@ class PARQUET_EXPORT CompareUnSignedByteArray : public CompareDefaultByteArray { } }; -class PARQUET_EXPORT CompareUnSignedFLBA : public CompareDefaultFLBA { +class PARQUET_EXPORT CompareUnsignedFLBA : public CompareDefaultFLBA { public: - explicit CompareUnSignedFLBA(int length) : CompareDefaultFLBA(length) {} - virtual ~CompareUnSignedFLBA() {} + explicit CompareUnsignedFLBA(int length) : CompareDefaultFLBA(length) {} + virtual ~CompareUnsignedFLBA() {} bool operator()(const FLBA& a, const FLBA& b) override { const uint8_t* aptr = reinterpret_cast(a.ptr); const uint8_t* bptr = reinterpret_cast(b.ptr); From 3df2686a00cdcfe179979a3e107c042505f3611f Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Tue, 8 Aug 2017 18:45:04 -0400 Subject: [PATCH 04/21] Add Reader Writer Statistics Test --- src/parquet/CMakeLists.txt | 1 + src/parquet/parquet_reader_writer-test.cc | 287 ++++++++++++++++++++++ 2 files changed, 288 insertions(+) create mode 100644 src/parquet/parquet_reader_writer-test.cc diff --git a/src/parquet/CMakeLists.txt b/src/parquet/CMakeLists.txt index a2e283e2..d10759bc 100644 --- a/src/parquet/CMakeLists.txt +++ b/src/parquet/CMakeLists.txt @@ -52,6 +52,7 @@ ADD_PARQUET_TEST(column_writer-test) ADD_PARQUET_TEST(properties-test) ADD_PARQUET_TEST(statistics-test) ADD_PARQUET_TEST(encoding-test) +ADD_PARQUET_TEST(parquet_reader_writer-test) ADD_PARQUET_TEST(public-api-test) ADD_PARQUET_TEST(types-test) ADD_PARQUET_TEST(reader-test) diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc new file mode 100644 index 00000000..5f187f46 --- /dev/null +++ b/src/parquet/parquet_reader_writer-test.cc @@ -0,0 +1,287 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include +#include + +#include + +namespace parquet { + +using parquet::Repetition; +using parquet::Type; +using parquet::LogicalType; +using parquet::schema::PrimitiveNode; +using parquet::schema::GroupNode; + +static int FLBA_LENGTH = 12; +static int NUM_VALUES = 10; + +namespace test { + +template +class TestStatistics : public ::testing::Test { + public: + typedef typename TestType::c_type T; + + void AddNodes(std::string name); + + void SetUpSchema() { + stats_.resize(fields_.size()); + values_.resize(NUM_VALUES); + schema_ = std::static_pointer_cast( + GroupNode::Make("Schema", Repetition::REQUIRED, fields_)); + + parquet_sink_ = std::make_shared(); + } + + void SetValues(); + + void WriteParquet() { + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + builder.created_by("parquet-cpp version 1.3.0"); + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + auto file_writer = parquet::ParquetFileWriter::Open(parquet_sink_, schema_, props); + + // Append a RowGroup with a specific number of rows. + auto rg_writer = file_writer->AppendRowGroup(NUM_VALUES); + + this->SetValues(); + + // Insert Values + for (size_t i = 0; i < fields_.size(); i++) { + auto column_writer = static_cast*>(rg_writer->NextColumn()); + column_writer->WriteBatch(NUM_VALUES, nullptr, nullptr, values_.data()); + } + + } + + void VerifyParquetStats() { + auto pbuffer = parquet_sink_->GetBuffer(); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::Open(std::make_shared(pbuffer)); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + std::shared_ptr rg_metadata = file_metadata->RowGroup(0); + for (size_t i = 0; i < fields_.size(); i++) { + std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(i); + ASSERT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); + ASSERT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); + } + } + + protected: + std::vector values_; + std::vector values_buf_; + std::vector fields_; + std::shared_ptr schema_; + std::shared_ptr parquet_sink_; + std::vector stats_; +}; + +using TestTypes = ::testing::Types; + +// TYPE::INT32 +template <> +void TestStatistics::AddNodes(std::string name) { + // UINT_32 logical type to set Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, + LogicalType::UINT_32)); + // INT_32 logical type to set Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, + LogicalType::INT_32)); +} + +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + } + + // Write UINT32 values and min,max + stats_[0].set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); + + // Write INT32 values and min,max + stats_[1].set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); +} + +// TYPE::INT64 +template <> +void TestStatistics::AddNodes(std::string name) { + // UINT_64 logical type to set Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, + LogicalType::UINT_64)); + // INT_64 logical type to set Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, + LogicalType::INT_64)); +} + +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + } + + // Write UINT64 values and min,max + stats_[0].set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); + + // Write INT64 values and min,max + stats_[1].set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); +} + +// TYPE::INT96 +template <> +void TestStatistics::AddNodes(std::string name) { + // INT96 physical type has only Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT96, + LogicalType::NONE)); +} + +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i].value[0] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + values_[i].value[1] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + values_[i].value[2] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + } + + // Write Int96 values and min,max + stats_[0].set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); +} + +// TYPE::FLOAT +template <> +void TestStatistics::AddNodes(std::string name) { + // Float physical type has only Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::FLOAT, + LogicalType::NONE)); +} + +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i] = (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; + } + + // Write Float values and min,max + stats_[0].set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); +} + +// TYPE::DOUBLE +template <> +void TestStatistics::AddNodes(std::string name) { + // Double physical type has only Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::DOUBLE, + LogicalType::NONE)); +} + +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i] = (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; + } + + // Write Double values and min,max + stats_[0].set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); +} + +// TYPE::ByteArray +template <> +void TestStatistics::AddNodes(std::string name) { + // UTF8 logical type to set Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::BYTE_ARRAY, + LogicalType::UTF8)); +} + +template <> +void TestStatistics::SetValues() { + int max_byte_array_len = 4; + size_t nbytes = NUM_VALUES * max_byte_array_len; + values_buf_.resize(nbytes); + std::string vals[NUM_VALUES] = {u8"c123", u8"b123", u8"a123", u8"d123", u8"e123", + u8"f123", u8"g123", u8"h123", u8"i123", u8"üa123"}; + + for (int i = 0; i < NUM_VALUES; i++) { + uint8_t *base = &values_buf_.data()[0] + (i * max_byte_array_len); + memcpy(base, vals[i].c_str(), max_byte_array_len); + values_[i].ptr = base; + values_[i].len = max_byte_array_len; + } + + // Write String values and min,max + stats_[0].set_min(std::string(reinterpret_cast(vals[2].c_str()), max_byte_array_len)) + .set_max(std::string(reinterpret_cast(vals[9].c_str()), max_byte_array_len)); +} + +// TYPE::FLBAArray +template <> +void TestStatistics::AddNodes(std::string name) { + // FLBA has only Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::NONE, FLBA_LENGTH)); +} + +template <> +void TestStatistics::SetValues() { + size_t nbytes = NUM_VALUES * FLBA_LENGTH; + values_buf_.resize(nbytes); + std::string vals[NUM_VALUES] = {u8"a12345", u8"a123", u8"c123", u8"d123", u8"e123", + u8"f123", u8"g123", u8"h123", u8"üa123", u8"üa12345"}; + + for (int i = 0; i < NUM_VALUES; i++) { + uint8_t *base = &values_buf_.data()[0] + (i * FLBA_LENGTH); + memcpy(base, vals[i].c_str(), FLBA_LENGTH); + values_[i].ptr = base; + } + + // Write String values and min,max + stats_[0].set_min(std::string(reinterpret_cast(vals[1].c_str()), FLBA_LENGTH)) + .set_max(std::string(reinterpret_cast(vals[9].c_str()), FLBA_LENGTH)); +} + + + +TYPED_TEST_CASE(TestStatistics, TestTypes); + +TYPED_TEST(TestStatistics, MinMax) { + this->AddNodes("Column "); + this->SetUpSchema(); + this->WriteParquet(); + this->VerifyParquetStats(); +} + +} // namespace test +} // namespace parquet From c1abb69e4741c090b8c9d103612d4ca282c5c94e Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Wed, 9 Aug 2017 12:48:16 -0400 Subject: [PATCH 05/21] rename to Make --- src/parquet/column_writer-test.cc | 2 +- src/parquet/statistics.cc | 4 ++-- src/parquet/util/comparison-test.cc | 1 + src/parquet/util/comparison.cc | 2 +- src/parquet/util/comparison.h | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc index 2c3288a3..ee080768 100644 --- a/src/parquet/column_writer-test.cc +++ b/src/parquet/column_writer-test.cc @@ -152,7 +152,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); std::shared_ptr >compare; - compare = std::static_pointer_cast >(Compare::getComparator(this->descr_)); + compare = std::static_pointer_cast >(Compare::Make(this->descr_)); for (size_t i = 0; i < this->values_.size(); i++) { if ((*compare)(this->values_[i], this->values_out_[i]) || (*compare)(this->values_out_[i], this->values_[i])) { diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc index 43bddaa8..d6e61ef0 100644 --- a/src/parquet/statistics.cc +++ b/src/parquet/statistics.cc @@ -34,7 +34,7 @@ TypedRowGroupStatistics::TypedRowGroupStatistics(const ColumnDescriptor* : pool_(pool), min_buffer_(AllocateBuffer(pool_, 0)), max_buffer_(AllocateBuffer(pool_, 0)) { - comparator_ = std::static_pointer_cast >(Compare::getComparator(schema)); + comparator_ = std::static_pointer_cast >(Compare::Make(schema)); SetDescr(schema); Reset(); } @@ -69,7 +69,7 @@ TypedRowGroupStatistics::TypedRowGroupStatistics( IncrementNullCount(null_count); IncrementDistinctCount(distinct_count); - comparator_ = std::static_pointer_cast >(Compare::getComparator(schema)); + comparator_ = std::static_pointer_cast >(Compare::Make(schema)); SetDescr(schema); if (!encoded_min.empty()) { diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index c0ebc088..58e7b660 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -199,6 +199,7 @@ TEST(Comparison, UnsignedInt32) { ASSERT_TRUE(!less(aa, bb) && !less(bb, aa)); ASSERT_TRUE(less(aaa, bbb)); } + } // namespace test } // namespace parquet diff --git a/src/parquet/util/comparison.cc b/src/parquet/util/comparison.cc index 164b101b..06b6b6c5 100644 --- a/src/parquet/util/comparison.cc +++ b/src/parquet/util/comparison.cc @@ -24,7 +24,7 @@ namespace parquet { - std::shared_ptr Compare::getComparator(const ColumnDescriptor* descr){ + std::shared_ptr Compare::Make(const ColumnDescriptor* descr){ if (SortOrder::SIGNED == GetSortOrder(descr->logical_type(), descr->physical_type())) { switch (descr->physical_type()) { case Type::BOOLEAN: diff --git a/src/parquet/util/comparison.h b/src/parquet/util/comparison.h index 17016da2..94cf45ba 100644 --- a/src/parquet/util/comparison.h +++ b/src/parquet/util/comparison.h @@ -29,7 +29,7 @@ namespace parquet { class PARQUET_EXPORT Compare { public: virtual ~Compare() {} - static std::shared_ptr getComparator(const ColumnDescriptor* descr); + static std::shared_ptr Make(const ColumnDescriptor* descr); }; // The default comparison is SIGNED From d58658d04a37468ddbe906da6e9c0f5c3f67a8b9 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Wed, 9 Aug 2017 13:42:44 -0400 Subject: [PATCH 06/21] make format --- src/parquet/column_writer-test.cc | 5 +- src/parquet/column_writer.cc | 11 +- src/parquet/file/file-metadata-test.cc | 3 +- src/parquet/file/metadata.cc | 21 ++-- src/parquet/parquet_reader_writer-test.cc | 140 ++++++++++++---------- src/parquet/parquet_version.h.in | 2 +- src/parquet/statistics-test.cc | 5 +- src/parquet/statistics.cc | 15 ++- src/parquet/types.cc | 2 +- src/parquet/types.h | 3 +- src/parquet/util/comparison-test.cc | 16 +-- src/parquet/util/comparison.cc | 83 ++++++------- src/parquet/util/comparison.h | 29 ++--- 13 files changed, 177 insertions(+), 158 deletions(-) diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc index ee080768..fc3d7cc0 100644 --- a/src/parquet/column_writer-test.cc +++ b/src/parquet/column_writer-test.cc @@ -151,8 +151,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { void ReadAndCompare(Compression::type compression, int64_t num_rows) { this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); - std::shared_ptr >compare; - compare = std::static_pointer_cast >(Compare::Make(this->descr_)); + std::shared_ptr> compare; + compare = + std::static_pointer_cast>(Compare::Make(this->descr_)); for (size_t i = 0; i < this->values_.size(); i++) { if ((*compare)(this->values_[i], this->values_out_[i]) || (*compare)(this->values_out_[i], this->values_[i])) { diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index f0e85391..fef3676c 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -267,9 +267,10 @@ int64_t ColumnWriter::Close() { FlushBufferedDataPages(); EncodedStatistics chunk_statistics = GetChunkStatistics(); - if (chunk_statistics.is_set()) metadata_->SetStatistics( - SortOrder::SIGNED == GetSortOrder(descr_->logical_type(), - descr_->physical_type()), chunk_statistics); + if (chunk_statistics.is_set()) + metadata_->SetStatistics(SortOrder::SIGNED == GetSortOrder(descr_->logical_type(), + descr_->physical_type()), + chunk_statistics); pager_->Close(has_dictionary_, fallback_); } @@ -320,8 +321,8 @@ TypedColumnWriter::TypedColumnWriter(ColumnChunkMetaDataBuilder* metadata, } if (properties->statistics_enabled(descr_->path()) && - (SortOrder::UNKNOWN != GetSortOrder(descr_->logical_type(), - descr_->physical_type())) ) { + (SortOrder::UNKNOWN != + GetSortOrder(descr_->logical_type(), descr_->physical_type()))) { page_statistics_ = std::unique_ptr(new TypedStats(descr_, allocator_)); chunk_statistics_ = std::unique_ptr(new TypedStats(descr_, allocator_)); } diff --git a/src/parquet/file/file-metadata-test.cc b/src/parquet/file/file-metadata-test.cc index 318b282a..5bd84197 100644 --- a/src/parquet/file/file-metadata-test.cc +++ b/src/parquet/file/file-metadata-test.cc @@ -219,7 +219,8 @@ TEST(ApplicationVersion, Basics) { ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, SortOrder::SIGNED)); ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED)); ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED)); - ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED)); + ASSERT_TRUE( + version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED)); } } // namespace metadata diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc index a31a361b..7f17d0f2 100644 --- a/src/parquet/file/metadata.cc +++ b/src/parquet/file/metadata.cc @@ -45,8 +45,7 @@ static std::shared_ptr MakeTypedColumnStats( return std::make_shared>( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, - true); + metadata.statistics.null_count, metadata.statistics.distinct_count, true); } return std::make_shared>( descr, metadata.statistics.min, metadata.statistics.max, @@ -116,8 +115,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline bool is_stats_set() const { DCHECK(writer_version_ != nullptr); return column_->meta_data.__isset.statistics && - writer_version_->HasCorrectStatistics(type(), - GetSortOrder(descr_->logical_type(), descr_->physical_type())); + writer_version_->HasCorrectStatistics( + type(), GetSortOrder(descr_->logical_type(), descr_->physical_type())); } inline std::shared_ptr statistics() const { @@ -490,7 +489,8 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons // Reference: // parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java // PARQUET-686 has more disussion on statistics -bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, SortOrder::type sort_order) const { +bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, + SortOrder::type sort_order) const { // Parquet cpp version 1.2.1 onwards stats are computed correctly for all types if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION))) { // Only SIGNED are valid @@ -549,10 +549,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { stats.__isset.null_count = val.has_null_count; stats.__isset.distinct_count = val.has_distinct_count; if (is_signed) { - stats.max = val.max(); - stats.min = val.min(); - stats.__isset.min = val.has_min; - stats.__isset.max = val.has_max; + stats.max = val.max(); + stats.min = val.min(); + stats.__isset.min = val.has_min; + stats.__isset.max = val.has_max; } column_chunk_->meta_data.__set_statistics(stats); @@ -641,7 +641,8 @@ const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const { return impl_->descr(); } -void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed, const EncodedStatistics& result) { +void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed, + const EncodedStatistics& result) { impl_->SetStatistics(is_signed, result); } diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index 5f187f46..41bd9dba 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -48,7 +48,7 @@ class TestStatistics : public ::testing::Test { stats_.resize(fields_.size()); values_.resize(NUM_VALUES); schema_ = std::static_pointer_cast( - GroupNode::Make("Schema", Repetition::REQUIRED, fields_)); + GroupNode::Make("Schema", Repetition::REQUIRED, fields_)); parquet_sink_ = std::make_shared(); } @@ -72,10 +72,10 @@ class TestStatistics : public ::testing::Test { // Insert Values for (size_t i = 0; i < fields_.size(); i++) { - auto column_writer = static_cast*>(rg_writer->NextColumn()); + auto column_writer = + static_cast*>(rg_writer->NextColumn()); column_writer->WriteBatch(NUM_VALUES, nullptr, nullptr, values_.data()); } - } void VerifyParquetStats() { @@ -83,13 +83,15 @@ class TestStatistics : public ::testing::Test { // Create a ParquetReader instance std::unique_ptr parquet_reader = - parquet::ParquetFileReader::Open(std::make_shared(pbuffer)); + parquet::ParquetFileReader::Open( + std::make_shared(pbuffer)); // Get the File MetaData std::shared_ptr file_metadata = parquet_reader->metadata(); std::shared_ptr rg_metadata = file_metadata->RowGroup(0); for (size_t i = 0; i < fields_.size(); i++) { - std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(i); + std::shared_ptr cc_metadata = + rg_metadata->ColumnChunk(i); ASSERT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); ASSERT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); } @@ -110,120 +112,129 @@ using TestTypes = ::testing::Types void TestStatistics::AddNodes(std::string name) { - // UINT_32 logical type to set Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, - LogicalType::UINT_32)); - // INT_32 logical type to set Signed Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, - LogicalType::INT_32)); + // UINT_32 logical type to set Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, + LogicalType::UINT_32)); + // INT_32 logical type to set Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, + LogicalType::INT_32)); } template <> void TestStatistics::SetValues() { for (int i = 0; i < NUM_VALUES; i++) { - values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; } // Write UINT32 values and min,max - stats_[0].set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); // Write INT32 values and min,max - stats_[1].set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); + stats_[1] + .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); } // TYPE::INT64 template <> void TestStatistics::AddNodes(std::string name) { - // UINT_64 logical type to set Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, - LogicalType::UINT_64)); - // INT_64 logical type to set Signed Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, - LogicalType::INT_64)); + // UINT_64 logical type to set Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, + LogicalType::UINT_64)); + // INT_64 logical type to set Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, + LogicalType::INT_64)); } template <> void TestStatistics::SetValues() { for (int i = 0; i < NUM_VALUES; i++) { - values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; } // Write UINT64 values and min,max - stats_[0].set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); // Write INT64 values and min,max - stats_[1].set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); + stats_[1] + .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); } // TYPE::INT96 template <> void TestStatistics::AddNodes(std::string name) { - // INT96 physical type has only Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT96, - LogicalType::NONE)); + // INT96 physical type has only Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT96, + LogicalType::NONE)); } template <> void TestStatistics::SetValues() { for (int i = 0; i < NUM_VALUES; i++) { - values_[i].value[0] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; - values_[i].value[1] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; - values_[i].value[2] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + values_[i].value[0] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + values_[i].value[1] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + values_[i].value[2] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; } // Write Int96 values and min,max - stats_[0].set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); } // TYPE::FLOAT template <> void TestStatistics::AddNodes(std::string name) { - // Float physical type has only Signed Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::FLOAT, - LogicalType::NONE)); + // Float physical type has only Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::FLOAT, + LogicalType::NONE)); } template <> void TestStatistics::SetValues() { for (int i = 0; i < NUM_VALUES; i++) { - values_[i] = (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; + values_[i] = + (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; } // Write Float values and min,max - stats_[0].set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); } // TYPE::DOUBLE template <> void TestStatistics::AddNodes(std::string name) { - // Double physical type has only Signed Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::DOUBLE, - LogicalType::NONE)); + // Double physical type has only Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::DOUBLE, + LogicalType::NONE)); } template <> void TestStatistics::SetValues() { for (int i = 0; i < NUM_VALUES; i++) { - values_[i] = (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; + values_[i] = + (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; } // Write Double values and min,max - stats_[0].set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); } // TYPE::ByteArray template <> void TestStatistics::AddNodes(std::string name) { - // UTF8 logical type to set Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::BYTE_ARRAY, - LogicalType::UTF8)); + // UTF8 logical type to set Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, + Type::BYTE_ARRAY, LogicalType::UTF8)); } template <> @@ -235,45 +246,48 @@ void TestStatistics::SetValues() { u8"f123", u8"g123", u8"h123", u8"i123", u8"üa123"}; for (int i = 0; i < NUM_VALUES; i++) { - uint8_t *base = &values_buf_.data()[0] + (i * max_byte_array_len); + uint8_t* base = &values_buf_.data()[0] + (i * max_byte_array_len); memcpy(base, vals[i].c_str(), max_byte_array_len); values_[i].ptr = base; values_[i].len = max_byte_array_len; } // Write String values and min,max - stats_[0].set_min(std::string(reinterpret_cast(vals[2].c_str()), max_byte_array_len)) - .set_max(std::string(reinterpret_cast(vals[9].c_str()), max_byte_array_len)); + stats_[0] + .set_min( + std::string(reinterpret_cast(vals[2].c_str()), max_byte_array_len)) + .set_max(std::string(reinterpret_cast(vals[9].c_str()), + max_byte_array_len)); } // TYPE::FLBAArray template <> void TestStatistics::AddNodes(std::string name) { - // FLBA has only Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::NONE, FLBA_LENGTH)); + // FLBA has only Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::NONE, FLBA_LENGTH)); } template <> void TestStatistics::SetValues() { size_t nbytes = NUM_VALUES * FLBA_LENGTH; values_buf_.resize(nbytes); - std::string vals[NUM_VALUES] = {u8"a12345", u8"a123", u8"c123", u8"d123", u8"e123", - u8"f123", u8"g123", u8"h123", u8"üa123", u8"üa12345"}; + std::string vals[NUM_VALUES] = {u8"a12345", u8"a123", u8"c123", u8"d123", u8"e123", + u8"f123", u8"g123", u8"h123", u8"üa123", u8"üa12345"}; for (int i = 0; i < NUM_VALUES; i++) { - uint8_t *base = &values_buf_.data()[0] + (i * FLBA_LENGTH); + uint8_t* base = &values_buf_.data()[0] + (i * FLBA_LENGTH); memcpy(base, vals[i].c_str(), FLBA_LENGTH); values_[i].ptr = base; } // Write String values and min,max - stats_[0].set_min(std::string(reinterpret_cast(vals[1].c_str()), FLBA_LENGTH)) - .set_max(std::string(reinterpret_cast(vals[9].c_str()), FLBA_LENGTH)); + stats_[0] + .set_min(std::string(reinterpret_cast(vals[1].c_str()), FLBA_LENGTH)) + .set_max(std::string(reinterpret_cast(vals[9].c_str()), FLBA_LENGTH)); } - - TYPED_TEST_CASE(TestStatistics, TestTypes); TYPED_TEST(TestStatistics, MinMax) { diff --git a/src/parquet/parquet_version.h.in b/src/parquet/parquet_version.h.in index 7036d2fa..db8f396a 100644 --- a/src/parquet/parquet_version.h.in +++ b/src/parquet/parquet_version.h.in @@ -21,4 +21,4 @@ // define the parquet created by version #define CREATED_BY_VERSION "parquet-cpp version @PARQUET_VERSION@" -#endif // PARQUET_VERSION_H +#endif // PARQUET_VERSION_H diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc index e36a33f1..02489407 100644 --- a/src/parquet/statistics-test.cc +++ b/src/parquet/statistics-test.cc @@ -194,9 +194,8 @@ bool* TestRowGroupStatistics::GetValuesPointer(std::vector& v } template -typename std::vector -TestRowGroupStatistics::GetDeepCopy( - const std::vector& values) { +typename std::vector TestRowGroupStatistics< + TestType>::GetDeepCopy(const std::vector& values) { return values; } diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc index d6e61ef0..ff5af791 100644 --- a/src/parquet/statistics.cc +++ b/src/parquet/statistics.cc @@ -103,14 +103,17 @@ void TypedRowGroupStatistics::Update(const T* values, int64_t num_not_nul // TODO: support distinct count? if (num_not_null == 0) return; - auto batch_minmax = std::minmax_element(values, values + num_not_null, std::ref(*(this->comparator_))); + auto batch_minmax = + std::minmax_element(values, values + num_not_null, std::ref(*(this->comparator_))); if (!has_min_max_) { has_min_max_ = true; Copy(*batch_minmax.first, &min_, min_buffer_.get()); Copy(*batch_minmax.second, &max_, max_buffer_.get()); } else { - Copy(std::min(min_, *batch_minmax.first, std::ref(*(this->comparator_))), &min_, min_buffer_.get()); - Copy(std::max(max_, *batch_minmax.second, std::ref(*(this->comparator_))), &max_, max_buffer_.get()); + Copy(std::min(min_, *batch_minmax.first, std::ref(*(this->comparator_))), &min_, + min_buffer_.get()); + Copy(std::max(max_, *batch_minmax.second, std::ref(*(this->comparator_))), &max_, + max_buffer_.get()); } } @@ -184,8 +187,10 @@ void TypedRowGroupStatistics::Merge(const TypedRowGroupStatistics& return; } - Copy(std::min(this->min_, other.min_, std::ref(*(this->comparator_))), &this->min_, min_buffer_.get()); - Copy(std::max(this->max_, other.max_, std::ref(*(this->comparator_))), &this->max_, max_buffer_.get()); + Copy(std::min(this->min_, other.min_, std::ref(*(this->comparator_))), &this->min_, + min_buffer_.get()); + Copy(std::max(this->max_, other.max_, std::ref(*(this->comparator_))), &this->max_, + max_buffer_.get()); } template diff --git a/src/parquet/types.cc b/src/parquet/types.cc index 515026cd..0652c6a8 100644 --- a/src/parquet/types.cc +++ b/src/parquet/types.cc @@ -267,7 +267,7 @@ SortOrder::type GetSortOrder(LogicalType::type converted, Type::type primitive) case LogicalType::MAP_KEY_VALUE: case LogicalType::INTERVAL: case LogicalType::NONE: // required instead of default - case LogicalType::NA: // required instead of default + case LogicalType::NA: // required instead of default return SortOrder::UNKNOWN; } return SortOrder::UNKNOWN; diff --git a/src/parquet/types.h b/src/parquet/types.h index 9a397163..3b1a0afe 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -298,7 +298,8 @@ PARQUET_EXPORT int GetTypeByteSize(Type::type t); SortOrder::type PARQUET_EXPORT DefaultSortOrder(Type::type primitive); -SortOrder::type PARQUET_EXPORT GetSortOrder(LogicalType::type converted, Type::type primitive); +SortOrder::type PARQUET_EXPORT GetSortOrder(LogicalType::type converted, + Type::type primitive); } // namespace parquet diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index 58e7b660..97aaae5c 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -43,7 +43,8 @@ static FLBA FLBAFromString(const std::string& s) { } TEST(Comparison, signedByteArray) { - NodePtr node = PrimitiveNode::Make("SignedByteArray", Repetition::REQUIRED, Type::BYTE_ARRAY); + NodePtr node = + PrimitiveNode::Make("SignedByteArray", Repetition::REQUIRED, Type::BYTE_ARRAY); ColumnDescriptor descr(node, 0, 0); CompareDefaultByteArray less; @@ -64,7 +65,8 @@ TEST(Comparison, signedByteArray) { } TEST(Comparison, UnsignedByteArray) { - NodePtr node = PrimitiveNode::Make("UnsignedByteArray", Repetition::REQUIRED, Type::BYTE_ARRAY, LogicalType::UTF8); + NodePtr node = PrimitiveNode::Make("UnsignedByteArray", Repetition::REQUIRED, + Type::BYTE_ARRAY, LogicalType::UTF8); ColumnDescriptor descr(node, 0, 0); // Check if UTF-8 is compared using unsigned correctly @@ -85,9 +87,8 @@ TEST(Comparison, UnsignedByteArray) { TEST(Comparison, SignedFLBA) { int size = 10; - NodePtr node = - PrimitiveNode::Make("SignedFLBA", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::NONE, size); + NodePtr node = PrimitiveNode::Make("SignedFLBA", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, size); ColumnDescriptor descr(node, 0, 0); CompareDefaultFLBA less(descr.type_length()); @@ -107,9 +108,8 @@ TEST(Comparison, SignedFLBA) { TEST(Comparison, UnsignedFLBA) { int size = 10; - NodePtr node = - PrimitiveNode::Make("UnsignedFLBA", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::NONE, size); + NodePtr node = PrimitiveNode::Make("UnsignedFLBA", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, size); ColumnDescriptor descr(node, 0, 0); CompareUnsignedFLBA uless(descr.type_length()); diff --git a/src/parquet/util/comparison.cc b/src/parquet/util/comparison.cc index 06b6b6c5..068d0338 100644 --- a/src/parquet/util/comparison.cc +++ b/src/parquet/util/comparison.cc @@ -24,48 +24,49 @@ namespace parquet { - std::shared_ptr Compare::Make(const ColumnDescriptor* descr){ - if (SortOrder::SIGNED == GetSortOrder(descr->logical_type(), descr->physical_type())) { - switch (descr->physical_type()) { - case Type::BOOLEAN: - return std::make_shared(); - case Type::INT32: - return std::make_shared(); - case Type::INT64: - return std::make_shared(); - case Type::INT96: - return std::make_shared(); - case Type::FLOAT: - return std::make_shared(); - case Type::DOUBLE: - return std::make_shared(); - case Type::BYTE_ARRAY: - return std::make_shared(); - case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared(descr->type_length()); - default: - ParquetException::NYI("Signed Compare not implemented"); - } - } else if (SortOrder::UNSIGNED == GetSortOrder(descr->logical_type(), descr->physical_type())) { - switch (descr->physical_type()) { - case Type::INT32: - return std::make_shared(); - case Type::INT64: - return std::make_shared(); - case Type::INT96: - return std::make_shared(); - case Type::BYTE_ARRAY: - return std::make_shared(); - case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared(descr->type_length()); - default: - ParquetException::NYI("Unsigned Compare not implemented"); - } - } else { - throw ParquetException("UNKNOWN Sort Order"); - } - return nullptr; +std::shared_ptr Compare::Make(const ColumnDescriptor* descr) { + if (SortOrder::SIGNED == GetSortOrder(descr->logical_type(), descr->physical_type())) { + switch (descr->physical_type()) { + case Type::BOOLEAN: + return std::make_shared(); + case Type::INT32: + return std::make_shared(); + case Type::INT64: + return std::make_shared(); + case Type::INT96: + return std::make_shared(); + case Type::FLOAT: + return std::make_shared(); + case Type::DOUBLE: + return std::make_shared(); + case Type::BYTE_ARRAY: + return std::make_shared(); + case Type::FIXED_LEN_BYTE_ARRAY: + return std::make_shared(descr->type_length()); + default: + ParquetException::NYI("Signed Compare not implemented"); + } + } else if (SortOrder::UNSIGNED == + GetSortOrder(descr->logical_type(), descr->physical_type())) { + switch (descr->physical_type()) { + case Type::INT32: + return std::make_shared(); + case Type::INT64: + return std::make_shared(); + case Type::INT96: + return std::make_shared(); + case Type::BYTE_ARRAY: + return std::make_shared(); + case Type::FIXED_LEN_BYTE_ARRAY: + return std::make_shared(descr->type_length()); + default: + ParquetException::NYI("Unsigned Compare not implemented"); + } + } else { + throw ParquetException("UNKNOWN Sort Order"); } + return nullptr; +} template class PARQUET_TEMPLATE_EXPORT CompareDefault; template class PARQUET_TEMPLATE_EXPORT CompareDefault; diff --git a/src/parquet/util/comparison.h b/src/parquet/util/comparison.h index 94cf45ba..7bef66d2 100644 --- a/src/parquet/util/comparison.h +++ b/src/parquet/util/comparison.h @@ -39,12 +39,10 @@ class PARQUET_EXPORT CompareDefault : public Compare { typedef typename DType::c_type T; CompareDefault() {} virtual ~CompareDefault() {} - virtual bool operator()(const T& a, const T& b) { - return a < b; - } + virtual bool operator()(const T& a, const T& b) { return a < b; } }; -template<> +template <> class PARQUET_EXPORT CompareDefault : public Compare { public: CompareDefault() {} @@ -56,7 +54,7 @@ class PARQUET_EXPORT CompareDefault : public Compare { } }; -template<> +template <> class PARQUET_EXPORT CompareDefault : public Compare { public: CompareDefault() {} @@ -64,12 +62,11 @@ class PARQUET_EXPORT CompareDefault : public Compare { virtual bool operator()(const ByteArray& a, const ByteArray& b) { const int8_t* aptr = reinterpret_cast(a.ptr); const int8_t* bptr = reinterpret_cast(b.ptr); - return std::lexicographical_compare( - aptr, aptr + a.len, bptr, bptr + b.len); + return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len); } }; -template<> +template <> class PARQUET_EXPORT CompareDefault : public Compare { public: explicit CompareDefault(int length) : type_length_(length) {} @@ -77,8 +74,8 @@ class PARQUET_EXPORT CompareDefault : public Compare { virtual bool operator()(const FLBA& a, const FLBA& b) { const int8_t* aptr = reinterpret_cast(a.ptr); const int8_t* bptr = reinterpret_cast(b.ptr); - return std::lexicographical_compare( - aptr, aptr + type_length_, bptr, bptr + type_length_); + return std::lexicographical_compare(aptr, aptr + type_length_, bptr, + bptr + type_length_); } int32_t type_length_; }; @@ -112,7 +109,7 @@ PARQUET_EXTERN_TEMPLATE CompareDefault; // Define Unsigned Comparators class PARQUET_EXPORT CompareUnsignedInt32 : public CompareDefaultInt32 { - public: + public: virtual ~CompareUnsignedInt32() {} bool operator()(const int32_t& a, const int32_t& b) override { const uint32_t ua = a; @@ -122,7 +119,7 @@ class PARQUET_EXPORT CompareUnsignedInt32 : public CompareDefaultInt32 { }; class PARQUET_EXPORT CompareUnsignedInt64 : public CompareDefaultInt64 { - public: + public: virtual ~CompareUnsignedInt64() {} bool operator()(const int64_t& a, const int64_t& b) override { const uint64_t ua = a; @@ -131,7 +128,6 @@ class PARQUET_EXPORT CompareUnsignedInt64 : public CompareDefaultInt64 { } }; - class PARQUET_EXPORT CompareUnsignedInt96 : public CompareDefaultInt96 { public: virtual ~CompareUnsignedInt96() {} @@ -148,8 +144,7 @@ class PARQUET_EXPORT CompareUnsignedByteArray : public CompareDefaultByteArray { bool operator()(const ByteArray& a, const ByteArray& b) override { const uint8_t* aptr = reinterpret_cast(a.ptr); const uint8_t* bptr = reinterpret_cast(b.ptr); - return std::lexicographical_compare( - aptr, aptr + a.len, bptr, bptr + b.len); + return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len); } }; @@ -160,8 +155,8 @@ class PARQUET_EXPORT CompareUnsignedFLBA : public CompareDefaultFLBA { bool operator()(const FLBA& a, const FLBA& b) override { const uint8_t* aptr = reinterpret_cast(a.ptr); const uint8_t* bptr = reinterpret_cast(b.ptr); - return std::lexicographical_compare( - aptr, aptr + type_length_, bptr, bptr + type_length_); + return std::lexicographical_compare(aptr, aptr + type_length_, bptr, + bptr + type_length_); } }; From b4fba8bc8806e2d5d0fd55c1a842d5f6fc3b8fbc Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Wed, 9 Aug 2017 15:00:41 -0400 Subject: [PATCH 07/21] Add test for Unknown sort order --- src/parquet/parquet_reader_writer-test.cc | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index 41bd9dba..a4a25eae 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -297,5 +297,29 @@ TYPED_TEST(TestStatistics, MinMax) { this->VerifyParquetStats(); } +// Ensure UNKNOWN sort order is handled properly +using TestStatisticsFLBA = TestStatistics; + +TEST_F(TestStatisticsFLBA, UnknownSortOrder) { + this->fields_.push_back(schema::PrimitiveNode::Make("Column 0", Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::INTERVAL, FLBA_LENGTH)); + this->SetUpSchema(); + this->WriteParquet(); + + auto pbuffer = parquet_sink_->GetBuffer(); + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::Open( + std::make_shared(pbuffer)); + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + std::shared_ptr rg_metadata = file_metadata->RowGroup(0); + std::shared_ptr cc_metadata = + rg_metadata->ColumnChunk(0); + // stats should not be set + ASSERT_FALSE(cc_metadata->is_stats_set()); +} + } // namespace test } // namespace parquet From 55960fe5e0d77359695f15e9cdac5054ed21339c Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Wed, 9 Aug 2017 15:01:20 -0400 Subject: [PATCH 08/21] format --- src/parquet/parquet_reader_writer-test.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index a4a25eae..27867529 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -301,9 +301,9 @@ TYPED_TEST(TestStatistics, MinMax) { using TestStatisticsFLBA = TestStatistics; TEST_F(TestStatisticsFLBA, UnknownSortOrder) { - this->fields_.push_back(schema::PrimitiveNode::Make("Column 0", Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::INTERVAL, FLBA_LENGTH)); + this->fields_.push_back(schema::PrimitiveNode::Make( + "Column 0", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, + FLBA_LENGTH)); this->SetUpSchema(); this->WriteParquet(); @@ -311,12 +311,11 @@ TEST_F(TestStatisticsFLBA, UnknownSortOrder) { // Create a ParquetReader instance std::unique_ptr parquet_reader = parquet::ParquetFileReader::Open( - std::make_shared(pbuffer)); + std::make_shared(pbuffer)); // Get the File MetaData std::shared_ptr file_metadata = parquet_reader->metadata(); std::shared_ptr rg_metadata = file_metadata->RowGroup(0); - std::shared_ptr cc_metadata = - rg_metadata->ColumnChunk(0); + std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(0); // stats should not be set ASSERT_FALSE(cc_metadata->is_stats_set()); } From 808e764134383552206463cc9211de4161ed1818 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Wed, 9 Aug 2017 15:21:04 -0400 Subject: [PATCH 09/21] fix failure --- src/parquet/parquet_reader_writer-test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index 27867529..6971b0db 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -243,7 +243,7 @@ void TestStatistics::SetValues() { size_t nbytes = NUM_VALUES * max_byte_array_len; values_buf_.resize(nbytes); std::string vals[NUM_VALUES] = {u8"c123", u8"b123", u8"a123", u8"d123", u8"e123", - u8"f123", u8"g123", u8"h123", u8"i123", u8"üa123"}; + u8"f123", u8"g123", u8"h123", u8"i123", u8"ü123"}; for (int i = 0; i < NUM_VALUES; i++) { uint8_t* base = &values_buf_.data()[0] + (i * max_byte_array_len); @@ -273,19 +273,19 @@ template <> void TestStatistics::SetValues() { size_t nbytes = NUM_VALUES * FLBA_LENGTH; values_buf_.resize(nbytes); - std::string vals[NUM_VALUES] = {u8"a12345", u8"a123", u8"c123", u8"d123", u8"e123", - u8"f123", u8"g123", u8"h123", u8"üa123", u8"üa12345"}; + std::string vals[NUM_VALUES] = {u8"b12345", u8"aü123456789", u8"c123", u8"d123", u8"e123", + u8"f123", u8"g123", u8"h123", u8"üa123", u8"üa123456789"}; for (int i = 0; i < NUM_VALUES; i++) { uint8_t* base = &values_buf_.data()[0] + (i * FLBA_LENGTH); - memcpy(base, vals[i].c_str(), FLBA_LENGTH); + memcpy(base, vals[i].c_str(), vals[i].length()); values_[i].ptr = base; } // Write String values and min,max stats_[0] - .set_min(std::string(reinterpret_cast(vals[1].c_str()), FLBA_LENGTH)) - .set_max(std::string(reinterpret_cast(vals[9].c_str()), FLBA_LENGTH)); + .set_min(std::string(reinterpret_cast(vals[1].c_str()), vals[1].length())) + .set_max(std::string(reinterpret_cast(vals[9].c_str()), vals[9].length())); } TYPED_TEST_CASE(TestStatistics, TestTypes); From 6a985de1b4337d937c5834851e65b31926022880 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Wed, 9 Aug 2017 15:44:07 -0400 Subject: [PATCH 10/21] Comments --- src/parquet/file/metadata.cc | 4 ++++ src/parquet/parquet_reader_writer-test.cc | 21 +++++++++++---------- src/parquet/statistics-test.cc | 2 ++ 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc index 7f17d0f2..9ed6e193 100644 --- a/src/parquet/file/metadata.cc +++ b/src/parquet/file/metadata.cc @@ -41,12 +41,14 @@ const ApplicationVersion ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION = template static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { + // If new fields max_value/min_value are set, then return them. if (metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value) { return std::make_shared>( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, true); } + // Default behavior return std::make_shared>( descr, metadata.statistics.min, metadata.statistics.max, metadata.num_values - metadata.statistics.null_count, @@ -548,6 +550,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { stats.__isset.max_value = val.has_max; stats.__isset.null_count = val.has_null_count; stats.__isset.distinct_count = val.has_distinct_count; + // If the order is SIGNED, then the old min/max values must be set too. + // This for backward compatibility if (is_signed) { stats.max = val.max(); stats.min = val.min(); diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index 6971b0db..d601818a 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -126,12 +126,12 @@ void TestStatistics::SetValues() { values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; } - // Write UINT32 values and min,max + // Write UINT32 min/max values stats_[0] .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); - // Write INT32 values and min,max + // Write INT32 min/max values stats_[1] .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); @@ -154,12 +154,12 @@ void TestStatistics::SetValues() { values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; } - // Write UINT64 values and min,max + // Write UINT64 min/max values stats_[0] .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); - // Write INT64 values and min,max + // Write INT64 min/max values stats_[1] .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); @@ -181,7 +181,7 @@ void TestStatistics::SetValues() { values_[i].value[2] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; } - // Write Int96 values and min,max + // Write Int96 min/max values stats_[0] .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); @@ -202,7 +202,7 @@ void TestStatistics::SetValues() { (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; } - // Write Float values and min,max + // Write Float min/max values stats_[0] .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); @@ -223,7 +223,7 @@ void TestStatistics::SetValues() { (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; } - // Write Double values and min,max + // Write Double min/max values stats_[0] .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); @@ -252,7 +252,7 @@ void TestStatistics::SetValues() { values_[i].len = max_byte_array_len; } - // Write String values and min,max + // Write String min/max values stats_[0] .set_min( std::string(reinterpret_cast(vals[2].c_str()), max_byte_array_len)) @@ -282,7 +282,7 @@ void TestStatistics::SetValues() { values_[i].ptr = base; } - // Write String values and min,max + // Write FLBA min,max values stats_[0] .set_min(std::string(reinterpret_cast(vals[1].c_str()), vals[1].length())) .set_max(std::string(reinterpret_cast(vals[9].c_str()), vals[9].length())); @@ -316,7 +316,8 @@ TEST_F(TestStatisticsFLBA, UnknownSortOrder) { std::shared_ptr file_metadata = parquet_reader->metadata(); std::shared_ptr rg_metadata = file_metadata->RowGroup(0); std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(0); - // stats should not be set + + // stats should not be set for UNKNOWN sort order ASSERT_FALSE(cc_metadata->is_stats_set()); } diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc index 02489407..44cc1cce 100644 --- a/src/parquet/statistics-test.cc +++ b/src/parquet/statistics-test.cc @@ -310,6 +310,7 @@ TYPED_TEST(TestNumericRowGroupStatistics, Merge) { this->TestMerge(); } +// Statistics are restricted for few types in older parquet version TEST(CorruptStatistics, Basics) { ApplicationVersion version("parquet-mr version 1.8.0"); SchemaDescriptor schema; @@ -355,6 +356,7 @@ TEST(CorruptStatistics, Basics) { ASSERT_FALSE(column_chunk6->is_stats_set()); } +// Statistics for all types have no restrictions in newer parquet version TEST(CorrectStatistics, Basics) { ApplicationVersion version("parquet-cpp version 1.2.1"); SchemaDescriptor schema; From bdd8d37a560ac4b34a646fda58a84784ef6e12fb Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Wed, 9 Aug 2017 16:11:46 -0400 Subject: [PATCH 11/21] Add another test --- src/parquet/parquet_reader_writer-test.cc | 11 +++++++---- src/parquet/util/comparison-test.cc | 9 +++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index d601818a..50806641 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -273,8 +273,9 @@ template <> void TestStatistics::SetValues() { size_t nbytes = NUM_VALUES * FLBA_LENGTH; values_buf_.resize(nbytes); - std::string vals[NUM_VALUES] = {u8"b12345", u8"aü123456789", u8"c123", u8"d123", u8"e123", - u8"f123", u8"g123", u8"h123", u8"üa123", u8"üa123456789"}; + std::string vals[NUM_VALUES] = {u8"b12345", u8"aü123456789", u8"c123", u8"d123", + u8"e123", u8"f123", u8"g123", u8"h123", + u8"üa123", u8"üa123456789"}; for (int i = 0; i < NUM_VALUES; i++) { uint8_t* base = &values_buf_.data()[0] + (i * FLBA_LENGTH); @@ -284,8 +285,10 @@ void TestStatistics::SetValues() { // Write FLBA min,max values stats_[0] - .set_min(std::string(reinterpret_cast(vals[1].c_str()), vals[1].length())) - .set_max(std::string(reinterpret_cast(vals[9].c_str()), vals[9].length())); + .set_min( + std::string(reinterpret_cast(vals[1].c_str()), vals[1].length())) + .set_max( + std::string(reinterpret_cast(vals[9].c_str()), vals[9].length())); } TYPED_TEST_CASE(TestStatistics, TestTypes); diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index 97aaae5c..4f51df81 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -200,6 +200,15 @@ TEST(Comparison, UnsignedInt32) { ASSERT_TRUE(less(aaa, bbb)); } +TEST(Comparison, UnknownSortOrder) { + NodePtr node = + PrimitiveNode::Make("Unknown", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::INTERVAL, 12); + ColumnDescriptor descr(node, 0, 0); + + ASSERT_THROW(Compare::Make(&descr), ParquetException); +} + } // namespace test } // namespace parquet From 85a9052767f82b77f6c4f0a3782d98c77e734373 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Wed, 9 Aug 2017 17:52:13 -0400 Subject: [PATCH 12/21] fix Clang failure and improve test Fix Warnings on Windows --- src/parquet/parquet_reader_writer-test.cc | 8 ++++---- src/parquet/util/comparison-test.cc | 13 +++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index 50806641..44c0d105 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -71,7 +71,7 @@ class TestStatistics : public ::testing::Test { this->SetValues(); // Insert Values - for (size_t i = 0; i < fields_.size(); i++) { + for (int i = 0; i < static_cast(fields_.size()); i++) { auto column_writer = static_cast*>(rg_writer->NextColumn()); column_writer->WriteBatch(NUM_VALUES, nullptr, nullptr, values_.data()); @@ -89,7 +89,7 @@ class TestStatistics : public ::testing::Test { // Get the File MetaData std::shared_ptr file_metadata = parquet_reader->metadata(); std::shared_ptr rg_metadata = file_metadata->RowGroup(0); - for (size_t i = 0; i < fields_.size(); i++) { + for (int i = 0; i < static_cast(fields_.size()); i++) { std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(i); ASSERT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); @@ -242,7 +242,7 @@ void TestStatistics::SetValues() { int max_byte_array_len = 4; size_t nbytes = NUM_VALUES * max_byte_array_len; values_buf_.resize(nbytes); - std::string vals[NUM_VALUES] = {u8"c123", u8"b123", u8"a123", u8"d123", u8"e123", + std::vector vals = {u8"c123", u8"b123", u8"a123", u8"d123", u8"e123", u8"f123", u8"g123", u8"h123", u8"i123", u8"ü123"}; for (int i = 0; i < NUM_VALUES; i++) { @@ -273,7 +273,7 @@ template <> void TestStatistics::SetValues() { size_t nbytes = NUM_VALUES * FLBA_LENGTH; values_buf_.resize(nbytes); - std::string vals[NUM_VALUES] = {u8"b12345", u8"aü123456789", u8"c123", u8"d123", + std::vector vals = {u8"b12345", u8"aü123456789", u8"c123", u8"d123", u8"e123", u8"f123", u8"g123", u8"h123", u8"üa123", u8"üa123456789"}; diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index 4f51df81..a0423cb7 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -83,6 +83,19 @@ TEST(Comparison, UnsignedByteArray) { s1ba = ByteArrayFromString(s1); s2ba = ByteArrayFromString(s2); ASSERT_TRUE(uless(s1ba, s2ba)); + + // Multi-byte UTF-8 characters + s1 = u8"ünk123456"; // ü = 252 + s2 = u8"ănk123456"; // ă = 259 + s1ba = ByteArrayFromString(s1); + s2ba = ByteArrayFromString(s2); + ASSERT_TRUE(uless(s1ba, s2ba)); + + s1 = u8"Ănk123456"; // Ă = 258 + s2 = u8"ănk123456"; // ă = 259 + s1ba = ByteArrayFromString(s1); + s2ba = ByteArrayFromString(s2); + ASSERT_TRUE(uless(s1ba, s2ba)); } TEST(Comparison, SignedFLBA) { From 17a79f384ecec9f11a303cc9faefae16674973b6 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Thu, 10 Aug 2017 12:39:27 -0400 Subject: [PATCH 13/21] fix failures --- src/parquet/parquet_reader_writer-test.cc | 32 ++++++++++++----------- src/parquet/parquet_version.h.in | 2 +- src/parquet/util/comparison-test.cc | 6 ++++- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index 44c0d105..beccfb88 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -32,8 +32,8 @@ using parquet::LogicalType; using parquet::schema::PrimitiveNode; using parquet::schema::GroupNode; -static int FLBA_LENGTH = 12; -static int NUM_VALUES = 10; +static const int FLBA_LENGTH = 12; +static const int NUM_VALUES = 10; namespace test { @@ -239,25 +239,26 @@ void TestStatistics::AddNodes(std::string name) { template <> void TestStatistics::SetValues() { - int max_byte_array_len = 4; + int max_byte_array_len = 10; size_t nbytes = NUM_VALUES * max_byte_array_len; values_buf_.resize(nbytes); std::vector vals = {u8"c123", u8"b123", u8"a123", u8"d123", u8"e123", u8"f123", u8"g123", u8"h123", u8"i123", u8"ü123"}; + uint8_t* base = &values_buf_.data()[0]; for (int i = 0; i < NUM_VALUES; i++) { - uint8_t* base = &values_buf_.data()[0] + (i * max_byte_array_len); - memcpy(base, vals[i].c_str(), max_byte_array_len); + memcpy(base, vals[i].c_str(), vals[i].length()); values_[i].ptr = base; - values_[i].len = max_byte_array_len; + values_[i].len = static_cast(vals[i].length()); + base += vals[i].length(); } // Write String min/max values stats_[0] .set_min( - std::string(reinterpret_cast(vals[2].c_str()), max_byte_array_len)) + std::string(reinterpret_cast(vals[2].c_str()), vals[2].length())) .set_max(std::string(reinterpret_cast(vals[9].c_str()), - max_byte_array_len)); + vals[9].length())); } // TYPE::FLBAArray @@ -273,22 +274,23 @@ template <> void TestStatistics::SetValues() { size_t nbytes = NUM_VALUES * FLBA_LENGTH; values_buf_.resize(nbytes); - std::vector vals = {u8"b12345", u8"aü123456789", u8"c123", u8"d123", - u8"e123", u8"f123", u8"g123", u8"h123", - u8"üa123", u8"üa123456789"}; + char vals[NUM_VALUES][FLBA_LENGTH] = {"b12345", "a12345", "c12345", "d12345", + "e12345", "f12345", "g12345", "h12345", + "z12345", "a12345"}; + uint8_t* base = &values_buf_.data()[0]; for (int i = 0; i < NUM_VALUES; i++) { - uint8_t* base = &values_buf_.data()[0] + (i * FLBA_LENGTH); - memcpy(base, vals[i].c_str(), vals[i].length()); + memcpy(base, &vals[i][0], FLBA_LENGTH); values_[i].ptr = base; + base += FLBA_LENGTH; } // Write FLBA min,max values stats_[0] .set_min( - std::string(reinterpret_cast(vals[1].c_str()), vals[1].length())) + std::string(reinterpret_cast(&vals[1][0]), FLBA_LENGTH)) .set_max( - std::string(reinterpret_cast(vals[9].c_str()), vals[9].length())); + std::string(reinterpret_cast(&vals[8][0]), FLBA_LENGTH)); } TYPED_TEST_CASE(TestStatistics, TestTypes); diff --git a/src/parquet/parquet_version.h.in b/src/parquet/parquet_version.h.in index db8f396a..7036d2fa 100644 --- a/src/parquet/parquet_version.h.in +++ b/src/parquet/parquet_version.h.in @@ -21,4 +21,4 @@ // define the parquet created by version #define CREATED_BY_VERSION "parquet-cpp version @PARQUET_VERSION@" -#endif // PARQUET_VERSION_H +#endif // PARQUET_VERSION_H diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index a0423cb7..6364592d 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -25,6 +25,10 @@ #include "parquet/types.h" #include "parquet/util/comparison.h" +#if defined(_MSC_VER) +#pragma execution_character_set("utf-8") +#endif + namespace parquet { namespace test { @@ -78,13 +82,13 @@ TEST(Comparison, UnsignedByteArray) { ByteArray s2ba = ByteArrayFromString(s2); ASSERT_TRUE(uless(s1ba, s2ba)); + // Multi-byte UTF-8 characters s1 = u8"braten"; s2 = u8"bügeln"; s1ba = ByteArrayFromString(s1); s2ba = ByteArrayFromString(s2); ASSERT_TRUE(uless(s1ba, s2ba)); - // Multi-byte UTF-8 characters s1 = u8"ünk123456"; // ü = 252 s2 = u8"ănk123456"; // ă = 259 s1ba = ByteArrayFromString(s1); From 3cb8a3eff87a23531e6f66a65c189fed5cbb268e Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Thu, 10 Aug 2017 18:14:22 -0400 Subject: [PATCH 14/21] Review comments --- src/parquet/column_writer.cc | 6 ++---- src/parquet/file/metadata.cc | 3 +-- src/parquet/schema.h | 3 +++ src/parquet/statistics.cc | 7 +++++-- src/parquet/statistics.h | 9 ++++++++- src/parquet/util/comparison.cc | 5 ++--- 6 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index fef3676c..183f1513 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -268,8 +268,7 @@ int64_t ColumnWriter::Close() { EncodedStatistics chunk_statistics = GetChunkStatistics(); if (chunk_statistics.is_set()) - metadata_->SetStatistics(SortOrder::SIGNED == GetSortOrder(descr_->logical_type(), - descr_->physical_type()), + metadata_->SetStatistics(SortOrder::SIGNED == descr_->sort_order(), chunk_statistics); pager_->Close(has_dictionary_, fallback_); } @@ -321,8 +320,7 @@ TypedColumnWriter::TypedColumnWriter(ColumnChunkMetaDataBuilder* metadata, } if (properties->statistics_enabled(descr_->path()) && - (SortOrder::UNKNOWN != - GetSortOrder(descr_->logical_type(), descr_->physical_type()))) { + (SortOrder::UNKNOWN != descr_->sort_order())) { page_statistics_ = std::unique_ptr(new TypedStats(descr_, allocator_)); chunk_statistics_ = std::unique_ptr(new TypedStats(descr_, allocator_)); } diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc index 9ed6e193..ef2c23c3 100644 --- a/src/parquet/file/metadata.cc +++ b/src/parquet/file/metadata.cc @@ -117,8 +117,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline bool is_stats_set() const { DCHECK(writer_version_ != nullptr); return column_->meta_data.__isset.statistics && - writer_version_->HasCorrectStatistics( - type(), GetSortOrder(descr_->logical_type(), descr_->physical_type())); + writer_version_->HasCorrectStatistics(type(), descr_->sort_order()); } inline std::shared_ptr statistics() const { diff --git a/src/parquet/schema.h b/src/parquet/schema.h index e240b827..9acd2891 100644 --- a/src/parquet/schema.h +++ b/src/parquet/schema.h @@ -332,6 +332,9 @@ class PARQUET_EXPORT ColumnDescriptor { LogicalType::type logical_type() const { return primitive_node_->logical_type(); } + SortOrder::type sort_order() const { return GetSortOrder(logical_type(), + physical_type()); } + const std::string& name() const { return primitive_node_->name(); } const std::shared_ptr path() const; diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc index ff5af791..b78205ae 100644 --- a/src/parquet/statistics.cc +++ b/src/parquet/statistics.cc @@ -34,7 +34,6 @@ TypedRowGroupStatistics::TypedRowGroupStatistics(const ColumnDescriptor* : pool_(pool), min_buffer_(AllocateBuffer(pool_, 0)), max_buffer_(AllocateBuffer(pool_, 0)) { - comparator_ = std::static_pointer_cast >(Compare::Make(schema)); SetDescr(schema); Reset(); } @@ -69,7 +68,6 @@ TypedRowGroupStatistics::TypedRowGroupStatistics( IncrementNullCount(null_count); IncrementDistinctCount(distinct_count); - comparator_ = std::static_pointer_cast >(Compare::Make(schema)); SetDescr(schema); if (!encoded_min.empty()) { @@ -86,6 +84,11 @@ bool TypedRowGroupStatistics::HasMinMax() const { return has_min_max_; } +template +void TypedRowGroupStatistics::SetComparator() { + comparator_ = std::static_pointer_cast >(Compare::Make(descr_)); +} + template void TypedRowGroupStatistics::Reset() { ResetCounts(); diff --git a/src/parquet/statistics.h b/src/parquet/statistics.h index 13ebfd3e..b5466c08 100644 --- a/src/parquet/statistics.h +++ b/src/parquet/statistics.h @@ -98,13 +98,19 @@ class PARQUET_EXPORT RowGroupStatistics virtual EncodedStatistics Encode() = 0; + // Set the Corresponding Comparator + virtual void SetComparator() = 0; + virtual ~RowGroupStatistics() {} Type::type physical_type() const { return descr_->physical_type(); } protected: const ColumnDescriptor* descr() const { return descr_; } - void SetDescr(const ColumnDescriptor* schema) { descr_ = schema; } + void SetDescr(const ColumnDescriptor* schema) { + descr_ = schema; + SetComparator(); + } void IncrementNullCount(int64_t n) { statistics_.null_count += n; } @@ -147,6 +153,7 @@ class TypedRowGroupStatistics : public RowGroupStatistics { bool HasMinMax() const override; void Reset() override; + void SetComparator() override; void Merge(const TypedRowGroupStatistics& other); void Update(const T* values, int64_t num_not_null, int64_t num_null); diff --git a/src/parquet/util/comparison.cc b/src/parquet/util/comparison.cc index 068d0338..86afcd80 100644 --- a/src/parquet/util/comparison.cc +++ b/src/parquet/util/comparison.cc @@ -25,7 +25,7 @@ namespace parquet { std::shared_ptr Compare::Make(const ColumnDescriptor* descr) { - if (SortOrder::SIGNED == GetSortOrder(descr->logical_type(), descr->physical_type())) { + if (SortOrder::SIGNED == descr->sort_order()) { switch (descr->physical_type()) { case Type::BOOLEAN: return std::make_shared(); @@ -46,8 +46,7 @@ std::shared_ptr Compare::Make(const ColumnDescriptor* descr) { default: ParquetException::NYI("Signed Compare not implemented"); } - } else if (SortOrder::UNSIGNED == - GetSortOrder(descr->logical_type(), descr->physical_type())) { + } else if (SortOrder::UNSIGNED == descr->sort_order()) { switch (descr->physical_type()) { case Type::INT32: return std::make_shared(); From f87bda60fa075ec883b76b4813c49317154dab88 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Fri, 11 Aug 2017 11:36:32 -0400 Subject: [PATCH 15/21] Avoid UTF-8 file encoding mismatch between windows and linux --- src/parquet/util/comparison-test.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index 6364592d..f4fd567d 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -25,10 +25,6 @@ #include "parquet/types.h" #include "parquet/util/comparison.h" -#if defined(_MSC_VER) -#pragma execution_character_set("utf-8") -#endif - namespace parquet { namespace test { @@ -94,12 +90,6 @@ TEST(Comparison, UnsignedByteArray) { s1ba = ByteArrayFromString(s1); s2ba = ByteArrayFromString(s2); ASSERT_TRUE(uless(s1ba, s2ba)); - - s1 = u8"Ănk123456"; // Ă = 258 - s2 = u8"ănk123456"; // ă = 259 - s1ba = ByteArrayFromString(s1); - s2ba = ByteArrayFromString(s2); - ASSERT_TRUE(uless(s1ba, s2ba)); } TEST(Comparison, SignedFLBA) { From ba18ae64886d565c100f1a4e432e32fcd1f8a3e5 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Tue, 5 Sep 2017 11:42:35 -0400 Subject: [PATCH 16/21] Review comments --- src/parquet/column_writer.cc | 3 ++- src/parquet/file/metadata.h | 2 +- src/parquet/types.h | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc index 183f1513..ac7e2ba1 100644 --- a/src/parquet/column_writer.cc +++ b/src/parquet/column_writer.cc @@ -267,9 +267,10 @@ int64_t ColumnWriter::Close() { FlushBufferedDataPages(); EncodedStatistics chunk_statistics = GetChunkStatistics(); - if (chunk_statistics.is_set()) + if (chunk_statistics.is_set()) { metadata_->SetStatistics(SortOrder::SIGNED == descr_->sort_order(), chunk_statistics); + } pager_->Close(has_dictionary_, fallback_); } diff --git a/src/parquet/file/metadata.h b/src/parquet/file/metadata.h index 0844e467..54ba4682 100644 --- a/src/parquet/file/metadata.h +++ b/src/parquet/file/metadata.h @@ -82,7 +82,7 @@ class ApplicationVersion { bool VersionEq(const ApplicationVersion& other_version) const; // Checks if the Version has the correct statistics for a given column - bool HasCorrectStatistics(Type::type primitive, SortOrder::type sort_order) const; + bool HasCorrectStatistics(Type::type primitive, SortOrder::type sort_order = SortOrder::SIGNED) const; }; class PARQUET_EXPORT ColumnChunkMetaData { diff --git a/src/parquet/types.h b/src/parquet/types.h index 3b1a0afe..af3a58f5 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -296,9 +296,9 @@ PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* PARQUET_EXPORT int GetTypeByteSize(Type::type t); -SortOrder::type PARQUET_EXPORT DefaultSortOrder(Type::type primitive); +PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive); -SortOrder::type PARQUET_EXPORT GetSortOrder(LogicalType::type converted, +PARQUET_EXPORT SortOrder::type GetSortOrder(LogicalType::type converted, Type::type primitive); } // namespace parquet From c5d961019217ad6ba5af74de752685b5b94d92f4 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Tue, 5 Sep 2017 13:35:52 -0400 Subject: [PATCH 17/21] Rename compare to comparator --- src/parquet/column_writer-test.cc | 2 +- src/parquet/parquet_reader_writer-test.cc | 19 ++++--------------- src/parquet/statistics.cc | 2 +- src/parquet/util/comparison-test.cc | 2 +- src/parquet/util/comparison.cc | 2 +- src/parquet/util/comparison.h | 14 +++++++------- 6 files changed, 15 insertions(+), 26 deletions(-) diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc index fc3d7cc0..db733054 100644 --- a/src/parquet/column_writer-test.cc +++ b/src/parquet/column_writer-test.cc @@ -153,7 +153,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { this->ReadColumnFully(compression); std::shared_ptr> compare; compare = - std::static_pointer_cast>(Compare::Make(this->descr_)); + std::static_pointer_cast>(Comparator::Make(this->descr_)); for (size_t i = 0; i < this->values_.size(); i++) { if ((*compare)(this->values_[i], this->values_out_[i]) || (*compare)(this->values_out_[i], this->values_[i])) { diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index beccfb88..b5540a80 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -42,7 +42,10 @@ class TestStatistics : public ::testing::Test { public: typedef typename TestType::c_type T; - void AddNodes(std::string name); + void AddNodes(std::string name) { + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, TestType::type_num, + LogicalType::NONE)); + } void SetUpSchema() { stats_.resize(fields_.size()); @@ -188,13 +191,6 @@ void TestStatistics::SetValues() { } // TYPE::FLOAT -template <> -void TestStatistics::AddNodes(std::string name) { - // Float physical type has only Signed Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::FLOAT, - LogicalType::NONE)); -} - template <> void TestStatistics::SetValues() { for (int i = 0; i < NUM_VALUES; i++) { @@ -209,13 +205,6 @@ void TestStatistics::SetValues() { } // TYPE::DOUBLE -template <> -void TestStatistics::AddNodes(std::string name) { - // Double physical type has only Signed Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::DOUBLE, - LogicalType::NONE)); -} - template <> void TestStatistics::SetValues() { for (int i = 0; i < NUM_VALUES; i++) { diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc index b78205ae..4624614e 100644 --- a/src/parquet/statistics.cc +++ b/src/parquet/statistics.cc @@ -86,7 +86,7 @@ bool TypedRowGroupStatistics::HasMinMax() const { template void TypedRowGroupStatistics::SetComparator() { - comparator_ = std::static_pointer_cast >(Compare::Make(descr_)); + comparator_ = std::static_pointer_cast >(Comparator::Make(descr_)); } template diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index f4fd567d..e5f2f683 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -213,7 +213,7 @@ TEST(Comparison, UnknownSortOrder) { LogicalType::INTERVAL, 12); ColumnDescriptor descr(node, 0, 0); - ASSERT_THROW(Compare::Make(&descr), ParquetException); + ASSERT_THROW(Comparator::Make(&descr), ParquetException); } } // namespace test diff --git a/src/parquet/util/comparison.cc b/src/parquet/util/comparison.cc index 86afcd80..1d7bb9dc 100644 --- a/src/parquet/util/comparison.cc +++ b/src/parquet/util/comparison.cc @@ -24,7 +24,7 @@ namespace parquet { -std::shared_ptr Compare::Make(const ColumnDescriptor* descr) { +std::shared_ptr Comparator::Make(const ColumnDescriptor* descr) { if (SortOrder::SIGNED == descr->sort_order()) { switch (descr->physical_type()) { case Type::BOOLEAN: diff --git a/src/parquet/util/comparison.h b/src/parquet/util/comparison.h index 7bef66d2..803f0da6 100644 --- a/src/parquet/util/comparison.h +++ b/src/parquet/util/comparison.h @@ -26,15 +26,15 @@ namespace parquet { -class PARQUET_EXPORT Compare { +class PARQUET_EXPORT Comparator { public: - virtual ~Compare() {} - static std::shared_ptr Make(const ColumnDescriptor* descr); + virtual ~Comparator() {} + static std::shared_ptr Make(const ColumnDescriptor* descr); }; // The default comparison is SIGNED template -class PARQUET_EXPORT CompareDefault : public Compare { +class PARQUET_EXPORT CompareDefault : public Comparator { public: typedef typename DType::c_type T; CompareDefault() {} @@ -43,7 +43,7 @@ class PARQUET_EXPORT CompareDefault : public Compare { }; template <> -class PARQUET_EXPORT CompareDefault : public Compare { +class PARQUET_EXPORT CompareDefault : public Comparator { public: CompareDefault() {} virtual ~CompareDefault() {} @@ -55,7 +55,7 @@ class PARQUET_EXPORT CompareDefault : public Compare { }; template <> -class PARQUET_EXPORT CompareDefault : public Compare { +class PARQUET_EXPORT CompareDefault : public Comparator { public: CompareDefault() {} virtual ~CompareDefault() {} @@ -67,7 +67,7 @@ class PARQUET_EXPORT CompareDefault : public Compare { }; template <> -class PARQUET_EXPORT CompareDefault : public Compare { +class PARQUET_EXPORT CompareDefault : public Comparator { public: explicit CompareDefault(int length) : type_length_(length) {} virtual ~CompareDefault() {} From 75ea475801a17056bd33503f6bc8c0f1221fb932 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Tue, 5 Sep 2017 13:52:51 -0400 Subject: [PATCH 18/21] Fix formatting --- src/parquet/column_writer-test.cc | 4 ++-- src/parquet/file/metadata.h | 3 ++- src/parquet/parquet_reader_writer-test.cc | 21 +++++++++------------ src/parquet/parquet_version.h.in | 2 +- src/parquet/schema.h | 5 +++-- src/parquet/statistics.cc | 3 ++- src/parquet/util/comparison-test.cc | 4 ++-- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc index db733054..48f243ea 100644 --- a/src/parquet/column_writer-test.cc +++ b/src/parquet/column_writer-test.cc @@ -152,8 +152,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); std::shared_ptr> compare; - compare = - std::static_pointer_cast>(Comparator::Make(this->descr_)); + compare = std::static_pointer_cast>( + Comparator::Make(this->descr_)); for (size_t i = 0; i < this->values_.size(); i++) { if ((*compare)(this->values_[i], this->values_out_[i]) || (*compare)(this->values_out_[i], this->values_[i])) { diff --git a/src/parquet/file/metadata.h b/src/parquet/file/metadata.h index 54ba4682..0d8e10ef 100644 --- a/src/parquet/file/metadata.h +++ b/src/parquet/file/metadata.h @@ -82,7 +82,8 @@ class ApplicationVersion { bool VersionEq(const ApplicationVersion& other_version) const; // Checks if the Version has the correct statistics for a given column - bool HasCorrectStatistics(Type::type primitive, SortOrder::type sort_order = SortOrder::SIGNED) const; + bool HasCorrectStatistics(Type::type primitive, + SortOrder::type sort_order = SortOrder::SIGNED) const; }; class PARQUET_EXPORT ColumnChunkMetaData { diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc index b5540a80..a2fde651 100644 --- a/src/parquet/parquet_reader_writer-test.cc +++ b/src/parquet/parquet_reader_writer-test.cc @@ -43,8 +43,8 @@ class TestStatistics : public ::testing::Test { typedef typename TestType::c_type T; void AddNodes(std::string name) { - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, TestType::type_num, - LogicalType::NONE)); + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, + TestType::type_num, LogicalType::NONE)); } void SetUpSchema() { @@ -232,7 +232,7 @@ void TestStatistics::SetValues() { size_t nbytes = NUM_VALUES * max_byte_array_len; values_buf_.resize(nbytes); std::vector vals = {u8"c123", u8"b123", u8"a123", u8"d123", u8"e123", - u8"f123", u8"g123", u8"h123", u8"i123", u8"ü123"}; + u8"f123", u8"g123", u8"h123", u8"i123", u8"ü123"}; uint8_t* base = &values_buf_.data()[0]; for (int i = 0; i < NUM_VALUES; i++) { @@ -246,8 +246,8 @@ void TestStatistics::SetValues() { stats_[0] .set_min( std::string(reinterpret_cast(vals[2].c_str()), vals[2].length())) - .set_max(std::string(reinterpret_cast(vals[9].c_str()), - vals[9].length())); + .set_max( + std::string(reinterpret_cast(vals[9].c_str()), vals[9].length())); } // TYPE::FLBAArray @@ -263,9 +263,8 @@ template <> void TestStatistics::SetValues() { size_t nbytes = NUM_VALUES * FLBA_LENGTH; values_buf_.resize(nbytes); - char vals[NUM_VALUES][FLBA_LENGTH] = {"b12345", "a12345", "c12345", "d12345", - "e12345", "f12345", "g12345", "h12345", - "z12345", "a12345"}; + char vals[NUM_VALUES][FLBA_LENGTH] = {"b12345", "a12345", "c12345", "d12345", "e12345", + "f12345", "g12345", "h12345", "z12345", "a12345"}; uint8_t* base = &values_buf_.data()[0]; for (int i = 0; i < NUM_VALUES; i++) { @@ -276,10 +275,8 @@ void TestStatistics::SetValues() { // Write FLBA min,max values stats_[0] - .set_min( - std::string(reinterpret_cast(&vals[1][0]), FLBA_LENGTH)) - .set_max( - std::string(reinterpret_cast(&vals[8][0]), FLBA_LENGTH)); + .set_min(std::string(reinterpret_cast(&vals[1][0]), FLBA_LENGTH)) + .set_max(std::string(reinterpret_cast(&vals[8][0]), FLBA_LENGTH)); } TYPED_TEST_CASE(TestStatistics, TestTypes); diff --git a/src/parquet/parquet_version.h.in b/src/parquet/parquet_version.h.in index 7036d2fa..db8f396a 100644 --- a/src/parquet/parquet_version.h.in +++ b/src/parquet/parquet_version.h.in @@ -21,4 +21,4 @@ // define the parquet created by version #define CREATED_BY_VERSION "parquet-cpp version @PARQUET_VERSION@" -#endif // PARQUET_VERSION_H +#endif // PARQUET_VERSION_H diff --git a/src/parquet/schema.h b/src/parquet/schema.h index 9acd2891..c6b7fbec 100644 --- a/src/parquet/schema.h +++ b/src/parquet/schema.h @@ -332,8 +332,9 @@ class PARQUET_EXPORT ColumnDescriptor { LogicalType::type logical_type() const { return primitive_node_->logical_type(); } - SortOrder::type sort_order() const { return GetSortOrder(logical_type(), - physical_type()); } + SortOrder::type sort_order() const { + return GetSortOrder(logical_type(), physical_type()); + } const std::string& name() const { return primitive_node_->name(); } diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc index 4624614e..dad1a9bf 100644 --- a/src/parquet/statistics.cc +++ b/src/parquet/statistics.cc @@ -86,7 +86,8 @@ bool TypedRowGroupStatistics::HasMinMax() const { template void TypedRowGroupStatistics::SetComparator() { - comparator_ = std::static_pointer_cast >(Comparator::Make(descr_)); + comparator_ = + std::static_pointer_cast >(Comparator::Make(descr_)); } template diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc index e5f2f683..dc915bfe 100644 --- a/src/parquet/util/comparison-test.cc +++ b/src/parquet/util/comparison-test.cc @@ -85,8 +85,8 @@ TEST(Comparison, UnsignedByteArray) { s2ba = ByteArrayFromString(s2); ASSERT_TRUE(uless(s1ba, s2ba)); - s1 = u8"ünk123456"; // ü = 252 - s2 = u8"ănk123456"; // ă = 259 + s1 = u8"ünk123456"; // ü = 252 + s2 = u8"ănk123456"; // ă = 259 s1ba = ByteArrayFromString(s1); s2ba = ByteArrayFromString(s2); ASSERT_TRUE(uless(s1ba, s2ba)); From 712ea9073c4292e918d4d067d7115379553ac95a Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Sun, 10 Sep 2017 08:35:16 -0400 Subject: [PATCH 19/21] Move test to statistics-test --- src/parquet/CMakeLists.txt | 1 - src/parquet/parquet_reader_writer-test.cc | 316 ---------------------- src/parquet/statistics-test.cc | 281 +++++++++++++++++++ 3 files changed, 281 insertions(+), 317 deletions(-) delete mode 100644 src/parquet/parquet_reader_writer-test.cc diff --git a/src/parquet/CMakeLists.txt b/src/parquet/CMakeLists.txt index d10759bc..a2e283e2 100644 --- a/src/parquet/CMakeLists.txt +++ b/src/parquet/CMakeLists.txt @@ -52,7 +52,6 @@ ADD_PARQUET_TEST(column_writer-test) ADD_PARQUET_TEST(properties-test) ADD_PARQUET_TEST(statistics-test) ADD_PARQUET_TEST(encoding-test) -ADD_PARQUET_TEST(parquet_reader_writer-test) ADD_PARQUET_TEST(public-api-test) ADD_PARQUET_TEST(types-test) ADD_PARQUET_TEST(reader-test) diff --git a/src/parquet/parquet_reader_writer-test.cc b/src/parquet/parquet_reader_writer-test.cc deleted file mode 100644 index a2fde651..00000000 --- a/src/parquet/parquet_reader_writer-test.cc +++ /dev/null @@ -1,316 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include - -#include -#include - -#include - -namespace parquet { - -using parquet::Repetition; -using parquet::Type; -using parquet::LogicalType; -using parquet::schema::PrimitiveNode; -using parquet::schema::GroupNode; - -static const int FLBA_LENGTH = 12; -static const int NUM_VALUES = 10; - -namespace test { - -template -class TestStatistics : public ::testing::Test { - public: - typedef typename TestType::c_type T; - - void AddNodes(std::string name) { - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, - TestType::type_num, LogicalType::NONE)); - } - - void SetUpSchema() { - stats_.resize(fields_.size()); - values_.resize(NUM_VALUES); - schema_ = std::static_pointer_cast( - GroupNode::Make("Schema", Repetition::REQUIRED, fields_)); - - parquet_sink_ = std::make_shared(); - } - - void SetValues(); - - void WriteParquet() { - // Add writer properties - parquet::WriterProperties::Builder builder; - builder.compression(parquet::Compression::SNAPPY); - builder.created_by("parquet-cpp version 1.3.0"); - std::shared_ptr props = builder.build(); - - // Create a ParquetFileWriter instance - auto file_writer = parquet::ParquetFileWriter::Open(parquet_sink_, schema_, props); - - // Append a RowGroup with a specific number of rows. - auto rg_writer = file_writer->AppendRowGroup(NUM_VALUES); - - this->SetValues(); - - // Insert Values - for (int i = 0; i < static_cast(fields_.size()); i++) { - auto column_writer = - static_cast*>(rg_writer->NextColumn()); - column_writer->WriteBatch(NUM_VALUES, nullptr, nullptr, values_.data()); - } - } - - void VerifyParquetStats() { - auto pbuffer = parquet_sink_->GetBuffer(); - - // Create a ParquetReader instance - std::unique_ptr parquet_reader = - parquet::ParquetFileReader::Open( - std::make_shared(pbuffer)); - - // Get the File MetaData - std::shared_ptr file_metadata = parquet_reader->metadata(); - std::shared_ptr rg_metadata = file_metadata->RowGroup(0); - for (int i = 0; i < static_cast(fields_.size()); i++) { - std::shared_ptr cc_metadata = - rg_metadata->ColumnChunk(i); - ASSERT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); - ASSERT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); - } - } - - protected: - std::vector values_; - std::vector values_buf_; - std::vector fields_; - std::shared_ptr schema_; - std::shared_ptr parquet_sink_; - std::vector stats_; -}; - -using TestTypes = ::testing::Types; - -// TYPE::INT32 -template <> -void TestStatistics::AddNodes(std::string name) { - // UINT_32 logical type to set Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, - LogicalType::UINT_32)); - // INT_32 logical type to set Signed Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, - LogicalType::INT_32)); -} - -template <> -void TestStatistics::SetValues() { - for (int i = 0; i < NUM_VALUES; i++) { - values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; - } - - // Write UINT32 min/max values - stats_[0] - .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); - - // Write INT32 min/max values - stats_[1] - .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); -} - -// TYPE::INT64 -template <> -void TestStatistics::AddNodes(std::string name) { - // UINT_64 logical type to set Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, - LogicalType::UINT_64)); - // INT_64 logical type to set Signed Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, - LogicalType::INT_64)); -} - -template <> -void TestStatistics::SetValues() { - for (int i = 0; i < NUM_VALUES; i++) { - values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; - } - - // Write UINT64 min/max values - stats_[0] - .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); - - // Write INT64 min/max values - stats_[1] - .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); -} - -// TYPE::INT96 -template <> -void TestStatistics::AddNodes(std::string name) { - // INT96 physical type has only Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT96, - LogicalType::NONE)); -} - -template <> -void TestStatistics::SetValues() { - for (int i = 0; i < NUM_VALUES; i++) { - values_[i].value[0] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; - values_[i].value[1] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; - values_[i].value[2] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; - } - - // Write Int96 min/max values - stats_[0] - .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); -} - -// TYPE::FLOAT -template <> -void TestStatistics::SetValues() { - for (int i = 0; i < NUM_VALUES; i++) { - values_[i] = - (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; - } - - // Write Float min/max values - stats_[0] - .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); -} - -// TYPE::DOUBLE -template <> -void TestStatistics::SetValues() { - for (int i = 0; i < NUM_VALUES; i++) { - values_[i] = - (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; - } - - // Write Double min/max values - stats_[0] - .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) - .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); -} - -// TYPE::ByteArray -template <> -void TestStatistics::AddNodes(std::string name) { - // UTF8 logical type to set Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, - Type::BYTE_ARRAY, LogicalType::UTF8)); -} - -template <> -void TestStatistics::SetValues() { - int max_byte_array_len = 10; - size_t nbytes = NUM_VALUES * max_byte_array_len; - values_buf_.resize(nbytes); - std::vector vals = {u8"c123", u8"b123", u8"a123", u8"d123", u8"e123", - u8"f123", u8"g123", u8"h123", u8"i123", u8"ü123"}; - - uint8_t* base = &values_buf_.data()[0]; - for (int i = 0; i < NUM_VALUES; i++) { - memcpy(base, vals[i].c_str(), vals[i].length()); - values_[i].ptr = base; - values_[i].len = static_cast(vals[i].length()); - base += vals[i].length(); - } - - // Write String min/max values - stats_[0] - .set_min( - std::string(reinterpret_cast(vals[2].c_str()), vals[2].length())) - .set_max( - std::string(reinterpret_cast(vals[9].c_str()), vals[9].length())); -} - -// TYPE::FLBAArray -template <> -void TestStatistics::AddNodes(std::string name) { - // FLBA has only Unsigned Statistics - fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, - LogicalType::NONE, FLBA_LENGTH)); -} - -template <> -void TestStatistics::SetValues() { - size_t nbytes = NUM_VALUES * FLBA_LENGTH; - values_buf_.resize(nbytes); - char vals[NUM_VALUES][FLBA_LENGTH] = {"b12345", "a12345", "c12345", "d12345", "e12345", - "f12345", "g12345", "h12345", "z12345", "a12345"}; - - uint8_t* base = &values_buf_.data()[0]; - for (int i = 0; i < NUM_VALUES; i++) { - memcpy(base, &vals[i][0], FLBA_LENGTH); - values_[i].ptr = base; - base += FLBA_LENGTH; - } - - // Write FLBA min,max values - stats_[0] - .set_min(std::string(reinterpret_cast(&vals[1][0]), FLBA_LENGTH)) - .set_max(std::string(reinterpret_cast(&vals[8][0]), FLBA_LENGTH)); -} - -TYPED_TEST_CASE(TestStatistics, TestTypes); - -TYPED_TEST(TestStatistics, MinMax) { - this->AddNodes("Column "); - this->SetUpSchema(); - this->WriteParquet(); - this->VerifyParquetStats(); -} - -// Ensure UNKNOWN sort order is handled properly -using TestStatisticsFLBA = TestStatistics; - -TEST_F(TestStatisticsFLBA, UnknownSortOrder) { - this->fields_.push_back(schema::PrimitiveNode::Make( - "Column 0", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, - FLBA_LENGTH)); - this->SetUpSchema(); - this->WriteParquet(); - - auto pbuffer = parquet_sink_->GetBuffer(); - // Create a ParquetReader instance - std::unique_ptr parquet_reader = - parquet::ParquetFileReader::Open( - std::make_shared(pbuffer)); - // Get the File MetaData - std::shared_ptr file_metadata = parquet_reader->metadata(); - std::shared_ptr rg_metadata = file_metadata->RowGroup(0); - std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(0); - - // stats should not be set for UNKNOWN sort order - ASSERT_FALSE(cc_metadata->is_stats_set()); -} - -} // namespace test -} // namespace parquet diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc index 44cc1cce..0ef2e798 100644 --- a/src/parquet/statistics-test.cc +++ b/src/parquet/statistics-test.cc @@ -402,5 +402,286 @@ TEST(CorrectStatistics, Basics) { ASSERT_TRUE(column_chunk6->is_stats_set()); } +// Test SortOrder class +static const int NUM_VALUES = 10; + +template +class TestStatistics : public ::testing::Test { + public: + typedef typename TestType::c_type T; + + void AddNodes(std::string name) { + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, + TestType::type_num, LogicalType::NONE)); + } + + void SetUpSchema() { + stats_.resize(fields_.size()); + values_.resize(NUM_VALUES); + schema_ = std::static_pointer_cast( + GroupNode::Make("Schema", Repetition::REQUIRED, fields_)); + + parquet_sink_ = std::make_shared(); + } + + void SetValues(); + + void WriteParquet() { + // Add writer properties + parquet::WriterProperties::Builder builder; + builder.compression(parquet::Compression::SNAPPY); + builder.created_by("parquet-cpp version 1.3.0"); + std::shared_ptr props = builder.build(); + + // Create a ParquetFileWriter instance + auto file_writer = parquet::ParquetFileWriter::Open(parquet_sink_, schema_, props); + + // Append a RowGroup with a specific number of rows. + auto rg_writer = file_writer->AppendRowGroup(NUM_VALUES); + + this->SetValues(); + + // Insert Values + for (int i = 0; i < static_cast(fields_.size()); i++) { + auto column_writer = + static_cast*>(rg_writer->NextColumn()); + column_writer->WriteBatch(NUM_VALUES, nullptr, nullptr, values_.data()); + } + } + + void VerifyParquetStats() { + auto pbuffer = parquet_sink_->GetBuffer(); + + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::Open( + std::make_shared(pbuffer)); + + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + std::shared_ptr rg_metadata = file_metadata->RowGroup(0); + for (int i = 0; i < static_cast(fields_.size()); i++) { + std::shared_ptr cc_metadata = + rg_metadata->ColumnChunk(i); + ASSERT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); + ASSERT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); + } + } + + protected: + std::vector values_; + std::vector values_buf_; + std::vector fields_; + std::shared_ptr schema_; + std::shared_ptr parquet_sink_; + std::vector stats_; +}; + +using CompareTestTypes = ::testing::Types; + +// TYPE::INT32 +template <> +void TestStatistics::AddNodes(std::string name) { + // UINT_32 logical type to set Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, + LogicalType::UINT_32)); + // INT_32 logical type to set Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT32, + LogicalType::INT_32)); +} + +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + } + + // Write UINT32 min/max values + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); + + // Write INT32 min/max values + stats_[1] + .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); +} + +// TYPE::INT64 +template <> +void TestStatistics::AddNodes(std::string name) { + // UINT_64 logical type to set Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, + LogicalType::UINT_64)); + // INT_64 logical type to set Signed Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT64, + LogicalType::INT_64)); +} + +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + } + + // Write UINT64 min/max values + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); + + // Write INT64 min/max values + stats_[1] + .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); +} + +// TYPE::INT96 +template <> +void TestStatistics::AddNodes(std::string name) { + // INT96 physical type has only Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, Type::INT96, + LogicalType::NONE)); +} + +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i].value[0] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + values_[i].value[1] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + values_[i].value[2] = i - 5; // {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}; + } + + // Write Int96 min/max values + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[5]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[4]), sizeof(T))); +} + +// TYPE::FLOAT +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i] = + (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; + } + + // Write Float min/max values + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); +} + +// TYPE::DOUBLE +template <> +void TestStatistics::SetValues() { + for (int i = 0; i < NUM_VALUES; i++) { + values_[i] = + (i * 1.0f) - 5; // {-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; + } + + // Write Double min/max values + stats_[0] + .set_min(std::string(reinterpret_cast(&values_[0]), sizeof(T))) + .set_max(std::string(reinterpret_cast(&values_[9]), sizeof(T))); +} + +// TYPE::ByteArray +template <> +void TestStatistics::AddNodes(std::string name) { + // UTF8 logical type to set Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, + Type::BYTE_ARRAY, LogicalType::UTF8)); +} + +template <> +void TestStatistics::SetValues() { + int max_byte_array_len = 10; + size_t nbytes = NUM_VALUES * max_byte_array_len; + values_buf_.resize(nbytes); + std::vector vals = {u8"c123", u8"b123", u8"a123", u8"d123", u8"e123", + u8"f123", u8"g123", u8"h123", u8"i123", u8"ü123"}; + + uint8_t* base = &values_buf_.data()[0]; + for (int i = 0; i < NUM_VALUES; i++) { + memcpy(base, vals[i].c_str(), vals[i].length()); + values_[i].ptr = base; + values_[i].len = static_cast(vals[i].length()); + base += vals[i].length(); + } + + // Write String min/max values + stats_[0] + .set_min( + std::string(reinterpret_cast(vals[2].c_str()), vals[2].length())) + .set_max( + std::string(reinterpret_cast(vals[9].c_str()), vals[9].length())); +} + +// TYPE::FLBAArray +template <> +void TestStatistics::AddNodes(std::string name) { + // FLBA has only Unsigned Statistics + fields_.push_back(schema::PrimitiveNode::Make(name, Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, + LogicalType::NONE, FLBA_LENGTH)); +} + +template <> +void TestStatistics::SetValues() { + size_t nbytes = NUM_VALUES * FLBA_LENGTH; + values_buf_.resize(nbytes); + char vals[NUM_VALUES][FLBA_LENGTH] = {"b12345", "a12345", "c12345", "d12345", "e12345", + "f12345", "g12345", "h12345", "z12345", "a12345"}; + + uint8_t* base = &values_buf_.data()[0]; + for (int i = 0; i < NUM_VALUES; i++) { + memcpy(base, &vals[i][0], FLBA_LENGTH); + values_[i].ptr = base; + base += FLBA_LENGTH; + } + + // Write FLBA min,max values + stats_[0] + .set_min(std::string(reinterpret_cast(&vals[1][0]), FLBA_LENGTH)) + .set_max(std::string(reinterpret_cast(&vals[8][0]), FLBA_LENGTH)); +} + +TYPED_TEST_CASE(TestStatistics, CompareTestTypes); + +TYPED_TEST(TestStatistics, MinMax) { + this->AddNodes("Column "); + this->SetUpSchema(); + this->WriteParquet(); + this->VerifyParquetStats(); +} + +// Ensure UNKNOWN sort order is handled properly +using TestStatisticsFLBA = TestStatistics; + +TEST_F(TestStatisticsFLBA, UnknownSortOrder) { + this->fields_.push_back(schema::PrimitiveNode::Make( + "Column 0", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, + FLBA_LENGTH)); + this->SetUpSchema(); + this->WriteParquet(); + + auto pbuffer = parquet_sink_->GetBuffer(); + // Create a ParquetReader instance + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::Open( + std::make_shared(pbuffer)); + // Get the File MetaData + std::shared_ptr file_metadata = parquet_reader->metadata(); + std::shared_ptr rg_metadata = file_metadata->RowGroup(0); + std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(0); + + // stats should not be set for UNKNOWN sort order + ASSERT_FALSE(cc_metadata->is_stats_set()); +} + + + + } // namespace test } // namespace parquet From 48dd22d19b1c88e722613c0928ae58e395dc8b7a Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Sun, 10 Sep 2017 08:38:24 -0400 Subject: [PATCH 20/21] change fix version from 1.2.1 to 1.3.0 --- src/parquet/file/metadata.cc | 4 ++-- src/parquet/statistics-test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc index ef2c23c3..6e7fc3bc 100644 --- a/src/parquet/file/metadata.cc +++ b/src/parquet/file/metadata.cc @@ -36,7 +36,7 @@ const ApplicationVersion ApplicationVersion::PARQUET_251_FIXED_VERSION = const ApplicationVersion ApplicationVersion::PARQUET_816_FIXED_VERSION = ApplicationVersion("parquet-mr version 1.2.9"); const ApplicationVersion ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION = - ApplicationVersion("parquet-cpp version 1.2.1"); + ApplicationVersion("parquet-cpp version 1.3.0"); template static std::shared_ptr MakeTypedColumnStats( @@ -492,7 +492,7 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons // PARQUET-686 has more disussion on statistics bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, SortOrder::type sort_order) const { - // Parquet cpp version 1.2.1 onwards stats are computed correctly for all types + // Parquet cpp version 1.3.0 onwards stats are computed correctly for all types if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION))) { // Only SIGNED are valid if (SortOrder::SIGNED != sort_order) return false; diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc index 0ef2e798..d6c52057 100644 --- a/src/parquet/statistics-test.cc +++ b/src/parquet/statistics-test.cc @@ -358,7 +358,7 @@ TEST(CorruptStatistics, Basics) { // Statistics for all types have no restrictions in newer parquet version TEST(CorrectStatistics, Basics) { - ApplicationVersion version("parquet-cpp version 1.2.1"); + ApplicationVersion version("parquet-cpp version 1.3.0"); SchemaDescriptor schema; schema::NodePtr node; std::vector fields; From 5a93fe37595616f6a97766b779b0ae56efe04ff8 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Mon, 11 Sep 2017 09:08:36 -0400 Subject: [PATCH 21/21] fix error --- src/parquet/test-util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parquet/test-util.h b/src/parquet/test-util.h index 356486b2..3fd72f2a 100644 --- a/src/parquet/test-util.h +++ b/src/parquet/test-util.h @@ -42,7 +42,7 @@ using std::shared_ptr; namespace parquet { -static int FLBA_LENGTH = 12; +static constexpr int FLBA_LENGTH = 12; bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) { return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);