From 1cf9d9e93c95887724b1f7ccf04c9c90ae77eb65 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 1 Dec 2022 22:54:14 +0800 Subject: [PATCH 01/19] ARROW-18420: [C++][Parquet] Introduce ColumnIndex & OffsetIndex --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/page_index.cc | 266 +++++++++++++++++++++++++++++++++ cpp/src/parquet/page_index.h | 143 ++++++++++++++++++ 3 files changed, 410 insertions(+) create mode 100644 cpp/src/parquet/page_index.cc create mode 100644 cpp/src/parquet/page_index.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 17e6fcda721..4f31e9ef4d1 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -166,6 +166,7 @@ set(PARQUET_SRCS level_conversion.cc metadata.cc murmur3.cc + page_index.cc "${ARROW_SOURCE_DIR}/src/generated/parquet_constants.cpp" "${ARROW_SOURCE_DIR}/src/generated/parquet_types.cpp" platform.cc diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc new file mode 100644 index 00000000000..a3b57073064 --- /dev/null +++ b/cpp/src/parquet/page_index.cc @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/metadata.h" +#include "parquet/thrift_internal.h" + +#include + +namespace parquet { + +namespace { + +template +void PlainDecode(const ColumnDescriptor* descr, const std::string& src, + typename DType::c_type* dst) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr); + decoder->SetData(1, reinterpret_cast(src.c_str()), + static_cast(src.size())); + decoder->Decode(dst, 1); +} + +template <> +void PlainDecode(const ColumnDescriptor* descr, const std::string& src, + ByteArray* dst) { + dst->len = static_cast(src.size()); + dst->ptr = reinterpret_cast(src.c_str()); +} + +template +class TypedColumnIndexImpl : public TypedColumnIndex { + public: + using T = typename DType::c_type; + + explicit TypedColumnIndexImpl(const ColumnDescriptor* descr, + const std::vector& null_pages, + const std::vector& min_values, + const std::vector& max_values, + const BoundaryOrder& boundary_order, + const bool has_null_count = false, + const std::vector& null_counts = {}) + : descr_(descr), + null_pages_(null_pages), + encoded_min_values_(min_values), + encoded_max_values_(max_values), + boundary_order_(boundary_order), + has_null_count_(has_null_count), + null_counts_(null_counts) { + /// Decode min and max values into a compact form (i.e. w/o null page) + DecodeValues(encoded_min_values_, encoded_max_values_); + } + + explicit TypedColumnIndexImpl(const ColumnDescriptor* descr, + const format::ColumnIndex& column_index) + : TypedColumnIndexImpl( + descr, column_index.null_pages, column_index.min_values, + column_index.max_values, + static_cast(static_cast(column_index.boundary_order)), + column_index.__isset.null_counts, column_index.null_counts) {} + + int64_t num_pages() const override { return static_cast(null_pages_.size()); } + + bool null_page(int64_t page_id) const override { + if (page_id >= num_pages()) { + throw ParquetException("Page index is out of bound"); + } + return null_pages_[page_id]; + } + + BoundaryOrder boundary_order() const override { return boundary_order_; } + + bool HasNullCount() const override { return has_null_count_; } + + int64_t null_count(int64_t page_id) const override { + if (page_id >= num_pages()) { + throw ParquetException("Page index is out of bound"); + } + return null_counts_[page_id]; + } + + T min_value(int64_t page_id) const override { + return min_values_[GetMinMaxSlot(page_id)]; + } + + T max_value(int64_t page_id) const override { + return max_values_[GetMinMaxSlot(page_id)]; + } + + std::string GetEncodedMin(int64_t page_id) const override { + return encoded_min_values_[page_id]; + } + + std::string GetEncodedMax(int64_t page_id) const override { + return encoded_max_values_[page_id]; + } + + const std::vector& GetNullPages() const override { return null_pages_; } + + const std::vector& GetNullCounts() const override { return null_counts_; } + + const std::vector& GetMinValues() const override { return min_values_; } + + const std::vector& GetMaxValues() const override { return max_values_; } + + std::vector GetValidPageIndices() const override { + std::vector valid_page_indices; + std::for_each(page_indexes_.cbegin(), page_indexes_.cend(), + [&](const std::pair& v) { + valid_page_indices.push_back(v.first); + }); + return valid_page_indices; + } + + private: + size_t GetMinMaxSlot(int64_t page_id) const { + if (page_id >= static_cast(null_pages_.size())) { + throw ParquetException("page index is out of bound"); + } + if (null_pages_[page_id]) { + throw ParquetException("cannot get min/max value of null page"); + } + auto iter = page_indexes_.find(page_id); + if (iter == page_indexes_.cend()) { + throw ParquetException("min/max value is unavailable"); + } + return iter->second; + } + + void DecodeValues(const std::vector& min_values, + const std::vector& max_values) { + T value; + for (size_t i = 0; i < null_pages_.size(); ++i) { + if (!null_pages_[i]) { + // page index -> min/max slot index + page_indexes_.emplace(i, min_values_.size()); + + PlainDecode(descr_, min_values[i], &value); + min_values_.push_back(value); + PlainDecode(descr_, max_values[i], &value); + max_values_.push_back(value); + } + } + } + + const ColumnDescriptor* descr_; + std::vector null_pages_; + std::vector encoded_min_values_; + std::vector encoded_max_values_; + /// page_id -> slot_id in the buffer of min_values_ & max_values_ + std::map page_indexes_; + std::vector min_values_; + std::vector max_values_; + BoundaryOrder boundary_order_; + bool has_null_count_; + std::vector null_counts_; +}; + +class OffsetIndexImpl : public OffsetIndex { + public: + explicit OffsetIndexImpl(std::vector page_locations) + : page_locations_(std::move(page_locations)) {} + + explicit OffsetIndexImpl(const format::OffsetIndex& offset_index) { + for (const auto& page_location : offset_index.page_locations) { + page_locations_.emplace_back(); + auto& location = page_locations_.back(); + location.offset_ = page_location.offset; + location.compressed_page_size_ = page_location.compressed_page_size; + location.first_row_index_ = page_location.first_row_index; + } + } + + int64_t num_pages() const override { return page_locations_.size(); } + + int64_t offset(int64_t page_id) const override { + if (page_id >= num_pages()) { + throw ParquetException("Page index is out of bound"); + } + return page_locations_[page_id].offset_; + } + + int32_t compressed_page_size(int64_t page_id) const override { + if (page_id >= num_pages()) { + throw ParquetException("Page index is out of bound"); + } + return page_locations_[page_id].compressed_page_size_; + } + + int64_t first_row_index(int64_t page_id) const override { + if (page_id >= num_pages()) { + throw ParquetException("Page index is out of bound"); + } + return page_locations_[page_id].first_row_index_; + } + + const std::vector& GetPageLocations() const override { + return page_locations_; + } + + private: + std::vector page_locations_; +}; + +} // namespace + +// ---------------------------------------------------------------------- +// Public factory functions + +std::unique_ptr ColumnIndex::Make(const ColumnDescriptor* descr, + const void* serialized_index, + uint32_t* inout_index_len, + const ReaderProperties& properties) { + format::ColumnIndex column_index; + ThriftDeserializer deserializer(properties); + deserializer.DeserializeMessage(reinterpret_cast(serialized_index), + inout_index_len, &column_index); + switch (descr->physical_type()) { + case Type::BOOLEAN: + return std::make_unique>(descr, column_index); + case Type::INT32: + return std::make_unique>(descr, column_index); + case Type::INT64: + return std::make_unique>(descr, column_index); + case Type::INT96: + return std::make_unique>(descr, column_index); + case Type::FLOAT: + return std::make_unique>(descr, column_index); + case Type::DOUBLE: + return std::make_unique>(descr, column_index); + case Type::BYTE_ARRAY: + return std::make_unique>(descr, column_index); + case Type::FIXED_LEN_BYTE_ARRAY: + return std::make_unique>(descr, column_index); + default: + break; + } + DCHECK(false) << "Should not be able to reach this code"; + return nullptr; +} + +std::unique_ptr OffsetIndex::Make(const void* serialized_index, + uint32_t* inout_index_len, + const ReaderProperties& properties) { + format::OffsetIndex offset_index; + ThriftDeserializer deserializer(properties); + deserializer.DeserializeMessage(reinterpret_cast(serialized_index), + inout_index_len, &offset_index); + return std::make_unique(offset_index); +} + +} // namespace parquet diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h new file mode 100644 index 00000000000..162c617126e --- /dev/null +++ b/cpp/src/parquet/page_index.h @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "parquet/platform.h" +#include "parquet/statistics.h" +#include "parquet/types.h" + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { UNORDERED = 0, ASCENDING = 1, DESCENDING = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr Make(const ColumnDescriptor* descr, + const void* serialized_index, + uint32_t* inout_index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns number of pages in this column index. + virtual int64_t num_pages() const = 0; + + /// \brief Returns if all values are null in a single page. + virtual bool null_page(int64_t page_id) const = 0; + + /// \brief Returns whether both min_values and max_values are + /// orderd and if so, in which direction. + virtual BoundaryOrder boundary_order() const = 0; + + /// \brief Returns if null count is available. + virtual bool HasNullCount() const = 0; + + /// \brief Returns null count for a single page. + virtual int64_t null_count(int64_t page_id) const = 0; + + /// \brief Returns all null indicator for each page in batch. + virtual const std::vector& GetNullPages() const = 0; + + /// \brief Returns null count for each page in batch. + virtual const std::vector& GetNullCounts() const = 0; + + /// \brief The minimum value of a single page. Throws if it is null page. + virtual std::string GetEncodedMin(int64_t page_id) const = 0; + + /// \brief The maximum value of a single page. Throws if it is null page. + virtual std::string GetEncodedMax(int64_t page_id) const = 0; +}; + +/// \brief Typed implementation of ColumnIndex. +template +class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { + public: + using T = typename DType::c_type; + + /// \brief The minimum value of a single page. Throws if it is null page. + virtual T min_value(int64_t page_id) const = 0; + + /// \brief The maximum value of a single page. Throws if it is null page. + virtual T max_value(int64_t page_id) const = 0; + + /// \brief The minimum value of every valid page. + virtual const std::vector& GetMinValues() const = 0; + + /// \brief The maximum value of every valid page. + virtual const std::vector& GetMaxValues() const = 0; + + /// \brief Returns list of page index of all valid pages. + /// It can be used to understand values returned from min_values/max_values. + virtual std::vector GetValidPageIndices() const = 0; +}; + +using BoolColumnIndex = TypedColumnIndex; +using Int32ColumnIndex = TypedColumnIndex; +using Int64ColumnIndex = TypedColumnIndex; +using FloatColumnIndex = TypedColumnIndex; +using DoubleColumnIndex = TypedColumnIndex; +using ByteArrayColumnIndex = TypedColumnIndex; +using FLBAColumnIndex = TypedColumnIndex; + +/// \brief PageLocation is a proxy around format::PageLocation. +struct PARQUET_EXPORT PageLocation { + /// File offset of the data page. + int64_t offset_; + /// Total compressed size of the data page and header. + int32_t compressed_page_size_; + // row id of the first row in the page within the row group. + int64_t first_row_index_; +}; + +/// \brief OffsetIndex is a proxy around format::OffsetIndex. +class PARQUET_EXPORT OffsetIndex { + public: + /// \brief Create a OffsetIndex from a serialized thrift message. + static std::unique_ptr Make(const void* serialized_index, + uint32_t* inout_index_len, + const ReaderProperties& properties); + + virtual ~OffsetIndex() = default; + + /// \brief Returns number of pages in this column index. + virtual int64_t num_pages() const = 0; + + /// \brief Returns offset for a single page. + virtual int64_t offset(int64_t page_id) const = 0; + + /// \brief Returns total compressed size for a single page. + virtual int32_t compressed_page_size(int64_t page_id) const = 0; + + /// \brief Returns row id of the first row for a single page. + virtual int64_t first_row_index(int64_t page_id) const = 0; + + /// \brief Returns all page locations in the offset index. + virtual const std::vector& GetPageLocations() const = 0; +}; + +} // namespace parquet From 655a78663226a5d9364400cc59530c24eb5e5a18 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 2 Dec 2022 11:00:18 +0800 Subject: [PATCH 02/19] renaming functions and refine some impls --- cpp/src/parquet/page_index.cc | 124 +++++++++++++++------------------- cpp/src/parquet/page_index.h | 48 ++++++------- 2 files changed, 77 insertions(+), 95 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index a3b57073064..8cba40358e4 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -27,17 +27,16 @@ namespace parquet { namespace { template -void PlainDecode(const ColumnDescriptor* descr, const std::string& src, - typename DType::c_type* dst) { - auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr); - decoder->SetData(1, reinterpret_cast(src.c_str()), +void Decode(std::unique_ptr::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast(src.c_str()), static_cast(src.size())); - decoder->Decode(dst, 1); + decoder->Decode(dst, /*max_values=*/1); } template <> -void PlainDecode(const ColumnDescriptor* descr, const std::string& src, - ByteArray* dst) { +void Decode(std::unique_ptr&, const std::string& src, + ByteArray* dst) { dst->len = static_cast(src.size()); dst->ptr = reinterpret_cast(src.c_str()); } @@ -47,25 +46,35 @@ class TypedColumnIndexImpl : public TypedColumnIndex { public: using T = typename DType::c_type; - explicit TypedColumnIndexImpl(const ColumnDescriptor* descr, + explicit TypedColumnIndexImpl(const ColumnDescriptor& descr, const std::vector& null_pages, const std::vector& min_values, const std::vector& max_values, const BoundaryOrder& boundary_order, const bool has_null_count = false, const std::vector& null_counts = {}) - : descr_(descr), - null_pages_(null_pages), + : null_pages_(null_pages), encoded_min_values_(min_values), encoded_max_values_(max_values), boundary_order_(boundary_order), has_null_count_(has_null_count), null_counts_(null_counts) { - /// Decode min and max values into a compact form (i.e. w/o null page) - DecodeValues(encoded_min_values_, encoded_max_values_); + // Decode min and max values into a compact form (i.e. w/o null page) + auto plain_decoder = MakeTypedDecoder(Encoding::PLAIN, &descr); + T value; + for (size_t i = 0; i < null_pages_.size(); ++i) { + if (!null_pages_[i]) { + // page index -> min/max slot index + page_indexes_.emplace(i, min_values_.size()); + Decode(plain_decoder, encoded_min_values_[i], &value); + min_values_.push_back(value); + Decode(plain_decoder, encoded_max_values_[i], &value); + max_values_.push_back(value); + } + } } - explicit TypedColumnIndexImpl(const ColumnDescriptor* descr, + explicit TypedColumnIndexImpl(const ColumnDescriptor& descr, const format::ColumnIndex& column_index) : TypedColumnIndexImpl( descr, column_index.null_pages, column_index.min_values, @@ -75,7 +84,7 @@ class TypedColumnIndexImpl : public TypedColumnIndex { int64_t num_pages() const override { return static_cast(null_pages_.size()); } - bool null_page(int64_t page_id) const override { + bool is_null_page(int64_t page_id) const override { if (page_id >= num_pages()) { throw ParquetException("Page index is out of bound"); } @@ -84,7 +93,7 @@ class TypedColumnIndexImpl : public TypedColumnIndex { BoundaryOrder boundary_order() const override { return boundary_order_; } - bool HasNullCount() const override { return has_null_count_; } + bool has_null_counts() const override { return has_null_count_; } int64_t null_count(int64_t page_id) const override { if (page_id >= num_pages()) { @@ -101,21 +110,27 @@ class TypedColumnIndexImpl : public TypedColumnIndex { return max_values_[GetMinMaxSlot(page_id)]; } - std::string GetEncodedMin(int64_t page_id) const override { + const std::string& encoded_min(int64_t page_id) const override { + if (page_id >= num_pages()) { + throw ParquetException("Page index is out of bound"); + } return encoded_min_values_[page_id]; } - std::string GetEncodedMax(int64_t page_id) const override { + const std::string& encoded_max(int64_t page_id) const override { + if (page_id >= num_pages()) { + throw ParquetException("Page index is out of bound"); + } return encoded_max_values_[page_id]; } - const std::vector& GetNullPages() const override { return null_pages_; } + const std::vector& null_pages() const override { return null_pages_; } - const std::vector& GetNullCounts() const override { return null_counts_; } + const std::vector& null_counts() const override { return null_counts_; } - const std::vector& GetMinValues() const override { return min_values_; } + const std::vector& min_values() const override { return min_values_; } - const std::vector& GetMaxValues() const override { return max_values_; } + const std::vector& max_values() const override { return max_values_; } std::vector GetValidPageIndices() const override { std::vector valid_page_indices; @@ -129,10 +144,10 @@ class TypedColumnIndexImpl : public TypedColumnIndex { private: size_t GetMinMaxSlot(int64_t page_id) const { if (page_id >= static_cast(null_pages_.size())) { - throw ParquetException("page index is out of bound"); + throw ParquetException("Page index is out of bound"); } if (null_pages_[page_id]) { - throw ParquetException("cannot get min/max value of null page"); + throw ParquetException("Cannot get min/max value of null page"); } auto iter = page_indexes_.find(page_id); if (iter == page_indexes_.cend()) { @@ -141,33 +156,20 @@ class TypedColumnIndexImpl : public TypedColumnIndex { return iter->second; } - void DecodeValues(const std::vector& min_values, - const std::vector& max_values) { - T value; - for (size_t i = 0; i < null_pages_.size(); ++i) { - if (!null_pages_[i]) { - // page index -> min/max slot index - page_indexes_.emplace(i, min_values_.size()); - - PlainDecode(descr_, min_values[i], &value); - min_values_.push_back(value); - PlainDecode(descr_, max_values[i], &value); - max_values_.push_back(value); - } - } - } - - const ColumnDescriptor* descr_; + /// Values that are copied directly from the thrift message. std::vector null_pages_; std::vector encoded_min_values_; std::vector encoded_max_values_; + BoundaryOrder boundary_order_; + bool has_null_count_; + std::vector null_counts_; + /// page_id -> slot_id in the buffer of min_values_ & max_values_ std::map page_indexes_; + + /// Decoded typed min/max values. std::vector min_values_; std::vector max_values_; - BoundaryOrder boundary_order_; - bool has_null_count_; - std::vector null_counts_; }; class OffsetIndexImpl : public OffsetIndex { @@ -179,33 +181,19 @@ class OffsetIndexImpl : public OffsetIndex { for (const auto& page_location : offset_index.page_locations) { page_locations_.emplace_back(); auto& location = page_locations_.back(); - location.offset_ = page_location.offset; - location.compressed_page_size_ = page_location.compressed_page_size; - location.first_row_index_ = page_location.first_row_index; + location.offset = page_location.offset; + location.compressed_page_size = page_location.compressed_page_size; + location.first_row_index = page_location.first_row_index; } } int64_t num_pages() const override { return page_locations_.size(); } - int64_t offset(int64_t page_id) const override { - if (page_id >= num_pages()) { - throw ParquetException("Page index is out of bound"); - } - return page_locations_[page_id].offset_; - } - - int32_t compressed_page_size(int64_t page_id) const override { - if (page_id >= num_pages()) { - throw ParquetException("Page index is out of bound"); - } - return page_locations_[page_id].compressed_page_size_; - } - - int64_t first_row_index(int64_t page_id) const override { + const PageLocation& GetPageLocation(int64_t page_id) const override { if (page_id >= num_pages()) { throw ParquetException("Page index is out of bound"); } - return page_locations_[page_id].first_row_index_; + return page_locations_[page_id]; } const std::vector& GetPageLocations() const override { @@ -221,15 +209,15 @@ class OffsetIndexImpl : public OffsetIndex { // ---------------------------------------------------------------------- // Public factory functions -std::unique_ptr ColumnIndex::Make(const ColumnDescriptor* descr, +std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, const void* serialized_index, - uint32_t* inout_index_len, + uint32_t index_len, const ReaderProperties& properties) { format::ColumnIndex column_index; ThriftDeserializer deserializer(properties); deserializer.DeserializeMessage(reinterpret_cast(serialized_index), - inout_index_len, &column_index); - switch (descr->physical_type()) { + &index_len, &column_index); + switch (descr.physical_type()) { case Type::BOOLEAN: return std::make_unique>(descr, column_index); case Type::INT32: @@ -254,12 +242,12 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor* descr, } std::unique_ptr OffsetIndex::Make(const void* serialized_index, - uint32_t* inout_index_len, + uint32_t index_len, const ReaderProperties& properties) { format::OffsetIndex offset_index; ThriftDeserializer deserializer(properties); deserializer.DeserializeMessage(reinterpret_cast(serialized_index), - inout_index_len, &offset_index); + &index_len, &offset_index); return std::make_unique(offset_index); } diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index 162c617126e..d1c89c4c028 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -31,15 +31,15 @@ namespace parquet { /// \brief BoundaryOrder is a proxy around format::BoundaryOrder. -enum class PARQUET_EXPORT BoundaryOrder { UNORDERED = 0, ASCENDING = 1, DESCENDING = 2 }; +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; /// \brief ColumnIndex is a proxy around format::ColumnIndex. class PARQUET_EXPORT ColumnIndex { public: /// \brief Create a ColumnIndex from a serialized thrift message. - static std::unique_ptr Make(const ColumnDescriptor* descr, + static std::unique_ptr Make(const ColumnDescriptor& descr, const void* serialized_index, - uint32_t* inout_index_len, + uint32_t index_len, const ReaderProperties& properties); virtual ~ColumnIndex() = default; @@ -48,29 +48,29 @@ class PARQUET_EXPORT ColumnIndex { virtual int64_t num_pages() const = 0; /// \brief Returns if all values are null in a single page. - virtual bool null_page(int64_t page_id) const = 0; + virtual bool is_null_page(int64_t page_id) const = 0; /// \brief Returns whether both min_values and max_values are /// orderd and if so, in which direction. virtual BoundaryOrder boundary_order() const = 0; /// \brief Returns if null count is available. - virtual bool HasNullCount() const = 0; + virtual bool has_null_counts() const = 0; /// \brief Returns null count for a single page. virtual int64_t null_count(int64_t page_id) const = 0; - /// \brief Returns all null indicator for each page in batch. - virtual const std::vector& GetNullPages() const = 0; - - /// \brief Returns null count for each page in batch. - virtual const std::vector& GetNullCounts() const = 0; - /// \brief The minimum value of a single page. Throws if it is null page. - virtual std::string GetEncodedMin(int64_t page_id) const = 0; + virtual const std::string& encoded_min(int64_t page_id) const = 0; /// \brief The maximum value of a single page. Throws if it is null page. - virtual std::string GetEncodedMax(int64_t page_id) const = 0; + virtual const std::string& encoded_max(int64_t page_id) const = 0; + + /// \brief Returns all null indicator for each page in batch. + virtual const std::vector& null_pages() const = 0; + + /// \brief Returns null count for each page in batch. + virtual const std::vector& null_counts() const = 0; }; /// \brief Typed implementation of ColumnIndex. @@ -86,10 +86,10 @@ class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { virtual T max_value(int64_t page_id) const = 0; /// \brief The minimum value of every valid page. - virtual const std::vector& GetMinValues() const = 0; + virtual const std::vector& min_values() const = 0; /// \brief The maximum value of every valid page. - virtual const std::vector& GetMaxValues() const = 0; + virtual const std::vector& max_values() const = 0; /// \brief Returns list of page index of all valid pages. /// It can be used to understand values returned from min_values/max_values. @@ -107,11 +107,11 @@ using FLBAColumnIndex = TypedColumnIndex; /// \brief PageLocation is a proxy around format::PageLocation. struct PARQUET_EXPORT PageLocation { /// File offset of the data page. - int64_t offset_; + int64_t offset; /// Total compressed size of the data page and header. - int32_t compressed_page_size_; + int32_t compressed_page_size; // row id of the first row in the page within the row group. - int64_t first_row_index_; + int64_t first_row_index; }; /// \brief OffsetIndex is a proxy around format::OffsetIndex. @@ -119,7 +119,7 @@ class PARQUET_EXPORT OffsetIndex { public: /// \brief Create a OffsetIndex from a serialized thrift message. static std::unique_ptr Make(const void* serialized_index, - uint32_t* inout_index_len, + uint32_t index_len, const ReaderProperties& properties); virtual ~OffsetIndex() = default; @@ -127,14 +127,8 @@ class PARQUET_EXPORT OffsetIndex { /// \brief Returns number of pages in this column index. virtual int64_t num_pages() const = 0; - /// \brief Returns offset for a single page. - virtual int64_t offset(int64_t page_id) const = 0; - - /// \brief Returns total compressed size for a single page. - virtual int32_t compressed_page_size(int64_t page_id) const = 0; - - /// \brief Returns row id of the first row for a single page. - virtual int64_t first_row_index(int64_t page_id) const = 0; + /// \brief Returns PageLocation of a single page. + virtual const PageLocation& GetPageLocation(int64_t page_id) const = 0; /// \brief Returns all page locations in the offset index. virtual const std::vector& GetPageLocations() const = 0; From 5175b45b9a8b75e44cec47d5d23af0135ea04887 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 2 Dec 2022 23:31:15 +0800 Subject: [PATCH 03/19] remove per page accessors --- cpp/src/parquet/page_index.cc | 156 ++++++++-------------------------- cpp/src/parquet/page_index.h | 67 +++++---------- 2 files changed, 58 insertions(+), 165 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 8cba40358e4..067076c2981 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -#include "parquet/page_index.h" #include "parquet/encoding.h" -#include "parquet/metadata.h" +#include "parquet/page_index.h" +#include "parquet/statistics.h" #include "parquet/thrift_internal.h" #include @@ -46,154 +46,70 @@ class TypedColumnIndexImpl : public TypedColumnIndex { public: using T = typename DType::c_type; - explicit TypedColumnIndexImpl(const ColumnDescriptor& descr, - const std::vector& null_pages, - const std::vector& min_values, - const std::vector& max_values, - const BoundaryOrder& boundary_order, - const bool has_null_count = false, - const std::vector& null_counts = {}) - : null_pages_(null_pages), - encoded_min_values_(min_values), - encoded_max_values_(max_values), - boundary_order_(boundary_order), - has_null_count_(has_null_count), - null_counts_(null_counts) { + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + min_values_.reserve(column_index_.null_pages.size()); + max_values_.reserve(column_index_.null_pages.size()); // Decode min and max values into a compact form (i.e. w/o null page) auto plain_decoder = MakeTypedDecoder(Encoding::PLAIN, &descr); T value; - for (size_t i = 0; i < null_pages_.size(); ++i) { - if (!null_pages_[i]) { - // page index -> min/max slot index - page_indexes_.emplace(i, min_values_.size()); - Decode(plain_decoder, encoded_min_values_[i], &value); + for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { + if (column_index_.null_pages[i]) { + min_values_.push_back(std::nullopt); + max_values_.push_back(std::nullopt); + } else { + Decode(plain_decoder, column_index_.min_values[i], &value); min_values_.push_back(value); - Decode(plain_decoder, encoded_max_values_[i], &value); + Decode(plain_decoder, column_index_.max_values[i], &value); max_values_.push_back(value); } } } - explicit TypedColumnIndexImpl(const ColumnDescriptor& descr, - const format::ColumnIndex& column_index) - : TypedColumnIndexImpl( - descr, column_index.null_pages, column_index.min_values, - column_index.max_values, - static_cast(static_cast(column_index.boundary_order)), - column_index.__isset.null_counts, column_index.null_counts) {} - - int64_t num_pages() const override { return static_cast(null_pages_.size()); } - - bool is_null_page(int64_t page_id) const override { - if (page_id >= num_pages()) { - throw ParquetException("Page index is out of bound"); - } - return null_pages_[page_id]; + const std::vector& null_pages() const override { + return column_index_.null_pages; } - BoundaryOrder boundary_order() const override { return boundary_order_; } - - bool has_null_counts() const override { return has_null_count_; } - - int64_t null_count(int64_t page_id) const override { - if (page_id >= num_pages()) { - throw ParquetException("Page index is out of bound"); - } - return null_counts_[page_id]; + const std::vector& encoded_min_values() const override { + return column_index_.min_values; } - T min_value(int64_t page_id) const override { - return min_values_[GetMinMaxSlot(page_id)]; + const std::vector& encoded_max_values() const override { + return column_index_.max_values; } - T max_value(int64_t page_id) const override { - return max_values_[GetMinMaxSlot(page_id)]; + BoundaryOrder boundary_order() const override { + return static_cast(static_cast(column_index_.boundary_order)); } - const std::string& encoded_min(int64_t page_id) const override { - if (page_id >= num_pages()) { - throw ParquetException("Page index is out of bound"); - } - return encoded_min_values_[page_id]; - } + bool has_null_counts() const override { return column_index_.__isset.null_counts; } - const std::string& encoded_max(int64_t page_id) const override { - if (page_id >= num_pages()) { - throw ParquetException("Page index is out of bound"); - } - return encoded_max_values_[page_id]; + const std::vector& null_counts() const override { + return column_index_.null_counts; } - const std::vector& null_pages() const override { return null_pages_; } + const std::vector>& min_values() const override { return min_values_; } - const std::vector& null_counts() const override { return null_counts_; } - - const std::vector& min_values() const override { return min_values_; } - - const std::vector& max_values() const override { return max_values_; } - - std::vector GetValidPageIndices() const override { - std::vector valid_page_indices; - std::for_each(page_indexes_.cbegin(), page_indexes_.cend(), - [&](const std::pair& v) { - valid_page_indices.push_back(v.first); - }); - return valid_page_indices; - } + const std::vector>& max_values() const override { return max_values_; } private: - size_t GetMinMaxSlot(int64_t page_id) const { - if (page_id >= static_cast(null_pages_.size())) { - throw ParquetException("Page index is out of bound"); - } - if (null_pages_[page_id]) { - throw ParquetException("Cannot get min/max value of null page"); - } - auto iter = page_indexes_.find(page_id); - if (iter == page_indexes_.cend()) { - throw ParquetException("min/max value is unavailable"); - } - return iter->second; - } - - /// Values that are copied directly from the thrift message. - std::vector null_pages_; - std::vector encoded_min_values_; - std::vector encoded_max_values_; - BoundaryOrder boundary_order_; - bool has_null_count_; - std::vector null_counts_; - - /// page_id -> slot_id in the buffer of min_values_ & max_values_ - std::map page_indexes_; - - /// Decoded typed min/max values. - std::vector min_values_; - std::vector max_values_; + /// Wrapped thrift column index. + const format::ColumnIndex column_index_; + /// Decoded typed min/max values. Null pages are set to std::nullopt. + std::vector> min_values_; + std::vector> max_values_; }; class OffsetIndexImpl : public OffsetIndex { public: - explicit OffsetIndexImpl(std::vector page_locations) - : page_locations_(std::move(page_locations)) {} - explicit OffsetIndexImpl(const format::OffsetIndex& offset_index) { + page_locations_.reserve(offset_index.page_locations.size()); for (const auto& page_location : offset_index.page_locations) { - page_locations_.emplace_back(); - auto& location = page_locations_.back(); - location.offset = page_location.offset; - location.compressed_page_size = page_location.compressed_page_size; - location.first_row_index = page_location.first_row_index; - } - } - - int64_t num_pages() const override { return page_locations_.size(); } - - const PageLocation& GetPageLocation(int64_t page_id) const override { - if (page_id >= num_pages()) { - throw ParquetException("Page index is out of bound"); + page_locations_.emplace_back(PageLocation{page_location.offset, + page_location.compressed_page_size, + page_location.first_row_index}); } - return page_locations_[page_id]; } const std::vector& GetPageLocations() const override { diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index d1c89c4c028..dc0eded6e7f 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -17,16 +17,13 @@ #pragma once -#include -#include -#include +#include #include -#include #include +#include "parquet/exception.h" #include "parquet/platform.h" -#include "parquet/statistics.h" -#include "parquet/types.h" +#include "parquet/schema.h" namespace parquet { @@ -44,32 +41,28 @@ class PARQUET_EXPORT ColumnIndex { virtual ~ColumnIndex() = default; - /// \brief Returns number of pages in this column index. - virtual int64_t num_pages() const = 0; + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector& encoded_min_values() const = 0; - /// \brief Returns if all values are null in a single page. - virtual bool is_null_page(int64_t page_id) const = 0; + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector& encoded_max_values() const = 0; - /// \brief Returns whether both min_values and max_values are - /// orderd and if so, in which direction. + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. virtual BoundaryOrder boundary_order() const = 0; /// \brief Returns if null count is available. virtual bool has_null_counts() const = 0; - /// \brief Returns null count for a single page. - virtual int64_t null_count(int64_t page_id) const = 0; - - /// \brief The minimum value of a single page. Throws if it is null page. - virtual const std::string& encoded_min(int64_t page_id) const = 0; - - /// \brief The maximum value of a single page. Throws if it is null page. - virtual const std::string& encoded_max(int64_t page_id) const = 0; - - /// \brief Returns all null indicator for each page in batch. - virtual const std::vector& null_pages() const = 0; - - /// \brief Returns null count for each page in batch. + /// \brief Returns A list containing the number of null values for each page. virtual const std::vector& null_counts() const = 0; }; @@ -79,21 +72,11 @@ class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { public: using T = typename DType::c_type; - /// \brief The minimum value of a single page. Throws if it is null page. - virtual T min_value(int64_t page_id) const = 0; - - /// \brief The maximum value of a single page. Throws if it is null page. - virtual T max_value(int64_t page_id) const = 0; - - /// \brief The minimum value of every valid page. - virtual const std::vector& min_values() const = 0; + /// \brief Returns a list of lower bound for the values of every page. + virtual const std::vector>& min_values() const = 0; - /// \brief The maximum value of every valid page. - virtual const std::vector& max_values() const = 0; - - /// \brief Returns list of page index of all valid pages. - /// It can be used to understand values returned from min_values/max_values. - virtual std::vector GetValidPageIndices() const = 0; + /// \brief Returns a list of upper bound for the values of every page. + virtual const std::vector>& max_values() const = 0; }; using BoolColumnIndex = TypedColumnIndex; @@ -124,12 +107,6 @@ class PARQUET_EXPORT OffsetIndex { virtual ~OffsetIndex() = default; - /// \brief Returns number of pages in this column index. - virtual int64_t num_pages() const = 0; - - /// \brief Returns PageLocation of a single page. - virtual const PageLocation& GetPageLocation(int64_t page_id) const = 0; - /// \brief Returns all page locations in the offset index. virtual const std::vector& GetPageLocations() const = 0; }; From 80db27268e887045ce740fb8eaba79d20649af5d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 2 Dec 2022 23:45:17 +0800 Subject: [PATCH 04/19] fix lint --- cpp/src/parquet/page_index.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 067076c2981..8b9944b2578 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include "parquet/encoding.h" #include "parquet/page_index.h" +#include "parquet/encoding.h" #include "parquet/statistics.h" #include "parquet/thrift_internal.h" From 776a6021758ff0c764d33c33b613907ba8825175 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 3 Dec 2022 09:03:45 +0800 Subject: [PATCH 05/19] add back GetNonNullPageIndices --- cpp/src/parquet/page_index.cc | 24 ++++++++++++++---------- cpp/src/parquet/page_index.h | 21 +++++++++++++-------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 8b9944b2578..7c1351a6a96 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -55,14 +55,12 @@ class TypedColumnIndexImpl : public TypedColumnIndex { auto plain_decoder = MakeTypedDecoder(Encoding::PLAIN, &descr); T value; for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { - if (column_index_.null_pages[i]) { - min_values_.push_back(std::nullopt); - max_values_.push_back(std::nullopt); - } else { + if (!column_index_.null_pages[i]) { + non_null_page_indices_.emplace_back(static_cast(i)); Decode(plain_decoder, column_index_.min_values[i], &value); - min_values_.push_back(value); + min_values_.emplace_back(value); Decode(plain_decoder, column_index_.max_values[i], &value); - max_values_.push_back(value); + max_values_.emplace_back(value); } } } @@ -89,16 +87,22 @@ class TypedColumnIndexImpl : public TypedColumnIndex { return column_index_.null_counts; } - const std::vector>& min_values() const override { return min_values_; } + const std::vector& min_values() const override { return min_values_; } - const std::vector>& max_values() const override { return max_values_; } + const std::vector& max_values() const override { return max_values_; } + + const std::vector GetNonNullPageIndices() const override { + return non_null_page_indices_; + } private: /// Wrapped thrift column index. const format::ColumnIndex column_index_; /// Decoded typed min/max values. Null pages are set to std::nullopt. - std::vector> min_values_; - std::vector> max_values_; + std::vector min_values_; + std::vector max_values_; + /// A list of page indices for not-null pages. + std::vector non_null_page_indices_; }; class OffsetIndexImpl : public OffsetIndex { diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index dc0eded6e7f..0b45afe2752 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -17,14 +17,12 @@ #pragma once -#include -#include -#include - #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" +#include + namespace parquet { /// \brief BoundaryOrder is a proxy around format::BoundaryOrder. @@ -72,11 +70,18 @@ class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { public: using T = typename DType::c_type; - /// \brief Returns a list of lower bound for the values of every page. - virtual const std::vector>& min_values() const = 0; + /// \brief Returns a list of lower bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector& min_values() const = 0; + + /// \brief Returns a list of upper bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector& max_values() const = 0; - /// \brief Returns a list of upper bound for the values of every page. - virtual const std::vector>& max_values() const = 0; + /// \brief Returns a list of page indices for not-null pages. It is helpful to + /// understand the original page id in the values returned from min_values() + /// and max_values() above. + virtual const std::vector GetNonNullPageIndices() const = 0; }; using BoolColumnIndex = TypedColumnIndex; From 275c331bc5e74474aa1a1a81f16e8d3f0dd04c72 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 6 Dec 2022 14:55:13 +0800 Subject: [PATCH 06/19] refine comments and use LoadEnumSafe --- cpp/src/parquet/page_index.cc | 54 ++++++++++++++--------- cpp/src/parquet/page_index.h | 71 ++++++++++++++++++------------- cpp/src/parquet/thrift_internal.h | 9 ++++ cpp/src/parquet/types.h | 11 +++++ 4 files changed, 95 insertions(+), 50 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 7c1351a6a96..1560e87abd1 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -17,10 +17,16 @@ #include "parquet/page_index.h" #include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/schema.h" #include "parquet/statistics.h" #include "parquet/thrift_internal.h" +#include "arrow/util/unreachable.h" + +#include #include +#include namespace parquet { @@ -37,8 +43,9 @@ void Decode(std::unique_ptr::Decoder>& decoder, template <> void Decode(std::unique_ptr&, const std::string& src, ByteArray* dst) { + DCHECK_LE(src.size(), std::numeric_limits::max()); dst->len = static_cast(src.size()); - dst->ptr = reinterpret_cast(src.c_str()); + dst->ptr = reinterpret_cast(src.data()); } template @@ -49,18 +56,25 @@ class TypedColumnIndexImpl : public TypedColumnIndex { TypedColumnIndexImpl(const ColumnDescriptor& descr, const format::ColumnIndex& column_index) : column_index_(column_index) { - min_values_.reserve(column_index_.null_pages.size()); - max_values_.reserve(column_index_.null_pages.size()); - // Decode min and max values into a compact form (i.e. w/o null page) - auto plain_decoder = MakeTypedDecoder(Encoding::PLAIN, &descr); - T value; - for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { - if (!column_index_.null_pages[i]) { - non_null_page_indices_.emplace_back(static_cast(i)); - Decode(plain_decoder, column_index_.min_values[i], &value); - min_values_.emplace_back(value); - Decode(plain_decoder, column_index_.max_values[i], &value); - max_values_.emplace_back(value); + size_t num_non_null_pages = std::accumulate( + column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0U, + [](size_t num_non_null_pages, bool null_page) { + return num_non_null_pages + (null_page ? 0U : 1U); + }); + if (num_non_null_pages != 0U) { + min_values_.reserve(num_non_null_pages); + max_values_.reserve(num_non_null_pages); + // Decode min and max values into a compact form (i.e. w/o null page) + auto plain_decoder = MakeTypedDecoder(Encoding::PLAIN, &descr); + T value; + for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { + if (!column_index_.null_pages[i]) { + non_null_page_indices_.emplace_back(static_cast(i)); + Decode(plain_decoder, column_index_.min_values[i], &value); + min_values_.emplace_back(value); + Decode(plain_decoder, column_index_.max_values[i], &value); + max_values_.emplace_back(value); + } } } } @@ -77,8 +91,8 @@ class TypedColumnIndexImpl : public TypedColumnIndex { return column_index_.max_values; } - BoundaryOrder boundary_order() const override { - return static_cast(static_cast(column_index_.boundary_order)); + BoundaryOrder::type boundary_order() const override { + return LoadEnumSafe(&column_index_.boundary_order); } bool has_null_counts() const override { return column_index_.__isset.null_counts; } @@ -91,7 +105,7 @@ class TypedColumnIndexImpl : public TypedColumnIndex { const std::vector& max_values() const override { return max_values_; } - const std::vector GetNonNullPageIndices() const override { + const std::vector& non_null_page_indices() const override { return non_null_page_indices_; } @@ -116,7 +130,7 @@ class OffsetIndexImpl : public OffsetIndex { } } - const std::vector& GetPageLocations() const override { + const std::vector& page_locations() const override { return page_locations_; } @@ -154,11 +168,9 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, return std::make_unique>(descr, column_index); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, column_index); - default: - break; + case Type::UNDEFINED: + ::arrow::Unreachable("Cannot make ColumnIndex of an unknown type"); } - DCHECK(false) << "Should not be able to reach this code"; - return nullptr; } std::unique_ptr OffsetIndex::Make(const void* serialized_index, diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index 0b45afe2752..7b03cff298a 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -17,16 +17,14 @@ #pragma once -#include "parquet/exception.h" -#include "parquet/platform.h" -#include "parquet/schema.h" +#include "parquet/types.h" #include namespace parquet { -/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. -enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; +class ColumnDescriptor; +class ReaderProperties; /// \brief ColumnIndex is a proxy around format::ColumnIndex. class PARQUET_EXPORT ColumnIndex { @@ -39,28 +37,37 @@ class PARQUET_EXPORT ColumnIndex { virtual ~ColumnIndex() = default; - /// \brief Returns a list of boolean values to determine the validity of the - /// corresponding min and max values. + /// \brief A bitmap with a bit set for each data page that has only null values. + /// + /// The length of this vector is equal to the number of data pages in the column. virtual const std::vector& null_pages() const = 0; - /// \brief Returns a list of encoded lower bound for the values of each page. For null - /// pages the default value is an empty string. Readers must make sure that list entries - /// are populated before using them by inspecting null_pages. + /// \brief A vector of encoded lower bounds for each data page in this column. + /// + /// `null_pages` should be inspected first, as only pages with non-null values + /// may have their lower bounds populated. virtual const std::vector& encoded_min_values() const = 0; - /// \brief Returns a list of encoded upper bound for the values of each page. For null - /// pages the default value is an empty string. Readers must make sure that list entries - /// are populated before using them by inspecting null_pages. + /// \brief A vector of encoded upper bounds for each data page in this column. + /// + /// `null_pages` should be inspected first, as only pages with non-null values + /// may have their upper bounds populated. virtual const std::vector& encoded_max_values() const = 0; - /// \brief Returns whether both min_values and max_values are orderd and if so, in which - /// direction. - virtual BoundaryOrder boundary_order() const = 0; + /// \brief The ordering of lower and upper bounds. + /// + /// The boundary order applies accross all lower bounds, and all upper bounds, + /// respectively. However, the order between lower bounds and upper bounds + /// cannot be derived from this. + virtual BoundaryOrder::type boundary_order() const = 0; - /// \brief Returns if null count is available. + /// \brief Whether per-page null count information is available. virtual bool has_null_counts() const = 0; - /// \brief Returns A list containing the number of null values for each page. + /// \brief An optional vector with the number of null values in each data page. + /// + /// `has_null_counts` should be called first to determine if this information is + /// available. virtual const std::vector& null_counts() const = 0; }; @@ -70,18 +77,24 @@ class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { public: using T = typename DType::c_type; - /// \brief Returns a list of lower bound for the values of every non-null page. - /// Excluding non-null pages helps binary search if the values are ordered. + /// \brief A vector of lower bounds for each data page in this column. + /// + /// This is like `encoded_min_values`, but with the values decoded according to + /// the column's physical type. + /// `min_values` and `max_values` can be used together with `boundary_order` + /// in order to prune some data pages when searching for specific values. virtual const std::vector& min_values() const = 0; - /// \brief Returns a list of upper bound for the values of every non-null page. - /// Excluding non-null pages helps binary search if the values are ordered. + /// \brief A vector of upper bounds for each data page in this column. + /// + /// Just like `min_values`, but for upper bounds instead of lower bounds. virtual const std::vector& max_values() const = 0; - /// \brief Returns a list of page indices for not-null pages. It is helpful to - /// understand the original page id in the values returned from min_values() - /// and max_values() above. - virtual const std::vector GetNonNullPageIndices() const = 0; + /// \brief A vector of page indices for not-null pages. + /// + /// It is helpful to understand the original page id in the values returned from + /// min_values() and max_values() above. + virtual const std::vector& non_null_page_indices() const = 0; }; using BoolColumnIndex = TypedColumnIndex; @@ -98,7 +111,7 @@ struct PARQUET_EXPORT PageLocation { int64_t offset; /// Total compressed size of the data page and header. int32_t compressed_page_size; - // row id of the first row in the page within the row group. + /// Row id of the first row in the page within the row group. int64_t first_row_index; }; @@ -112,8 +125,8 @@ class PARQUET_EXPORT OffsetIndex { virtual ~OffsetIndex() = default; - /// \brief Returns all page locations in the offset index. - virtual const std::vector& GetPageLocations() const = 0; + /// \brief A vector of locations for each data page in this column. + virtual const std::vector& page_locations() const = 0; }; } // namespace parquet diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 23d7bae59b7..9cc702dfcdd 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -100,6 +100,10 @@ static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type } } +static inline BoundaryOrder::type FromThriftUnsafe(format::BoundaryOrder::type type) { + return static_cast(type); +} + namespace internal { template @@ -130,6 +134,11 @@ struct ThriftEnumTypeTraits<::parquet::format::PageType::type> { using ParquetEnum = PageType; }; +template <> +struct ThriftEnumTypeTraits<::parquet::format::BoundaryOrder::type> { + using ParquetEnum = BoundaryOrder; +}; + // If the parquet file is corrupted it is possible the enum value decoded // will not be in the range of defined values, which is undefined behaviour. // This facility prevents this by loading the value as the underlying type diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 183a3705291..6ec6870d3a0 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -532,6 +532,17 @@ class ColumnOrder { ColumnOrder::type column_order_; }; +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +struct BoundaryOrder { + enum type { + Unordered = 0, + Ascending = 1, + Descending = 2, + // Should always be last element + UNDEFINED = 3 + }; +}; + // ---------------------------------------------------------------------- struct ByteArray { From c0e7c25b7b1f93b2be8f1af490da73132da5ccc0 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 6 Dec 2022 15:40:19 +0800 Subject: [PATCH 07/19] missing return after Unreachable --- cpp/src/parquet/page_index.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 1560e87abd1..f2e849e8c34 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -170,6 +170,7 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, return std::make_unique>(descr, column_index); case Type::UNDEFINED: ::arrow::Unreachable("Cannot make ColumnIndex of an unknown type"); + return nullptr; } } From 4d47e04a0988be22c411dca2c6d44e4e676a9930 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 6 Dec 2022 16:09:28 +0800 Subject: [PATCH 08/19] remove default in switch but unreachable is required for lint --- cpp/src/parquet/page_index.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index f2e849e8c34..5ccffbd12d2 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -169,9 +169,10 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, column_index); case Type::UNDEFINED: - ::arrow::Unreachable("Cannot make ColumnIndex of an unknown type"); return nullptr; } + ::arrow::Unreachable("Cannot make ColumnIndex of an unknown type"); + return nullptr; } std::unique_ptr OffsetIndex::Make(const void* serialized_index, From 6e2fd44af3fa686346eca50b4cc15300742f1374 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 6 Dec 2022 17:16:22 +0800 Subject: [PATCH 09/19] fix windows build --- cpp/src/parquet/page_index.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 5ccffbd12d2..66d101e6c05 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -43,7 +43,7 @@ void Decode(std::unique_ptr::Decoder>& decoder, template <> void Decode(std::unique_ptr&, const std::string& src, ByteArray* dst) { - DCHECK_LE(src.size(), std::numeric_limits::max()); + DCHECK_LE(src.size(), static_cast(std::numeric_limits::max())); dst->len = static_cast(src.size()); dst->ptr = reinterpret_cast(src.data()); } From 80c773cc9d985ba08c9e51a456df43f5efe41e17 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 6 Dec 2022 17:58:53 +0800 Subject: [PATCH 10/19] make windows build happy --- cpp/src/parquet/page_index.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 66d101e6c05..e2a322a7e1b 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -56,12 +56,12 @@ class TypedColumnIndexImpl : public TypedColumnIndex { TypedColumnIndexImpl(const ColumnDescriptor& descr, const format::ColumnIndex& column_index) : column_index_(column_index) { - size_t num_non_null_pages = std::accumulate( - column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0U, - [](size_t num_non_null_pages, bool null_page) { - return num_non_null_pages + (null_page ? 0U : 1U); + int32_t num_non_null_pages = std::accumulate( + column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0, + [](int32_t num_non_null_pages, bool null_page) { + return num_non_null_pages + (null_page ? 0 : 1); }); - if (num_non_null_pages != 0U) { + if (num_non_null_pages > 0) { min_values_.reserve(num_non_null_pages); max_values_.reserve(num_non_null_pages); // Decode min and max values into a compact form (i.e. w/o null page) From 9fac4e9a3d6f5b677abba2ba8da59ef38dd01f7d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 6 Dec 2022 23:23:04 +0800 Subject: [PATCH 11/19] make encoded & decoded values consistent --- cpp/src/parquet/page_index.cc | 103 ++++++++++++++++++++++++---------- cpp/src/parquet/page_index.h | 9 +-- 2 files changed, 75 insertions(+), 37 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index e2a322a7e1b..6ca3bc0d5ba 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -34,18 +34,47 @@ namespace { template void Decode(std::unique_ptr::Decoder>& decoder, - const std::string& src, typename DType::c_type* dst) { - decoder->SetData(/*num_values=*/1, reinterpret_cast(src.c_str()), - static_cast(src.size())); - decoder->Decode(dst, /*max_values=*/1); + const std::string& input, std::vector& output, + size_t index) { + if (ARROW_PREDICT_FALSE(index >= output.size())) { + throw ParquetException("Index out of bound"); + } + + decoder->SetData(/*num_values=*/1, reinterpret_cast(input.c_str()), + static_cast(input.size())); + decoder->Decode(&output[index], /*max_values=*/1); } template <> -void Decode(std::unique_ptr&, const std::string& src, - ByteArray* dst) { - DCHECK_LE(src.size(), static_cast(std::numeric_limits::max())); - dst->len = static_cast(src.size()); - dst->ptr = reinterpret_cast(src.data()); +void Decode(std::unique_ptr& decoder, + const std::string& input, std::vector& output, + size_t index) { + if (ARROW_PREDICT_FALSE(index >= output.size())) { + throw ParquetException("Index out of bound"); + } + + bool value; + decoder->SetData(/*num_values=*/1, reinterpret_cast(input.c_str()), + static_cast(input.size())); + decoder->Decode(&value, /*max_values=*/1); + output[index] = value; +} + +template <> +void Decode(std::unique_ptr&, const std::string& input, + std::vector& output, size_t index) { + if (ARROW_PREDICT_FALSE(index >= output.size())) { + throw ParquetException("Index out of bound"); + } + + if (ARROW_PREDICT_FALSE(input.size() > + static_cast(std::numeric_limits::max()))) { + throw ParquetException("Invalid encoded byte array length"); + } + + auto& decoded = output.at(index); + decoded.len = static_cast(input.size()); + decoded.ptr = reinterpret_cast(input.data()); } template @@ -56,27 +85,39 @@ class TypedColumnIndexImpl : public TypedColumnIndex { TypedColumnIndexImpl(const ColumnDescriptor& descr, const format::ColumnIndex& column_index) : column_index_(column_index) { - int32_t num_non_null_pages = std::accumulate( + // Make sure the number of pages is valid and it does not overflow to int32_t. + if (column_index_.null_pages.size() != column_index_.min_values.size() || + column_index_.min_values.size() != column_index_.max_values.size() || + ARROW_PREDICT_FALSE(column_index_.null_pages.size() >= + static_cast(std::numeric_limits::max()))) { + throw ParquetException("Invalid column index"); + } + + size_t num_pages = column_index_.null_pages.size(); + size_t num_non_null_pages = static_cast(std::accumulate( column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0, [](int32_t num_non_null_pages, bool null_page) { return num_non_null_pages + (null_page ? 0 : 1); - }); - if (num_non_null_pages > 0) { - min_values_.reserve(num_non_null_pages); - max_values_.reserve(num_non_null_pages); - // Decode min and max values into a compact form (i.e. w/o null page) - auto plain_decoder = MakeTypedDecoder(Encoding::PLAIN, &descr); - T value; - for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { - if (!column_index_.null_pages[i]) { - non_null_page_indices_.emplace_back(static_cast(i)); - Decode(plain_decoder, column_index_.min_values[i], &value); - min_values_.emplace_back(value); - Decode(plain_decoder, column_index_.max_values[i], &value); - max_values_.emplace_back(value); - } + })); + DCHECK_LE(num_non_null_pages, num_pages); + + // Allocate slots for decoded values. + min_values_.resize(num_pages); + max_values_.resize(num_pages); + non_null_page_indices_.reserve(num_non_null_pages); + + // Decode min and max values according to the physical type. + // Note that null page are skipped. + auto plain_decoder = MakeTypedDecoder(Encoding::PLAIN, &descr); + for (size_t i = 0; i < num_pages; ++i) { + if (!column_index_.null_pages[i]) { + // The check on `num_pages` has guaranteed the cast below is safe. + non_null_page_indices_.emplace_back(static_cast(i)); + Decode(plain_decoder, column_index_.min_values[i], min_values_, i); + Decode(plain_decoder, column_index_.max_values[i], max_values_, i); } } + DCHECK_EQ(num_non_null_pages, non_null_page_indices_.size()); } const std::vector& null_pages() const override { @@ -101,21 +142,21 @@ class TypedColumnIndexImpl : public TypedColumnIndex { return column_index_.null_counts; } - const std::vector& min_values() const override { return min_values_; } - - const std::vector& max_values() const override { return max_values_; } - const std::vector& non_null_page_indices() const override { return non_null_page_indices_; } + const std::vector& min_values() const override { return min_values_; } + + const std::vector& max_values() const override { return max_values_; } + private: /// Wrapped thrift column index. const format::ColumnIndex column_index_; - /// Decoded typed min/max values. Null pages are set to std::nullopt. + /// Decoded typed min/max values. Undefined for null pages. std::vector min_values_; std::vector max_values_; - /// A list of page indices for not-null pages. + /// A list of page indices for non-null pages. std::vector non_null_page_indices_; }; diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index 7b03cff298a..13dae40f56c 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -69,6 +69,9 @@ class PARQUET_EXPORT ColumnIndex { /// `has_null_counts` should be called first to determine if this information is /// available. virtual const std::vector& null_counts() const = 0; + + /// \brief A vector of page indices for non-null pages. + virtual const std::vector& non_null_page_indices() const = 0; }; /// \brief Typed implementation of ColumnIndex. @@ -89,12 +92,6 @@ class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { /// /// Just like `min_values`, but for upper bounds instead of lower bounds. virtual const std::vector& max_values() const = 0; - - /// \brief A vector of page indices for not-null pages. - /// - /// It is helpful to understand the original page id in the values returned from - /// min_values() and max_values() above. - virtual const std::vector& non_null_page_indices() const = 0; }; using BoolColumnIndex = TypedColumnIndex; From ed31adba27910ee9741bc0682374ecff9b1abf5c Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 7 Dec 2022 15:42:06 +0800 Subject: [PATCH 12/19] add tests --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/metadata.cc | 6 +- cpp/src/parquet/metadata.h | 2 +- cpp/src/parquet/metadata_test.cc | 2 +- cpp/src/parquet/page_index_test.cc | 208 +++++++++++++++++++++++++++++ 5 files changed, 214 insertions(+), 5 deletions(-) create mode 100644 cpp/src/parquet/page_index_test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 4f31e9ef4d1..46df502decd 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -326,6 +326,7 @@ add_parquet_test(internals-test statistics_test.cc encoding_test.cc metadata_test.cc + page_index_test.cc public_api_test.cc types_test.cc test_util.cc) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 1e1f96d906a..c39dc706b8e 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -312,7 +312,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } } - std::optional GetColumIndexLocation() const { + std::optional GetColumnIndexLocation() const { if (column_->__isset.column_index_offset && column_->__isset.column_index_length) { return IndexLocation{column_->column_index_offset, column_->column_index_length}; } @@ -434,8 +434,8 @@ std::unique_ptr ColumnChunkMetaData::crypto_metadata() con return impl_->crypto_metadata(); } -std::optional ColumnChunkMetaData::GetColumIndexLocation() const { - return impl_->GetColumIndexLocation(); +std::optional ColumnChunkMetaData::GetColumnIndexLocation() const { + return impl_->GetColumnIndexLocation(); } std::optional ColumnChunkMetaData::GetOffsetIndexLocation() const { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 8c619c5c63b..40ff2aacc88 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -179,7 +179,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; std::unique_ptr crypto_metadata() const; - std::optional GetColumIndexLocation() const; + std::optional GetColumnIndexLocation() const; std::optional GetOffsetIndexLocation() const; private: diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index cabfb8078cf..a0989ad73e3 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -314,7 +314,7 @@ TEST(Metadata, TestReadPageIndex) { 5280, 9735, 3521, 10545, 3251, 3251}; for (int i = 0; i < row_group_metadata->num_columns(); ++i) { auto col_chunk_metadata = row_group_metadata->ColumnChunk(i); - auto ci_location = col_chunk_metadata->GetColumIndexLocation(); + auto ci_location = col_chunk_metadata->GetColumnIndexLocation(); if (i == 10) { // column_id 10 does not have column index ASSERT_FALSE(ci_location.has_value()); diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc new file mode 100644 index 00000000000..3c2b52bbbab --- /dev/null +++ b/cpp/src/parquet/page_index_test.cc @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" + +#include + +#include "arrow/io/file.h" +#include "parquet/file_reader.h" +#include "parquet/schema.h" +#include "parquet/test_util.h" +#include "parquet/thrift_internal.h" + +namespace parquet { + +TEST(PageIndex, ReadOffsetIndex) { + std::string dir_string(parquet::test::get_data_dir()); + std::string path = dir_string + "/alltypes_tiny_pages.parquet"; + auto reader = ParquetFileReader::OpenFile(path, false); + auto file_metadata = reader->metadata(); + + // Get offset index location to column 0 of row group 0. + const int row_group_id = 0; + const int column_id = 0; + ASSERT_LT(row_group_id, file_metadata->num_row_groups()); + ASSERT_LT(column_id, file_metadata->num_columns()); + auto index_location = file_metadata->RowGroup(row_group_id) + ->ColumnChunk(column_id) + ->GetOffsetIndexLocation(); + ASSERT_TRUE(index_location.has_value()); + + // Read serialized offset index from the file. + std::shared_ptr<::arrow::io::RandomAccessFile> source; + PARQUET_ASSIGN_OR_THROW(source, ::arrow::io::ReadableFile::Open(path)); + PARQUET_ASSIGN_OR_THROW(auto buffer, + source->ReadAt(index_location->offset, index_location->length)); + PARQUET_THROW_NOT_OK(source->Close()); + + // Deserialize offset index. + auto properties = default_reader_properties(); + std::unique_ptr offset_index = OffsetIndex::Make( + buffer->data(), static_cast(buffer->size()), properties); + + // Verify only partial data as it contains 325 pages in total. + const size_t num_pages = 325; + const std::vector page_indices = {0, 100, 200, 300}; + const std::vector page_locations = { + PageLocation{4, 109, 0}, PageLocation{11480, 133, 2244}, + PageLocation{22980, 133, 4494}, PageLocation{34480, 133, 6744}}; + + ASSERT_EQ(num_pages, offset_index->page_locations().size()); + for (size_t i = 0; i < page_indices.size(); ++i) { + size_t page_id = page_indices.at(i); + const auto& read_page_location = offset_index->page_locations().at(page_id); + const auto& expected_page_location = page_locations.at(i); + ASSERT_EQ(expected_page_location.offset, read_page_location.offset); + ASSERT_EQ(expected_page_location.compressed_page_size, + read_page_location.compressed_page_size); + ASSERT_EQ(expected_page_location.first_row_index, read_page_location.first_row_index); + } +} + +template +void TestReadTypedColumnIndex( + int column_id, size_t num_pages, BoundaryOrder::type boundary_order, + const std::vector& page_indices, const std::vector& null_pages, + const std::vector& min_values, const std::vector& max_values, + bool has_null_counts = false, const std::vector& null_counts = {}) { + std::string dir_string(parquet::test::get_data_dir()); + std::string path = dir_string + "/alltypes_tiny_pages.parquet"; + auto reader = ParquetFileReader::OpenFile(path, false); + auto file_metadata = reader->metadata(); + + // Get column index location to a specific column chunk. + const int row_group_id = 0; + ASSERT_LT(row_group_id, file_metadata->num_row_groups()); + ASSERT_LT(column_id, file_metadata->num_columns()); + auto index_location = file_metadata->RowGroup(row_group_id) + ->ColumnChunk(column_id) + ->GetColumnIndexLocation(); + ASSERT_TRUE(index_location.has_value()); + + // Read serialized column index from the file. + std::shared_ptr<::arrow::io::RandomAccessFile> source; + PARQUET_ASSIGN_OR_THROW(source, ::arrow::io::ReadableFile::Open(path)); + PARQUET_ASSIGN_OR_THROW(auto buffer, + source->ReadAt(index_location->offset, index_location->length)); + PARQUET_THROW_NOT_OK(source->Close()); + + // Deserialize column index. + auto properties = default_reader_properties(); + auto descr = file_metadata->schema()->Column(column_id); + std::unique_ptr column_index = ColumnIndex::Make( + *descr, buffer->data(), static_cast(buffer->size()), properties); + auto typed_column_index = dynamic_cast*>(column_index.get()); + ASSERT_TRUE(typed_column_index != nullptr); + + // Verify only partial data as there are too many pages. + ASSERT_EQ(num_pages, column_index->null_pages().size()); + ASSERT_EQ(has_null_counts, column_index->has_null_counts()); + ASSERT_EQ(boundary_order, column_index->boundary_order()); + for (size_t i = 0; i < page_indices.size(); ++i) { + size_t page_id = page_indices.at(i); + ASSERT_EQ(null_pages.at(i), column_index->null_pages().at(page_id)); + if (has_null_counts) { + ASSERT_EQ(null_counts.at(i), column_index->null_counts().at(page_id)); + } + // min/max values are only meaningful for non-null pages. + if (!null_pages.at(i)) { + if constexpr (std::is_same_v) { + ASSERT_DOUBLE_EQ(min_values.at(i), typed_column_index->min_values().at(page_id)); + ASSERT_DOUBLE_EQ(max_values.at(i), typed_column_index->max_values().at(page_id)); + } else if constexpr (std::is_same_v) { + ASSERT_FLOAT_EQ(min_values.at(i), typed_column_index->min_values().at(page_id)); + ASSERT_FLOAT_EQ(max_values.at(i), typed_column_index->max_values().at(page_id)); + } else { + ASSERT_EQ(min_values.at(i), typed_column_index->min_values().at(page_id)); + ASSERT_EQ(max_values.at(i), typed_column_index->max_values().at(page_id)); + } + } + } +} + +TEST(PageIndex, ReadInt64ColumnIndex) { + const int column_id = 5; + const size_t num_pages = 528; + const BoundaryOrder::type boundary_order = BoundaryOrder::Unordered; + const std::vector page_indices = {0, 99, 426, 520}; + const std::vector null_pages = {false, false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {0, 0, 0, 0}; + const std::vector min_values = {0, 10, 0, 0}; + const std::vector max_values = {90, 90, 80, 70}; + + TestReadTypedColumnIndex(column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, + null_counts); +} + +TEST(PageIndex, ReadDoubleColumnIndex) { + const int column_id = 7; + const size_t num_pages = 528; + const BoundaryOrder::type boundary_order = BoundaryOrder::Unordered; + const std::vector page_indices = {0, 51, 212, 527}; + const std::vector null_pages = {false, false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {0, 0, 0, 0}; + const std::vector min_values = {-0, 30.3, 10.1, 40.4}; + const std::vector max_values = {90.9, 90.9, 90.9, 60.6}; + + TestReadTypedColumnIndex(column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, + has_null_counts, null_counts); +} + +TEST(PageIndex, ByteArrayColumnIndex) { + const int column_id = 9; + const size_t num_pages = 352; + const BoundaryOrder::type boundary_order = BoundaryOrder::Ascending; + const std::vector page_indices = {0, 128, 256}; + const std::vector null_pages = {false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {0, 0, 0}; + + // All min values are "0" and max values are "9". + const std::string_view min_value = "0"; + const std::string_view max_value = "9"; + const std::vector min_values = {ByteArray{min_value}, ByteArray{min_value}, + ByteArray{min_value}}; + const std::vector max_values = {ByteArray{max_value}, ByteArray{max_value}, + ByteArray{max_value}}; + + TestReadTypedColumnIndex(column_id, num_pages, boundary_order, + page_indices, null_pages, min_values, + max_values, has_null_counts, null_counts); +} + +TEST(PageIndex, ReadBoolColumnIndex) { + const int column_id = 1; + const size_t num_pages = 82; + const BoundaryOrder::type boundary_order = BoundaryOrder::Ascending; + const std::vector page_indices = {0, 16, 64}; + const std::vector null_pages = {false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {0, 0, 0}; + const std::vector min_values = {false, false, false}; + const std::vector max_values = {true, true, true}; + + TestReadTypedColumnIndex(column_id, num_pages, boundary_order, + page_indices, null_pages, min_values, max_values, + has_null_counts, null_counts); +} + +} // namespace parquet From 3d3ec3029e92097a89ae766b92c57512c584a526 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 7 Dec 2022 23:46:04 +0800 Subject: [PATCH 13/19] fix signature of Decode --- cpp/src/parquet/page_index.cc | 26 ++++++++++---------- cpp/src/parquet/page_index_test.cc | 39 ++++++++++++++++-------------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 6ca3bc0d5ba..18537010438 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -34,22 +34,22 @@ namespace { template void Decode(std::unique_ptr::Decoder>& decoder, - const std::string& input, std::vector& output, - size_t index) { - if (ARROW_PREDICT_FALSE(index >= output.size())) { + const std::string& input, std::vector* output, + size_t output_index) { + if (ARROW_PREDICT_FALSE(output_index >= output->size())) { throw ParquetException("Index out of bound"); } decoder->SetData(/*num_values=*/1, reinterpret_cast(input.c_str()), static_cast(input.size())); - decoder->Decode(&output[index], /*max_values=*/1); + decoder->Decode(&output->at(output_index), /*max_values=*/1); } template <> void Decode(std::unique_ptr& decoder, - const std::string& input, std::vector& output, - size_t index) { - if (ARROW_PREDICT_FALSE(index >= output.size())) { + const std::string& input, std::vector* output, + size_t output_index) { + if (ARROW_PREDICT_FALSE(output_index >= output->size())) { throw ParquetException("Index out of bound"); } @@ -57,13 +57,13 @@ void Decode(std::unique_ptr& decoder, decoder->SetData(/*num_values=*/1, reinterpret_cast(input.c_str()), static_cast(input.size())); decoder->Decode(&value, /*max_values=*/1); - output[index] = value; + output->at(output_index) = value; } template <> void Decode(std::unique_ptr&, const std::string& input, - std::vector& output, size_t index) { - if (ARROW_PREDICT_FALSE(index >= output.size())) { + std::vector* output, size_t output_index) { + if (ARROW_PREDICT_FALSE(output_index >= output->size())) { throw ParquetException("Index out of bound"); } @@ -72,7 +72,7 @@ void Decode(std::unique_ptr&, const std::string throw ParquetException("Invalid encoded byte array length"); } - auto& decoded = output.at(index); + auto& decoded = output->at(output_index); decoded.len = static_cast(input.size()); decoded.ptr = reinterpret_cast(input.data()); } @@ -113,8 +113,8 @@ class TypedColumnIndexImpl : public TypedColumnIndex { if (!column_index_.null_pages[i]) { // The check on `num_pages` has guaranteed the cast below is safe. non_null_page_indices_.emplace_back(static_cast(i)); - Decode(plain_decoder, column_index_.min_values[i], min_values_, i); - Decode(plain_decoder, column_index_.max_values[i], max_values_, i); + Decode(plain_decoder, column_index_.min_values[i], &min_values_, i); + Decode(plain_decoder, column_index_.max_values[i], &max_values_, i); } } DCHECK_EQ(num_non_null_pages, non_null_page_indices_.size()); diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 3c2b52bbbab..2492896453c 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -75,13 +75,16 @@ TEST(PageIndex, ReadOffsetIndex) { } template -void TestReadTypedColumnIndex( - int column_id, size_t num_pages, BoundaryOrder::type boundary_order, - const std::vector& page_indices, const std::vector& null_pages, - const std::vector& min_values, const std::vector& max_values, - bool has_null_counts = false, const std::vector& null_counts = {}) { +void TestReadTypedColumnIndex(const std::string& file_name, int column_id, + size_t num_pages, BoundaryOrder::type boundary_order, + const std::vector& page_indices, + const std::vector& null_pages, + const std::vector& min_values, + const std::vector& max_values, + bool has_null_counts = false, + const std::vector& null_counts = {}) { std::string dir_string(parquet::test::get_data_dir()); - std::string path = dir_string + "/alltypes_tiny_pages.parquet"; + std::string path = dir_string + "/" + file_name; auto reader = ParquetFileReader::OpenFile(path, false); auto file_metadata = reader->metadata(); @@ -146,9 +149,9 @@ TEST(PageIndex, ReadInt64ColumnIndex) { const std::vector min_values = {0, 10, 0, 0}; const std::vector max_values = {90, 90, 80, 70}; - TestReadTypedColumnIndex(column_id, num_pages, boundary_order, page_indices, - null_pages, min_values, max_values, has_null_counts, - null_counts); + TestReadTypedColumnIndex( + "alltypes_tiny_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); } TEST(PageIndex, ReadDoubleColumnIndex) { @@ -162,9 +165,9 @@ TEST(PageIndex, ReadDoubleColumnIndex) { const std::vector min_values = {-0, 30.3, 10.1, 40.4}; const std::vector max_values = {90.9, 90.9, 90.9, 60.6}; - TestReadTypedColumnIndex(column_id, num_pages, boundary_order, page_indices, - null_pages, min_values, max_values, - has_null_counts, null_counts); + TestReadTypedColumnIndex( + "alltypes_tiny_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); } TEST(PageIndex, ByteArrayColumnIndex) { @@ -184,9 +187,9 @@ TEST(PageIndex, ByteArrayColumnIndex) { const std::vector max_values = {ByteArray{max_value}, ByteArray{max_value}, ByteArray{max_value}}; - TestReadTypedColumnIndex(column_id, num_pages, boundary_order, - page_indices, null_pages, min_values, - max_values, has_null_counts, null_counts); + TestReadTypedColumnIndex( + "alltypes_tiny_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); } TEST(PageIndex, ReadBoolColumnIndex) { @@ -200,9 +203,9 @@ TEST(PageIndex, ReadBoolColumnIndex) { const std::vector min_values = {false, false, false}; const std::vector max_values = {true, true, true}; - TestReadTypedColumnIndex(column_id, num_pages, boundary_order, - page_indices, null_pages, min_values, max_values, - has_null_counts, null_counts); + TestReadTypedColumnIndex( + "alltypes_tiny_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); } } // namespace parquet From b3e577f3d4944967ea76843ee5a30edc7423b380 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 8 Dec 2022 11:15:16 +0800 Subject: [PATCH 14/19] rebase and add test to cover null_page = true --- cpp/src/parquet/page_index_test.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 2492896453c..75ae2edfddf 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -208,4 +208,21 @@ TEST(PageIndex, ReadBoolColumnIndex) { null_pages, min_values, max_values, has_null_counts, null_counts); } +TEST(PageIndex, ReadColumnIndexWithNullPage) { + const int column_id = 0; + const size_t num_pages = 2; + const BoundaryOrder::type boundary_order = BoundaryOrder::Ascending; + const std::vector page_indices = {0, 1}; + const std::vector null_pages = {true, true}; + // It seems that the null_counts are malformed. + const bool has_null_counts = true; + const std::vector null_counts = {-1, -1}; + const std::vector min_values = {}; + const std::vector max_values = {}; + + TestReadTypedColumnIndex( + "datapage_v1-corrupt-checksum.parquet", column_id, num_pages, boundary_order, + page_indices, null_pages, min_values, max_values, has_null_counts, null_counts); +} + } // namespace parquet From e14ff24d0abd601ee7d0a4a1de9823e269b71a4d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 8 Dec 2022 13:19:25 +0800 Subject: [PATCH 15/19] update parquet-testing submodule --- cpp/submodules/parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index e13af117de7..3510fa8d34b 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit e13af117de7c4f0a4d9908ae3827b3ab119868f3 +Subproject commit 3510fa8d34ba577f36f399d4642f9e1ccdf18b30 From 3e0aeed046f4db0448f17ecf5c8c4332771c7394 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 9 Dec 2022 10:32:19 +0800 Subject: [PATCH 16/19] add test for FLBA type --- cpp/src/parquet/page_index.cc | 8 +++++--- cpp/src/parquet/page_index_test.cc | 32 ++++++++++++++++++++++++++++++ cpp/submodules/parquet-testing | 2 +- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 18537010438..1684c10b6ec 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -86,10 +86,12 @@ class TypedColumnIndexImpl : public TypedColumnIndex { const format::ColumnIndex& column_index) : column_index_(column_index) { // Make sure the number of pages is valid and it does not overflow to int32_t. - if (column_index_.null_pages.size() != column_index_.min_values.size() || + if (ARROW_PREDICT_FALSE(column_index_.null_pages.size() >= + static_cast(std::numeric_limits::max())) || + column_index_.null_pages.size() != column_index_.min_values.size() || column_index_.min_values.size() != column_index_.max_values.size() || - ARROW_PREDICT_FALSE(column_index_.null_pages.size() >= - static_cast(std::numeric_limits::max()))) { + (column_index_.__isset.null_counts && + column_index_.null_counts.size() != column_index_.null_pages.size())) { throw ParquetException("Invalid column index"); } diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 75ae2edfddf..7e9a944ec8a 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -130,6 +130,12 @@ void TestReadTypedColumnIndex(const std::string& file_name, int column_id, } else if constexpr (std::is_same_v) { ASSERT_FLOAT_EQ(min_values.at(i), typed_column_index->min_values().at(page_id)); ASSERT_FLOAT_EQ(max_values.at(i), typed_column_index->max_values().at(page_id)); + } else if constexpr (std::is_same_v) { + auto len = descr->type_length(); + ASSERT_EQ(0, ::memcmp(min_values.at(i).ptr, + typed_column_index->min_values().at(page_id).ptr, len)); + ASSERT_EQ(0, ::memcmp(max_values.at(i).ptr, + typed_column_index->max_values().at(page_id).ptr, len)); } else { ASSERT_EQ(min_values.at(i), typed_column_index->min_values().at(page_id)); ASSERT_EQ(max_values.at(i), typed_column_index->max_values().at(page_id)); @@ -208,6 +214,32 @@ TEST(PageIndex, ReadBoolColumnIndex) { null_pages, min_values, max_values, has_null_counts, null_counts); } +namespace { +FLBA toFLBA(const char* ptr) { return FLBA{reinterpret_cast(ptr)}; } +} // namespace + +TEST(PageIndex, FixedLengthByteArrayColumnIndex) { + const int column_id = 0; + const size_t num_pages = 10; + const BoundaryOrder::type boundary_order = BoundaryOrder::Descending; + const std::vector page_indices = {0, 4, 8}; + const std::vector null_pages = {false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {9, 13, 9}; + const std::vector min_literals = {"\x00\x00\x03\x85", "\x00\x00\x01\xF5", + "\x00\x00\x00\x65"}; + const std::vector max_literals = {"\x00\x00\x03\xE8", "\x00\x00\x02\x58", + "\x00\x00\x00\xC8"}; + const std::vector min_values = {toFLBA(min_literals[0]), toFLBA(min_literals[1]), + toFLBA(min_literals[2])}; + const std::vector max_values = {toFLBA(max_literals[0]), toFLBA(max_literals[1]), + toFLBA(max_literals[2])}; + + TestReadTypedColumnIndex( + "fixed_length_byte_array.parquet", column_id, num_pages, boundary_order, + page_indices, null_pages, min_values, max_values, has_null_counts, null_counts); +} + TEST(PageIndex, ReadColumnIndexWithNullPage) { const int column_id = 0; const size_t num_pages = 2; diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 3510fa8d34b..de7570a865a 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 3510fa8d34ba577f36f399d4642f9e1ccdf18b30 +Subproject commit de7570a865af017add78432e4c045912c213ae24 From 3fa5d62e9b76b1fd071d605ec78e3fb6c16a6bd9 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 12 Dec 2022 15:00:13 +0100 Subject: [PATCH 17/19] Nits --- cpp/src/parquet/page_index.cc | 33 ++++++++++++++++-------------- cpp/src/parquet/page_index_test.cc | 20 +++++++++--------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 1684c10b6ec..749181fff24 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -25,7 +25,6 @@ #include "arrow/util/unreachable.h" #include -#include #include namespace parquet { @@ -42,7 +41,10 @@ void Decode(std::unique_ptr::Decoder>& decoder, decoder->SetData(/*num_values=*/1, reinterpret_cast(input.c_str()), static_cast(input.size())); - decoder->Decode(&output->at(output_index), /*max_values=*/1); + const auto num_values = decoder->Decode(&output->at(output_index), /*max_values=*/1); + if (ARROW_PREDICT_FALSE(num_values != 1)) { + throw ParquetException("Could not decode statistics value"); + } } template <> @@ -56,7 +58,10 @@ void Decode(std::unique_ptr& decoder, bool value; decoder->SetData(/*num_values=*/1, reinterpret_cast(input.c_str()), static_cast(input.size())); - decoder->Decode(&value, /*max_values=*/1); + const auto num_values = decoder->Decode(&value, /*max_values=*/1); + if (ARROW_PREDICT_FALSE(num_values != 1)) { + throw ParquetException("Could not decode statistics value"); + } output->at(output_index) = value; } @@ -72,9 +77,8 @@ void Decode(std::unique_ptr&, const std::string throw ParquetException("Invalid encoded byte array length"); } - auto& decoded = output->at(output_index); - decoded.len = static_cast(input.size()); - decoded.ptr = reinterpret_cast(input.data()); + output->at(output_index) = {/*len=*/static_cast(input.size()), + /*ptr=*/reinterpret_cast(input.data())}; } template @@ -86,17 +90,16 @@ class TypedColumnIndexImpl : public TypedColumnIndex { const format::ColumnIndex& column_index) : column_index_(column_index) { // Make sure the number of pages is valid and it does not overflow to int32_t. - if (ARROW_PREDICT_FALSE(column_index_.null_pages.size() >= - static_cast(std::numeric_limits::max())) || - column_index_.null_pages.size() != column_index_.min_values.size() || - column_index_.min_values.size() != column_index_.max_values.size() || + const size_t num_pages = column_index_.null_pages.size(); + if (num_pages >= static_cast(std::numeric_limits::max()) || + column_index_.min_values.size() != num_pages || + column_index_.max_values.size() != num_pages || (column_index_.__isset.null_counts && - column_index_.null_counts.size() != column_index_.null_pages.size())) { + column_index_.null_counts.size() != num_pages)) { throw ParquetException("Invalid column index"); } - size_t num_pages = column_index_.null_pages.size(); - size_t num_non_null_pages = static_cast(std::accumulate( + const size_t num_non_null_pages = static_cast(std::accumulate( column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0, [](int32_t num_non_null_pages, bool null_page) { return num_non_null_pages + (null_page ? 0 : 1); @@ -104,8 +107,8 @@ class TypedColumnIndexImpl : public TypedColumnIndex { DCHECK_LE(num_non_null_pages, num_pages); // Allocate slots for decoded values. - min_values_.resize(num_pages); - max_values_.resize(num_pages); + min_values_.resize(num_non_null_pages); + max_values_.resize(num_non_null_pages); non_null_page_indices_.reserve(num_non_null_pages); // Decode min and max values according to the physical type. diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 7e9a944ec8a..df64d122203 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -176,7 +176,7 @@ TEST(PageIndex, ReadDoubleColumnIndex) { null_pages, min_values, max_values, has_null_counts, null_counts); } -TEST(PageIndex, ByteArrayColumnIndex) { +TEST(PageIndex, ReadByteArrayColumnIndex) { const int column_id = 9; const size_t num_pages = 352; const BoundaryOrder::type boundary_order = BoundaryOrder::Ascending; @@ -214,11 +214,11 @@ TEST(PageIndex, ReadBoolColumnIndex) { null_pages, min_values, max_values, has_null_counts, null_counts); } -namespace { -FLBA toFLBA(const char* ptr) { return FLBA{reinterpret_cast(ptr)}; } -} // namespace - TEST(PageIndex, FixedLengthByteArrayColumnIndex) { + auto to_flba = [](const char* ptr) { + return FLBA{reinterpret_cast(ptr)}; + }; + const int column_id = 0; const size_t num_pages = 10; const BoundaryOrder::type boundary_order = BoundaryOrder::Descending; @@ -230,10 +230,10 @@ TEST(PageIndex, FixedLengthByteArrayColumnIndex) { "\x00\x00\x00\x65"}; const std::vector max_literals = {"\x00\x00\x03\xE8", "\x00\x00\x02\x58", "\x00\x00\x00\xC8"}; - const std::vector min_values = {toFLBA(min_literals[0]), toFLBA(min_literals[1]), - toFLBA(min_literals[2])}; - const std::vector max_values = {toFLBA(max_literals[0]), toFLBA(max_literals[1]), - toFLBA(max_literals[2])}; + const std::vector min_values = { + to_flba(min_literals[0]), to_flba(min_literals[1]), to_flba(min_literals[2])}; + const std::vector max_values = { + to_flba(max_literals[0]), to_flba(max_literals[1]), to_flba(max_literals[2])}; TestReadTypedColumnIndex( "fixed_length_byte_array.parquet", column_id, num_pages, boundary_order, @@ -253,7 +253,7 @@ TEST(PageIndex, ReadColumnIndexWithNullPage) { const std::vector max_values = {}; TestReadTypedColumnIndex( - "datapage_v1-corrupt-checksum.parquet", column_id, num_pages, boundary_order, + "datapage_v1-uncompressed-checksum.parquet", column_id, num_pages, boundary_order, page_indices, null_pages, min_values, max_values, has_null_counts, null_counts); } From 084aad1fca7d5c179db351cdc9e61bc4ff2c780d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 13 Dec 2022 12:41:26 +0800 Subject: [PATCH 18/19] test int32_with_null_pages.parquet --- cpp/src/parquet/page_index.cc | 4 ++-- cpp/src/parquet/page_index_test.cc | 21 ++++++++++----------- cpp/submodules/parquet-testing | 2 +- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 749181fff24..559d3659882 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -107,8 +107,8 @@ class TypedColumnIndexImpl : public TypedColumnIndex { DCHECK_LE(num_non_null_pages, num_pages); // Allocate slots for decoded values. - min_values_.resize(num_non_null_pages); - max_values_.resize(num_non_null_pages); + min_values_.resize(num_pages); + max_values_.resize(num_pages); non_null_page_indices_.reserve(num_non_null_pages); // Decode min and max values according to the physical type. diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index df64d122203..6d1cdc2c97a 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -214,7 +214,7 @@ TEST(PageIndex, ReadBoolColumnIndex) { null_pages, min_values, max_values, has_null_counts, null_counts); } -TEST(PageIndex, FixedLengthByteArrayColumnIndex) { +TEST(PageIndex, ReadFixedLengthByteArrayColumnIndex) { auto to_flba = [](const char* ptr) { return FLBA{reinterpret_cast(ptr)}; }; @@ -242,19 +242,18 @@ TEST(PageIndex, FixedLengthByteArrayColumnIndex) { TEST(PageIndex, ReadColumnIndexWithNullPage) { const int column_id = 0; - const size_t num_pages = 2; - const BoundaryOrder::type boundary_order = BoundaryOrder::Ascending; - const std::vector page_indices = {0, 1}; - const std::vector null_pages = {true, true}; - // It seems that the null_counts are malformed. + const size_t num_pages = 10; + const BoundaryOrder::type boundary_order = BoundaryOrder::Unordered; + const std::vector page_indices = {2, 4, 8}; + const std::vector null_pages = {true, false, false}; const bool has_null_counts = true; - const std::vector null_counts = {-1, -1}; - const std::vector min_values = {}; - const std::vector max_values = {}; + const std::vector null_counts = {100, 16, 8}; + const std::vector min_values = {0, -2048691758, -2046900272}; + const std::vector max_values = {0, 2143189382, 2087168549}; TestReadTypedColumnIndex( - "datapage_v1-uncompressed-checksum.parquet", column_id, num_pages, boundary_order, - page_indices, null_pages, min_values, max_values, has_null_counts, null_counts); + "int32_with_null_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); } } // namespace parquet diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index de7570a865a..8a3d3fd5ff7 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit de7570a865af017add78432e4c045912c213ae24 +Subproject commit 8a3d3fd5ff7691ee07ca9802df66290a3106e4b7 From 6c3748e45b08a1336a656e88e07a4b5fb2ce6b78 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 13 Dec 2022 16:35:40 +0100 Subject: [PATCH 19/19] Add cursory doc --- docs/source/cpp/parquet.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 5de5799e0d8..23a9657fd41 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -567,3 +567,17 @@ More specifically, Parquet C++ supports: supported. * EncryptionWithFooterKey and EncryptionWithColumnKey modes. * Encrypted Footer and Plaintext Footer modes. + +Miscellaneous +------------- + ++--------------------------+----------+----------+---------+ +| Feature | Reading | Writing | Notes | ++==========================+==========+==========+=========+ +| Column Index | ✓ | | \(1) | ++--------------------------+----------+----------+---------+ +| Offset Index | ✓ | | \(1) | ++--------------------------+----------+----------+---------+ + +* \(1) Access to the Column and Offset Index structures is provided, but + data read APIs do not currently make any use of them.