diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 17e6fcda721..46df502decd 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -166,6 +166,7 @@ set(PARQUET_SRCS level_conversion.cc metadata.cc murmur3.cc + page_index.cc "${ARROW_SOURCE_DIR}/src/generated/parquet_constants.cpp" "${ARROW_SOURCE_DIR}/src/generated/parquet_types.cpp" platform.cc @@ -325,6 +326,7 @@ add_parquet_test(internals-test statistics_test.cc encoding_test.cc metadata_test.cc + page_index_test.cc public_api_test.cc types_test.cc test_util.cc) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 1e1f96d906a..c39dc706b8e 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -312,7 +312,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } } - std::optional GetColumIndexLocation() const { + std::optional GetColumnIndexLocation() const { if (column_->__isset.column_index_offset && column_->__isset.column_index_length) { return IndexLocation{column_->column_index_offset, column_->column_index_length}; } @@ -434,8 +434,8 @@ std::unique_ptr ColumnChunkMetaData::crypto_metadata() con return impl_->crypto_metadata(); } -std::optional ColumnChunkMetaData::GetColumIndexLocation() const { - return impl_->GetColumIndexLocation(); +std::optional ColumnChunkMetaData::GetColumnIndexLocation() const { + return impl_->GetColumnIndexLocation(); } std::optional ColumnChunkMetaData::GetOffsetIndexLocation() const { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 8c619c5c63b..40ff2aacc88 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -179,7 +179,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; std::unique_ptr crypto_metadata() const; - std::optional GetColumIndexLocation() const; + std::optional GetColumnIndexLocation() const; std::optional GetOffsetIndexLocation() 
const; private: diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index cabfb8078cf..a0989ad73e3 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -314,7 +314,7 @@ TEST(Metadata, TestReadPageIndex) { 5280, 9735, 3521, 10545, 3251, 3251}; for (int i = 0; i < row_group_metadata->num_columns(); ++i) { auto col_chunk_metadata = row_group_metadata->ColumnChunk(i); - auto ci_location = col_chunk_metadata->GetColumIndexLocation(); + auto ci_location = col_chunk_metadata->GetColumnIndexLocation(); if (i == 10) { // column_id 10 does not have column index ASSERT_FALSE(ci_location.has_value()); diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc new file mode 100644 index 00000000000..559d3659882 --- /dev/null +++ b/cpp/src/parquet/page_index.cc @@ -0,0 +1,234 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include "arrow/util/unreachable.h" + +#include +#include + +namespace parquet { + +namespace { + +template +void Decode(std::unique_ptr::Decoder>& decoder, + const std::string& input, std::vector* output, + size_t output_index) { + if (ARROW_PREDICT_FALSE(output_index >= output->size())) { + throw ParquetException("Index out of bound"); + } + + decoder->SetData(/*num_values=*/1, reinterpret_cast(input.c_str()), + static_cast(input.size())); + const auto num_values = decoder->Decode(&output->at(output_index), /*max_values=*/1); + if (ARROW_PREDICT_FALSE(num_values != 1)) { + throw ParquetException("Could not decode statistics value"); + } +} + +template <> +void Decode(std::unique_ptr& decoder, + const std::string& input, std::vector* output, + size_t output_index) { + if (ARROW_PREDICT_FALSE(output_index >= output->size())) { + throw ParquetException("Index out of bound"); + } + + bool value; + decoder->SetData(/*num_values=*/1, reinterpret_cast(input.c_str()), + static_cast(input.size())); + const auto num_values = decoder->Decode(&value, /*max_values=*/1); + if (ARROW_PREDICT_FALSE(num_values != 1)) { + throw ParquetException("Could not decode statistics value"); + } + output->at(output_index) = value; +} + +template <> +void Decode(std::unique_ptr&, const std::string& input, + std::vector* output, size_t output_index) { + if (ARROW_PREDICT_FALSE(output_index >= output->size())) { + throw ParquetException("Index out of bound"); + } + + if (ARROW_PREDICT_FALSE(input.size() > + static_cast(std::numeric_limits::max()))) { + throw ParquetException("Invalid encoded byte array length"); + } + + output->at(output_index) = {/*len=*/static_cast(input.size()), + /*ptr=*/reinterpret_cast(input.data())}; +} + +template +class TypedColumnIndexImpl : public TypedColumnIndex { + 
public: + using T = typename DType::c_type; + + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + // Make sure the number of pages is valid and it does not overflow to int32_t. + const size_t num_pages = column_index_.null_pages.size(); + if (num_pages >= static_cast(std::numeric_limits::max()) || + column_index_.min_values.size() != num_pages || + column_index_.max_values.size() != num_pages || + (column_index_.__isset.null_counts && + column_index_.null_counts.size() != num_pages)) { + throw ParquetException("Invalid column index"); + } + + const size_t num_non_null_pages = static_cast(std::accumulate( + column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0, + [](int32_t num_non_null_pages, bool null_page) { + return num_non_null_pages + (null_page ? 0 : 1); + })); + DCHECK_LE(num_non_null_pages, num_pages); + + // Allocate slots for decoded values. + min_values_.resize(num_pages); + max_values_.resize(num_pages); + non_null_page_indices_.reserve(num_non_null_pages); + + // Decode min and max values according to the physical type. + // Note that null pages are skipped. + auto plain_decoder = MakeTypedDecoder(Encoding::PLAIN, &descr); + for (size_t i = 0; i < num_pages; ++i) { + if (!column_index_.null_pages[i]) { + // The check on `num_pages` has guaranteed the cast below is safe. 
+ non_null_page_indices_.emplace_back(static_cast(i)); + Decode(plain_decoder, column_index_.min_values[i], &min_values_, i); + Decode(plain_decoder, column_index_.max_values[i], &max_values_, i); + } + } + DCHECK_EQ(num_non_null_pages, non_null_page_indices_.size()); + } + + const std::vector& null_pages() const override { + return column_index_.null_pages; + } + + const std::vector& encoded_min_values() const override { + return column_index_.min_values; + } + + const std::vector& encoded_max_values() const override { + return column_index_.max_values; + } + + BoundaryOrder::type boundary_order() const override { + return LoadEnumSafe(&column_index_.boundary_order); + } + + bool has_null_counts() const override { return column_index_.__isset.null_counts; } + + const std::vector& null_counts() const override { + return column_index_.null_counts; + } + + const std::vector& non_null_page_indices() const override { + return non_null_page_indices_; + } + + const std::vector& min_values() const override { return min_values_; } + + const std::vector& max_values() const override { return max_values_; } + + private: + /// Wrapped thrift column index. + const format::ColumnIndex column_index_; + /// Decoded typed min/max values. Undefined for null pages. + std::vector min_values_; + std::vector max_values_; + /// A list of page indices for non-null pages. 
+ std::vector non_null_page_indices_; +}; + +class OffsetIndexImpl : public OffsetIndex { + public: + explicit OffsetIndexImpl(const format::OffsetIndex& offset_index) { + page_locations_.reserve(offset_index.page_locations.size()); + for (const auto& page_location : offset_index.page_locations) { + page_locations_.emplace_back(PageLocation{page_location.offset, + page_location.compressed_page_size, + page_location.first_row_index}); + } + } + + const std::vector& page_locations() const override { + return page_locations_; + } + + private: + std::vector page_locations_; +}; + +} // namespace + +// ---------------------------------------------------------------------- +// Public factory functions + +std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties) { + format::ColumnIndex column_index; + ThriftDeserializer deserializer(properties); + deserializer.DeserializeMessage(reinterpret_cast(serialized_index), + &index_len, &column_index); + switch (descr.physical_type()) { + case Type::BOOLEAN: + return std::make_unique>(descr, column_index); + case Type::INT32: + return std::make_unique>(descr, column_index); + case Type::INT64: + return std::make_unique>(descr, column_index); + case Type::INT96: + return std::make_unique>(descr, column_index); + case Type::FLOAT: + return std::make_unique>(descr, column_index); + case Type::DOUBLE: + return std::make_unique>(descr, column_index); + case Type::BYTE_ARRAY: + return std::make_unique>(descr, column_index); + case Type::FIXED_LEN_BYTE_ARRAY: + return std::make_unique>(descr, column_index); + case Type::UNDEFINED: + return nullptr; + } + ::arrow::Unreachable("Cannot make ColumnIndex of an unknown type"); + return nullptr; +} + +std::unique_ptr OffsetIndex::Make(const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties) { + format::OffsetIndex offset_index; + ThriftDeserializer 
deserializer(properties); + deserializer.DeserializeMessage(reinterpret_cast(serialized_index), + &index_len, &offset_index); + return std::make_unique(offset_index); +} + +} // namespace parquet diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h new file mode 100644 index 00000000000..13dae40f56c --- /dev/null +++ b/cpp/src/parquet/page_index.h @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/types.h" + +#include + +namespace parquet { + +class ColumnDescriptor; +class ReaderProperties; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief A bitmap with a bit set for each data page that has only null values. + /// + /// The length of this vector is equal to the number of data pages in the column. 
+ virtual const std::vector& null_pages() const = 0; + + /// \brief A vector of encoded lower bounds for each data page in this column. + /// + /// `null_pages` should be inspected first, as only pages with non-null values + /// may have their lower bounds populated. + virtual const std::vector& encoded_min_values() const = 0; + + /// \brief A vector of encoded upper bounds for each data page in this column. + /// + /// `null_pages` should be inspected first, as only pages with non-null values + /// may have their upper bounds populated. + virtual const std::vector& encoded_max_values() const = 0; + + /// \brief The ordering of lower and upper bounds. + /// + /// The boundary order applies across all lower bounds, and all upper bounds, + /// respectively. However, the order between lower bounds and upper bounds + /// cannot be derived from this. + virtual BoundaryOrder::type boundary_order() const = 0; + + /// \brief Whether per-page null count information is available. + virtual bool has_null_counts() const = 0; + + /// \brief An optional vector with the number of null values in each data page. + /// + /// `has_null_counts` should be called first to determine if this information is + /// available. + virtual const std::vector& null_counts() const = 0; + + /// \brief A vector of page indices for non-null pages. + virtual const std::vector& non_null_page_indices() const = 0; +}; + +/// \brief Typed implementation of ColumnIndex. +template +class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { + public: + using T = typename DType::c_type; + + /// \brief A vector of lower bounds for each data page in this column. + /// + /// This is like `encoded_min_values`, but with the values decoded according to + /// the column's physical type. + /// `min_values` and `max_values` can be used together with `boundary_order` + /// in order to prune some data pages when searching for specific values. 
+ virtual const std::vector& min_values() const = 0; + + /// \brief A vector of upper bounds for each data page in this column. + /// + /// Just like `min_values`, but for upper bounds instead of lower bounds. + virtual const std::vector& max_values() const = 0; +}; + +using BoolColumnIndex = TypedColumnIndex; +using Int32ColumnIndex = TypedColumnIndex; +using Int64ColumnIndex = TypedColumnIndex; +using FloatColumnIndex = TypedColumnIndex; +using DoubleColumnIndex = TypedColumnIndex; +using ByteArrayColumnIndex = TypedColumnIndex; +using FLBAColumnIndex = TypedColumnIndex; + +/// \brief PageLocation is a proxy around format::PageLocation. +struct PARQUET_EXPORT PageLocation { + /// File offset of the data page. + int64_t offset; + /// Total compressed size of the data page and header. + int32_t compressed_page_size; + /// Row id of the first row in the page within the row group. + int64_t first_row_index; +}; + +/// \brief OffsetIndex is a proxy around format::OffsetIndex. +class PARQUET_EXPORT OffsetIndex { + public: + /// \brief Create an OffsetIndex from a serialized thrift message. + static std::unique_ptr Make(const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~OffsetIndex() = default; + + /// \brief A vector of locations for each data page in this column. + virtual const std::vector& page_locations() const = 0; +}; + +} // namespace parquet diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc new file mode 100644 index 00000000000..6d1cdc2c97a --- /dev/null +++ b/cpp/src/parquet/page_index_test.cc @@ -0,0 +1,259 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" + +#include + +#include "arrow/io/file.h" +#include "parquet/file_reader.h" +#include "parquet/schema.h" +#include "parquet/test_util.h" +#include "parquet/thrift_internal.h" + +namespace parquet { + +TEST(PageIndex, ReadOffsetIndex) { + std::string dir_string(parquet::test::get_data_dir()); + std::string path = dir_string + "/alltypes_tiny_pages.parquet"; + auto reader = ParquetFileReader::OpenFile(path, false); + auto file_metadata = reader->metadata(); + + // Get offset index location to column 0 of row group 0. + const int row_group_id = 0; + const int column_id = 0; + ASSERT_LT(row_group_id, file_metadata->num_row_groups()); + ASSERT_LT(column_id, file_metadata->num_columns()); + auto index_location = file_metadata->RowGroup(row_group_id) + ->ColumnChunk(column_id) + ->GetOffsetIndexLocation(); + ASSERT_TRUE(index_location.has_value()); + + // Read serialized offset index from the file. + std::shared_ptr<::arrow::io::RandomAccessFile> source; + PARQUET_ASSIGN_OR_THROW(source, ::arrow::io::ReadableFile::Open(path)); + PARQUET_ASSIGN_OR_THROW(auto buffer, + source->ReadAt(index_location->offset, index_location->length)); + PARQUET_THROW_NOT_OK(source->Close()); + + // Deserialize offset index. 
+ auto properties = default_reader_properties(); + std::unique_ptr offset_index = OffsetIndex::Make( + buffer->data(), static_cast(buffer->size()), properties); + + // Verify only partial data as it contains 325 pages in total. + const size_t num_pages = 325; + const std::vector page_indices = {0, 100, 200, 300}; + const std::vector page_locations = { + PageLocation{4, 109, 0}, PageLocation{11480, 133, 2244}, + PageLocation{22980, 133, 4494}, PageLocation{34480, 133, 6744}}; + + ASSERT_EQ(num_pages, offset_index->page_locations().size()); + for (size_t i = 0; i < page_indices.size(); ++i) { + size_t page_id = page_indices.at(i); + const auto& read_page_location = offset_index->page_locations().at(page_id); + const auto& expected_page_location = page_locations.at(i); + ASSERT_EQ(expected_page_location.offset, read_page_location.offset); + ASSERT_EQ(expected_page_location.compressed_page_size, + read_page_location.compressed_page_size); + ASSERT_EQ(expected_page_location.first_row_index, read_page_location.first_row_index); + } +} + +template +void TestReadTypedColumnIndex(const std::string& file_name, int column_id, + size_t num_pages, BoundaryOrder::type boundary_order, + const std::vector& page_indices, + const std::vector& null_pages, + const std::vector& min_values, + const std::vector& max_values, + bool has_null_counts = false, + const std::vector& null_counts = {}) { + std::string dir_string(parquet::test::get_data_dir()); + std::string path = dir_string + "/" + file_name; + auto reader = ParquetFileReader::OpenFile(path, false); + auto file_metadata = reader->metadata(); + + // Get column index location to a specific column chunk. 
+ const int row_group_id = 0; + ASSERT_LT(row_group_id, file_metadata->num_row_groups()); + ASSERT_LT(column_id, file_metadata->num_columns()); + auto index_location = file_metadata->RowGroup(row_group_id) + ->ColumnChunk(column_id) + ->GetColumnIndexLocation(); + ASSERT_TRUE(index_location.has_value()); + + // Read serialized column index from the file. + std::shared_ptr<::arrow::io::RandomAccessFile> source; + PARQUET_ASSIGN_OR_THROW(source, ::arrow::io::ReadableFile::Open(path)); + PARQUET_ASSIGN_OR_THROW(auto buffer, + source->ReadAt(index_location->offset, index_location->length)); + PARQUET_THROW_NOT_OK(source->Close()); + + // Deserialize column index. + auto properties = default_reader_properties(); + auto descr = file_metadata->schema()->Column(column_id); + std::unique_ptr column_index = ColumnIndex::Make( + *descr, buffer->data(), static_cast(buffer->size()), properties); + auto typed_column_index = dynamic_cast*>(column_index.get()); + ASSERT_TRUE(typed_column_index != nullptr); + + // Verify only partial data as there are too many pages. + ASSERT_EQ(num_pages, column_index->null_pages().size()); + ASSERT_EQ(has_null_counts, column_index->has_null_counts()); + ASSERT_EQ(boundary_order, column_index->boundary_order()); + for (size_t i = 0; i < page_indices.size(); ++i) { + size_t page_id = page_indices.at(i); + ASSERT_EQ(null_pages.at(i), column_index->null_pages().at(page_id)); + if (has_null_counts) { + ASSERT_EQ(null_counts.at(i), column_index->null_counts().at(page_id)); + } + // min/max values are only meaningful for non-null pages. 
+ if (!null_pages.at(i)) { + if constexpr (std::is_same_v) { + ASSERT_DOUBLE_EQ(min_values.at(i), typed_column_index->min_values().at(page_id)); + ASSERT_DOUBLE_EQ(max_values.at(i), typed_column_index->max_values().at(page_id)); + } else if constexpr (std::is_same_v) { + ASSERT_FLOAT_EQ(min_values.at(i), typed_column_index->min_values().at(page_id)); + ASSERT_FLOAT_EQ(max_values.at(i), typed_column_index->max_values().at(page_id)); + } else if constexpr (std::is_same_v) { + auto len = descr->type_length(); + ASSERT_EQ(0, ::memcmp(min_values.at(i).ptr, + typed_column_index->min_values().at(page_id).ptr, len)); + ASSERT_EQ(0, ::memcmp(max_values.at(i).ptr, + typed_column_index->max_values().at(page_id).ptr, len)); + } else { + ASSERT_EQ(min_values.at(i), typed_column_index->min_values().at(page_id)); + ASSERT_EQ(max_values.at(i), typed_column_index->max_values().at(page_id)); + } + } + } +} + +TEST(PageIndex, ReadInt64ColumnIndex) { + const int column_id = 5; + const size_t num_pages = 528; + const BoundaryOrder::type boundary_order = BoundaryOrder::Unordered; + const std::vector page_indices = {0, 99, 426, 520}; + const std::vector null_pages = {false, false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {0, 0, 0, 0}; + const std::vector min_values = {0, 10, 0, 0}; + const std::vector max_values = {90, 90, 80, 70}; + + TestReadTypedColumnIndex( + "alltypes_tiny_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); +} + +TEST(PageIndex, ReadDoubleColumnIndex) { + const int column_id = 7; + const size_t num_pages = 528; + const BoundaryOrder::type boundary_order = BoundaryOrder::Unordered; + const std::vector page_indices = {0, 51, 212, 527}; + const std::vector null_pages = {false, false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {0, 0, 0, 0}; + const std::vector min_values = {-0, 30.3, 10.1, 40.4}; + 
const std::vector max_values = {90.9, 90.9, 90.9, 60.6}; + + TestReadTypedColumnIndex( + "alltypes_tiny_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); +} + +TEST(PageIndex, ReadByteArrayColumnIndex) { + const int column_id = 9; + const size_t num_pages = 352; + const BoundaryOrder::type boundary_order = BoundaryOrder::Ascending; + const std::vector page_indices = {0, 128, 256}; + const std::vector null_pages = {false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {0, 0, 0}; + + // All min values are "0" and max values are "9". + const std::string_view min_value = "0"; + const std::string_view max_value = "9"; + const std::vector min_values = {ByteArray{min_value}, ByteArray{min_value}, + ByteArray{min_value}}; + const std::vector max_values = {ByteArray{max_value}, ByteArray{max_value}, + ByteArray{max_value}}; + + TestReadTypedColumnIndex( + "alltypes_tiny_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); +} + +TEST(PageIndex, ReadBoolColumnIndex) { + const int column_id = 1; + const size_t num_pages = 82; + const BoundaryOrder::type boundary_order = BoundaryOrder::Ascending; + const std::vector page_indices = {0, 16, 64}; + const std::vector null_pages = {false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {0, 0, 0}; + const std::vector min_values = {false, false, false}; + const std::vector max_values = {true, true, true}; + + TestReadTypedColumnIndex( + "alltypes_tiny_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); +} + +TEST(PageIndex, ReadFixedLengthByteArrayColumnIndex) { + auto to_flba = [](const char* ptr) { + return FLBA{reinterpret_cast(ptr)}; + }; + + const int column_id = 0; + const size_t num_pages = 10; + 
const BoundaryOrder::type boundary_order = BoundaryOrder::Descending; + const std::vector page_indices = {0, 4, 8}; + const std::vector null_pages = {false, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {9, 13, 9}; + const std::vector min_literals = {"\x00\x00\x03\x85", "\x00\x00\x01\xF5", + "\x00\x00\x00\x65"}; + const std::vector max_literals = {"\x00\x00\x03\xE8", "\x00\x00\x02\x58", + "\x00\x00\x00\xC8"}; + const std::vector min_values = { + to_flba(min_literals[0]), to_flba(min_literals[1]), to_flba(min_literals[2])}; + const std::vector max_values = { + to_flba(max_literals[0]), to_flba(max_literals[1]), to_flba(max_literals[2])}; + + TestReadTypedColumnIndex( + "fixed_length_byte_array.parquet", column_id, num_pages, boundary_order, + page_indices, null_pages, min_values, max_values, has_null_counts, null_counts); +} + +TEST(PageIndex, ReadColumnIndexWithNullPage) { + const int column_id = 0; + const size_t num_pages = 10; + const BoundaryOrder::type boundary_order = BoundaryOrder::Unordered; + const std::vector page_indices = {2, 4, 8}; + const std::vector null_pages = {true, false, false}; + const bool has_null_counts = true; + const std::vector null_counts = {100, 16, 8}; + const std::vector min_values = {0, -2048691758, -2046900272}; + const std::vector max_values = {0, 2143189382, 2087168549}; + + TestReadTypedColumnIndex( + "int32_with_null_pages.parquet", column_id, num_pages, boundary_order, page_indices, + null_pages, min_values, max_values, has_null_counts, null_counts); +} + +} // namespace parquet diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 23d7bae59b7..9cc702dfcdd 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -100,6 +100,10 @@ static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type } } +static inline BoundaryOrder::type FromThriftUnsafe(format::BoundaryOrder::type type) { + return 
static_cast(type); +} + namespace internal { template @@ -130,6 +134,11 @@ struct ThriftEnumTypeTraits<::parquet::format::PageType::type> { using ParquetEnum = PageType; }; +template <> +struct ThriftEnumTypeTraits<::parquet::format::BoundaryOrder::type> { + using ParquetEnum = BoundaryOrder; +}; + // If the parquet file is corrupted it is possible the enum value decoded // will not be in the range of defined values, which is undefined behaviour. // This facility prevents this by loading the value as the underlying type diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 183a3705291..6ec6870d3a0 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -532,6 +532,17 @@ class ColumnOrder { ColumnOrder::type column_order_; }; +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +struct BoundaryOrder { + enum type { + Unordered = 0, + Ascending = 1, + Descending = 2, + // Should always be last element + UNDEFINED = 3 + }; +}; + // ---------------------------------------------------------------------- struct ByteArray { diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index e13af117de7..8a3d3fd5ff7 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit e13af117de7c4f0a4d9908ae3827b3ab119868f3 +Subproject commit 8a3d3fd5ff7691ee07ca9802df66290a3106e4b7 diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 5de5799e0d8..23a9657fd41 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -567,3 +567,17 @@ More specifically, Parquet C++ supports: supported. * EncryptionWithFooterKey and EncryptionWithColumnKey modes. * Encrypted Footer and Plaintext Footer modes. 
+ +Miscellaneous +------------- + ++--------------------------+----------+----------+---------+ +| Feature | Reading | Writing | Notes | ++==========================+==========+==========+=========+ +| Column Index | ✓ | | \(1) | ++--------------------------+----------+----------+---------+ +| Offset Index | ✓ | | \(1) | ++--------------------------+----------+----------+---------+ + +* \(1) Access to the Column and Offset Index structures is provided, but + data read APIs do not currently make any use of them.