-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-18420: [C++][Parquet] Introduce ColumnIndex & OffsetIndex #14803
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
1cf9d9e
ARROW-18420: [C++][Parquet] Introduce ColumnIndex & OffsetIndex
wgtmac 655a786
renaming functions and refine some impls
wgtmac 5175b45
remove per page accessors
wgtmac 80db272
fix lint
wgtmac 776a602
add back GetNonNullPageIndices
wgtmac 275c331
refine comments and use LoadEnumSafe
wgtmac c0e7c25
missing return after Unreachable
wgtmac 4d47e04
remove default in switch but unreachable is required for lint
wgtmac 6e2fd44
fix windows build
wgtmac 80c773c
make windows build happy
wgtmac 9fac4e9
make encoded & decoded values consistent
wgtmac ed31adb
add tests
wgtmac 3d3ec30
fix signature of Decode
wgtmac b3e577f
rebase and add test to cover null_page = true
wgtmac e14ff24
update parquet-testing submodule
wgtmac 3e0aeed
add test for FLBA type
wgtmac 3fa5d62
Nits
pitrou 084aad1
test int32_with_null_pages.parquet
wgtmac 6c3748e
Add cursory doc
pitrou File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,234 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "parquet/page_index.h" | ||
| #include "parquet/encoding.h" | ||
| #include "parquet/exception.h" | ||
| #include "parquet/schema.h" | ||
| #include "parquet/statistics.h" | ||
| #include "parquet/thrift_internal.h" | ||
|
|
||
| #include "arrow/util/unreachable.h" | ||
|
|
||
| #include <limits> | ||
| #include <numeric> | ||
|
|
||
| namespace parquet { | ||
|
|
||
| namespace { | ||
|
|
||
| template <typename DType> | ||
| void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, | ||
| const std::string& input, std::vector<typename DType::c_type>* output, | ||
| size_t output_index) { | ||
| if (ARROW_PREDICT_FALSE(output_index >= output->size())) { | ||
| throw ParquetException("Index out of bound"); | ||
| } | ||
|
|
||
| decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(input.c_str()), | ||
| static_cast<int>(input.size())); | ||
| const auto num_values = decoder->Decode(&output->at(output_index), /*max_values=*/1); | ||
| if (ARROW_PREDICT_FALSE(num_values != 1)) { | ||
| throw ParquetException("Could not decode statistics value"); | ||
| } | ||
| } | ||
|
|
||
| template <> | ||
| void Decode<BooleanType>(std::unique_ptr<BooleanDecoder>& decoder, | ||
| const std::string& input, std::vector<bool>* output, | ||
| size_t output_index) { | ||
| if (ARROW_PREDICT_FALSE(output_index >= output->size())) { | ||
| throw ParquetException("Index out of bound"); | ||
| } | ||
|
|
||
| bool value; | ||
| decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(input.c_str()), | ||
| static_cast<int>(input.size())); | ||
| const auto num_values = decoder->Decode(&value, /*max_values=*/1); | ||
| if (ARROW_PREDICT_FALSE(num_values != 1)) { | ||
| throw ParquetException("Could not decode statistics value"); | ||
| } | ||
| output->at(output_index) = value; | ||
| } | ||
|
|
||
| template <> | ||
| void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& input, | ||
| std::vector<ByteArray>* output, size_t output_index) { | ||
| if (ARROW_PREDICT_FALSE(output_index >= output->size())) { | ||
| throw ParquetException("Index out of bound"); | ||
| } | ||
|
|
||
| if (ARROW_PREDICT_FALSE(input.size() > | ||
| static_cast<size_t>(std::numeric_limits<uint32_t>::max()))) { | ||
| throw ParquetException("Invalid encoded byte array length"); | ||
| } | ||
|
|
||
| output->at(output_index) = {/*len=*/static_cast<uint32_t>(input.size()), | ||
| /*ptr=*/reinterpret_cast<const uint8_t*>(input.data())}; | ||
| } | ||
|
|
||
| template <typename DType> | ||
| class TypedColumnIndexImpl : public TypedColumnIndex<DType> { | ||
| public: | ||
| using T = typename DType::c_type; | ||
|
|
||
| TypedColumnIndexImpl(const ColumnDescriptor& descr, | ||
| const format::ColumnIndex& column_index) | ||
| : column_index_(column_index) { | ||
| // Make sure the number of pages is valid and it does not overflow to int32_t. | ||
| const size_t num_pages = column_index_.null_pages.size(); | ||
| if (num_pages >= static_cast<size_t>(std::numeric_limits<int32_t>::max()) || | ||
| column_index_.min_values.size() != num_pages || | ||
| column_index_.max_values.size() != num_pages || | ||
| (column_index_.__isset.null_counts && | ||
| column_index_.null_counts.size() != num_pages)) { | ||
| throw ParquetException("Invalid column index"); | ||
| } | ||
|
|
||
| const size_t num_non_null_pages = static_cast<size_t>(std::accumulate( | ||
| column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0, | ||
| [](int32_t num_non_null_pages, bool null_page) { | ||
| return num_non_null_pages + (null_page ? 0 : 1); | ||
| })); | ||
| DCHECK_LE(num_non_null_pages, num_pages); | ||
wgtmac marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| // Allocate slots for decoded values. | ||
| min_values_.resize(num_pages); | ||
| max_values_.resize(num_pages); | ||
| non_null_page_indices_.reserve(num_non_null_pages); | ||
|
|
||
| // Decode min and max values according to the physical type. | ||
| // Note that null page are skipped. | ||
| auto plain_decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, &descr); | ||
| for (size_t i = 0; i < num_pages; ++i) { | ||
| if (!column_index_.null_pages[i]) { | ||
| // The check on `num_pages` has guaranteed the cast below is safe. | ||
| non_null_page_indices_.emplace_back(static_cast<int32_t>(i)); | ||
| Decode<DType>(plain_decoder, column_index_.min_values[i], &min_values_, i); | ||
| Decode<DType>(plain_decoder, column_index_.max_values[i], &max_values_, i); | ||
| } | ||
| } | ||
| DCHECK_EQ(num_non_null_pages, non_null_page_indices_.size()); | ||
| } | ||
|
|
||
| const std::vector<bool>& null_pages() const override { | ||
| return column_index_.null_pages; | ||
| } | ||
|
|
||
| const std::vector<std::string>& encoded_min_values() const override { | ||
| return column_index_.min_values; | ||
| } | ||
|
|
||
| const std::vector<std::string>& encoded_max_values() const override { | ||
| return column_index_.max_values; | ||
| } | ||
|
|
||
| BoundaryOrder::type boundary_order() const override { | ||
| return LoadEnumSafe(&column_index_.boundary_order); | ||
| } | ||
|
|
||
| bool has_null_counts() const override { return column_index_.__isset.null_counts; } | ||
|
|
||
| const std::vector<int64_t>& null_counts() const override { | ||
| return column_index_.null_counts; | ||
| } | ||
|
|
||
| const std::vector<int32_t>& non_null_page_indices() const override { | ||
| return non_null_page_indices_; | ||
| } | ||
|
|
||
| const std::vector<T>& min_values() const override { return min_values_; } | ||
|
|
||
| const std::vector<T>& max_values() const override { return max_values_; } | ||
|
|
||
| private: | ||
| /// Wrapped thrift column index. | ||
| const format::ColumnIndex column_index_; | ||
| /// Decoded typed min/max values. Undefined for null pages. | ||
| std::vector<T> min_values_; | ||
| std::vector<T> max_values_; | ||
| /// A list of page indices for non-null pages. | ||
| std::vector<int32_t> non_null_page_indices_; | ||
| }; | ||
|
|
||
| class OffsetIndexImpl : public OffsetIndex { | ||
| public: | ||
| explicit OffsetIndexImpl(const format::OffsetIndex& offset_index) { | ||
| page_locations_.reserve(offset_index.page_locations.size()); | ||
| for (const auto& page_location : offset_index.page_locations) { | ||
wgtmac marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| page_locations_.emplace_back(PageLocation{page_location.offset, | ||
| page_location.compressed_page_size, | ||
| page_location.first_row_index}); | ||
| } | ||
| } | ||
|
|
||
| const std::vector<PageLocation>& page_locations() const override { | ||
| return page_locations_; | ||
| } | ||
|
|
||
| private: | ||
| std::vector<PageLocation> page_locations_; | ||
| }; | ||
|
|
||
| } // namespace | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
| // Public factory functions | ||
|
|
||
| std::unique_ptr<ColumnIndex> ColumnIndex::Make(const ColumnDescriptor& descr, | ||
| const void* serialized_index, | ||
| uint32_t index_len, | ||
| const ReaderProperties& properties) { | ||
| format::ColumnIndex column_index; | ||
| ThriftDeserializer deserializer(properties); | ||
| deserializer.DeserializeMessage(reinterpret_cast<const uint8_t*>(serialized_index), | ||
| &index_len, &column_index); | ||
| switch (descr.physical_type()) { | ||
| case Type::BOOLEAN: | ||
| return std::make_unique<TypedColumnIndexImpl<BooleanType>>(descr, column_index); | ||
| case Type::INT32: | ||
| return std::make_unique<TypedColumnIndexImpl<Int32Type>>(descr, column_index); | ||
| case Type::INT64: | ||
| return std::make_unique<TypedColumnIndexImpl<Int64Type>>(descr, column_index); | ||
| case Type::INT96: | ||
| return std::make_unique<TypedColumnIndexImpl<Int96Type>>(descr, column_index); | ||
| case Type::FLOAT: | ||
| return std::make_unique<TypedColumnIndexImpl<FloatType>>(descr, column_index); | ||
| case Type::DOUBLE: | ||
| return std::make_unique<TypedColumnIndexImpl<DoubleType>>(descr, column_index); | ||
| case Type::BYTE_ARRAY: | ||
| return std::make_unique<TypedColumnIndexImpl<ByteArrayType>>(descr, column_index); | ||
| case Type::FIXED_LEN_BYTE_ARRAY: | ||
| return std::make_unique<TypedColumnIndexImpl<FLBAType>>(descr, column_index); | ||
| case Type::UNDEFINED: | ||
| return nullptr; | ||
| } | ||
| ::arrow::Unreachable("Cannot make ColumnIndex of an unknown type"); | ||
| return nullptr; | ||
| } | ||
|
|
||
| std::unique_ptr<OffsetIndex> OffsetIndex::Make(const void* serialized_index, | ||
| uint32_t index_len, | ||
| const ReaderProperties& properties) { | ||
| format::OffsetIndex offset_index; | ||
| ThriftDeserializer deserializer(properties); | ||
| deserializer.DeserializeMessage(reinterpret_cast<const uint8_t*>(serialized_index), | ||
| &index_len, &offset_index); | ||
| return std::make_unique<OffsetIndexImpl>(offset_index); | ||
| } | ||
|
|
||
| } // namespace parquet | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.