Skip to content

Commit

Permalink
GH-45015: [C++][Parquet] Allow configuring the default footer read si…
Browse files Browse the repository at this point in the history
…ze (#45016)

### Rationale for this change
Reading the footer of a Parquet file whose file metadata is larger than 64KB can require multiple round trips to a high-latency file system like S3. Making this default read size configurable reduces the number of round trips when it is known up front that a file may have a large amount of metadata.

### What changes are included in this PR?
A `footer_read_size_` property is added to parquet `ReaderProperties` along with a getter/setter. This is then utilized in the file reader's `GetFooterReadSize` method.

* GitHub Issue: #45015

Lead-authored-by: Matt Topol <zotthewizard@gmail.com>
Co-authored-by: mwish <maplewish117@gmail.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
zeroshade and mapleFU authored Dec 16, 2024
1 parent da3c6dd commit 4467d41
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 3 deletions.
5 changes: 2 additions & 3 deletions cpp/src/parquet/file_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,6 @@ bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) {
}
} // namespace

// PARQUET-978: Read 64 KB from the end of the file by default so that the
// footer can usually be fetched with as few reads as possible.
static constexpr int64_t kDefaultFooterReadSize = int64_t{64} * 1024;
// Minimum size, in bytes, of a file footer. Files smaller than this are
// reported with the "smaller than the minimum file footer" error message.
// NOTE(review): presumably the metadata length field plus the magic bytes --
// confirm against the Parquet format specification.
static constexpr uint32_t kFooterSize{8};

// For PARQUET-816
Expand Down Expand Up @@ -482,7 +480,8 @@ class SerializedFile : public ParquetFileReader::Contents {
"Parquet file size is ", source_size_,
" bytes, smaller than the minimum file footer (", kFooterSize, " bytes)");
}
return std::min(source_size_, kDefaultFooterReadSize);

return std::min(static_cast<size_t>(source_size_), properties_.footer_read_size());
}

// Validate the magic bytes and get the length of the full footer.
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ constexpr int32_t kDefaultThriftStringSizeLimit = 100 * 1000 * 1000;
// kDefaultStringSizeLimit.
constexpr int32_t kDefaultThriftContainerSizeLimit = 1000 * 1000;

// PARQUET-978: Default number of bytes (64 KB) read from the end of the file
// when fetching the footer, chosen to minimize the number of footer reads.
constexpr int64_t kDefaultFooterReadSize = int64_t{64} * 1024;

class PARQUET_EXPORT ReaderProperties {
public:
explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
Expand Down Expand Up @@ -120,6 +123,12 @@ class PARQUET_EXPORT ReaderProperties {
page_checksum_verification_ = check_crc;
}

// Set the default number of bytes read when fetching the footer of a file.
// On high latency file systems, files with large metadata (>64KB) benefit from
// a larger value because fewer round-trips are needed to retrieve the entire
// file metadata.
// NOTE(review): the value is stored without validation; confirm how the file
// reader behaves when it is 0 or smaller than the minimum footer size.
void set_footer_read_size(size_t size) {
  footer_read_size_ = size;
}
// Return the currently configured default footer read size.
size_t footer_read_size() const {
  return footer_read_size_;
}

private:
MemoryPool* pool_;
int64_t buffer_size_ = kDefaultBufferSize;
Expand All @@ -129,6 +138,7 @@ class PARQUET_EXPORT ReaderProperties {
bool page_checksum_verification_ = false;
// Used with a RecordReader.
bool read_dense_for_nullable_ = false;
size_t footer_read_size_ = kDefaultFooterReadSize;
std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
};

Expand Down
1 change: 1 addition & 0 deletions cpp/src/parquet/properties_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ TEST(TestReaderProperties, Basics) {
ReaderProperties props;

ASSERT_EQ(props.buffer_size(), kDefaultBufferSize);
ASSERT_EQ(props.footer_read_size(), kDefaultFooterReadSize);
ASSERT_FALSE(props.is_buffered_stream_enabled());
ASSERT_FALSE(props.page_checksum_verification());
}
Expand Down

0 comments on commit 4467d41

Please sign in to comment.