From feccee9b38b58de7667e499b625801133c0d6af8 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 10 Oct 2023 19:38:57 +0800 Subject: [PATCH] fix some comment --- cpp/src/parquet/bloom_filter_builder.cc | 13 ++++------- cpp/src/parquet/bloom_filter_builder.h | 23 ++++++++++++------- .../bloom_filter_reader_writer_test.cc | 6 ++--- cpp/src/parquet/file_writer.cc | 1 - cpp/src/parquet/metadata.cc | 6 ++--- cpp/src/parquet/properties.h | 12 +++++----- 6 files changed, 30 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 3853fe9a1f8b9..6d7397af4ac7f 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -49,8 +49,6 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { /// deletes all bloom filters after they have been flushed. void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) override; - void Finish() override { finished_ = true; } - private: /// Make sure column ordinal is not out of bound and the builder is in good state. void CheckState(int32_t column_ordinal) const { @@ -100,7 +98,7 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin if (bloom_filter_options_opt == std::nullopt) { return nullptr; } - BloomFilterOptions& bloom_filter_options = bloom_filter_options_opt.value(); + BloomFilterOptions& bloom_filter_options = *bloom_filter_options_opt; std::unique_ptr& bloom_filter = file_bloom_filters_.back()[column_ordinal]; if (bloom_filter == nullptr) { auto block_split_bloom_filter = @@ -114,13 +112,10 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) { - if (!finished_) { - throw ParquetException("Cannot call WriteTo() to unfinished PageIndexBuilder."); - } - if (file_bloom_filters_.empty()) { - // Return quickly if there is no bloom filter - return; + if (finished_) { + throw ParquetException("Cannot call WriteTo() multiple times."); } + finished_ = true; for (size_t row_group_ordinal = 0; row_group_ordinal < file_bloom_filters_.size(); ++row_group_ordinal) { diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index f638ce78e870e..4853e5edd5d22 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -28,6 +28,18 @@ struct BloomFilterOptions; struct BloomFilterLocation; /// \brief Interface for collecting bloom filter of a parquet file. +/// +/// ``` +/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties); +/// for (int i = 0; i < num_row_groups; i++) { +/// bloom_filter_builder->AppendRowGroup(); +/// auto* bloom_filter = +/// bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column); +/// // Add bloom filter entries. +/// // ... +/// } +/// bloom_filter_builder->WriteTo(sink, location); +/// ``` class PARQUET_EXPORT BloomFilterBuilder { public: /// \brief API convenience to create a BloomFilterBuilder. @@ -36,7 +48,7 @@ class PARQUET_EXPORT BloomFilterBuilder { /// Append a new row group to host all incoming bloom filters. /// - /// This method must be called before Finish. + /// This method must be called before WriteTo. virtual void AppendRowGroup() = 0; /// \brief Get the BloomFilter from column ordinal. @@ -49,18 +61,13 @@ class PARQUET_EXPORT BloomFilterBuilder { /// \brief Write the bloom filter to sink. /// - /// The bloom filter must have been finished first. + /// The bloom filter cannot be modified after this method is called. /// /// \param[out] sink The output stream to write the bloom filter. - /// \param[out] location The location of all bloom filter to the start of sink. + /// \param[out] location The location of all bloom filter relative to the start of sink. virtual void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) = 0; - /// \brief Complete the bloom filter builder and no more write is allowed. - /// - /// This method must be called before WriteTo. - virtual void Finish() = 0; - virtual ~BloomFilterBuilder() = default; }; diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index 94d5c215a2c16..042968aed7ae2 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -94,7 +94,6 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { for (uint64_t hash : insert_hashes) { bloom_filter->InsertHash(hash); } - builder->Finish(); auto sink = CreateOutputStream(); BloomFilterLocation location; builder->WriteTo(sink.get(), &location); @@ -140,11 +139,10 @@ TEST(BloomFilterBuilderTest, InvalidOperations) { builder->GetOrCreateBloomFilter(0); auto sink = CreateOutputStream(); BloomFilterLocation location; - // WriteTo() before Finish() expect throw. - ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException); - builder->Finish(); builder->WriteTo(sink.get(), &location); EXPECT_EQ(1, location.bloom_filter_location.size()); + // Multiple WriteTo() expect throw. + ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException); } } // namespace parquet::test diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 6eed86ee1efa1..ba461e27298ad 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -511,7 +511,6 @@ class FileSerializer : public ParquetFileWriter::Contents { // Serialize page index after all row groups have been written and report // location to the file metadata. BloomFilterLocation bloom_filter_location; - bloom_filter_builder_->Finish(); bloom_filter_builder_->WriteTo(sink_.get(), &bloom_filter_location); metadata_->SetBloomFilterLocation(bloom_filter_location); // Release the memory for BloomFilter. diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 29b0ce0c457c9..9b0a85dfd05e0 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1837,12 +1837,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const auto& row_group_bloom_filter_location = iter->second; for (size_t i = 0; i < row_group_bloom_filter_location.size(); ++i) { DCHECK(i < row_group_metadata.columns.size()); - auto& column = row_group_metadata.columns.at(i); + auto& column = row_group_metadata.columns[i]; auto& column_metadata = column.meta_data; - const auto& bloom_filter_location = row_group_bloom_filter_location.at(i); + const auto& bloom_filter_location = row_group_bloom_filter_location[i]; if (bloom_filter_location.has_value()) { column_metadata.__set_bloom_filter_offset(bloom_filter_location->offset); - // TODO(mwish): Allow this after Parquet 2.10 is released. + // TODO(mwish): GH-38181: Allow this after Parquet 2.10 is released. // column_metadata.__set_bloom_filter_length(bloom_filter_location->length); } } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 5b18cf1a1f23b..61555e5e98675 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -143,9 +143,9 @@ static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024; static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05; struct PARQUET_EXPORT BloomFilterOptions { - /// The number of distinct values to expect to be inserted into the bloom. + /// Expected number of distinct values to be inserted into the bloom filter. int32_t ndv = DEFAULT_BLOOM_FILTER_NDV; - /// The false positive probability expected from the bloom. + /// False-positive probability of the bloom filter. double fpp = DEFAULT_BLOOM_FILTER_FPP; }; @@ -199,7 +199,7 @@ class PARQUET_EXPORT ColumnProperties { if (bloom_filter_options) { if (bloom_filter_options->fpp > 1.0 || bloom_filter_options->fpp < 0.0) { throw ParquetException( - "Bloom Filter False positive probability must be between 0.0 and 1.0, got " + + "Bloom filter false-positive probability must fall in [0.0, 1.0], got " + std::to_string(bloom_filter_options->fpp)); } } @@ -582,7 +582,7 @@ class PARQUET_EXPORT WriterProperties { /// Default disabled. /// /// Note: Currently we don't support bloom filter for boolean columns, - /// so if enable bloom filter for boolean columns, it will be ignored. + /// so it will be ignored if the column is of boolean type. Builder* enable_bloom_filter_options(BloomFilterOptions bloom_filter_options, const std::string& path) { bloom_filter_options_[path] = bloom_filter_options; @@ -594,7 +594,7 @@ class PARQUET_EXPORT WriterProperties { /// Default disabled. /// /// Note: Currently we don't support bloom filter for boolean columns, - /// so if enable bloom filter for boolean columns, it will be ignored. + /// so it will be ignored if the column is of boolean type. Builder* enable_bloom_filter_options( BloomFilterOptions bloom_filter_options, const std::shared_ptr& path) { @@ -835,7 +835,7 @@ class PARQUET_EXPORT WriterProperties { // default_column_properties_.bloom_filter_enabled is always false and // cannot be altered by user. Thus we can safely skip checking it here. return std::any_of(column_properties_.begin(), column_properties_.end(), - [](auto& p) { return p.second.bloom_filter_enabled(); }); + [](const auto& p) { return p.second.bloom_filter_enabled(); }); } std::optional bloom_filter_options(