Skip to content

Commit

Permalink
fix some comment
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Oct 10, 2023
1 parent 7c4ff4e commit feccee9
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 31 deletions.
13 changes: 4 additions & 9 deletions cpp/src/parquet/bloom_filter_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder {
/// deletes all bloom filters after they have been flushed.
void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) override;

void Finish() override { finished_ = true; }

private:
/// Make sure column ordinal is not out of bound and the builder is in good state.
void CheckState(int32_t column_ordinal) const {
Expand Down Expand Up @@ -100,7 +98,7 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin
if (bloom_filter_options_opt == std::nullopt) {
return nullptr;
}
BloomFilterOptions& bloom_filter_options = bloom_filter_options_opt.value();
BloomFilterOptions& bloom_filter_options = *bloom_filter_options_opt;
std::unique_ptr<BloomFilter>& bloom_filter = file_bloom_filters_.back()[column_ordinal];
if (bloom_filter == nullptr) {
auto block_split_bloom_filter =
Expand All @@ -114,13 +112,10 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin

void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) {
if (!finished_) {
throw ParquetException("Cannot call WriteTo() to unfinished PageIndexBuilder.");
}
if (file_bloom_filters_.empty()) {
// Return quickly if there is no bloom filter
return;
if (finished_) {
throw ParquetException("Cannot call WriteTo() multiple times.");
}
finished_ = true;

for (size_t row_group_ordinal = 0; row_group_ordinal < file_bloom_filters_.size();
++row_group_ordinal) {
Expand Down
23 changes: 15 additions & 8 deletions cpp/src/parquet/bloom_filter_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ struct BloomFilterOptions;
struct BloomFilterLocation;

/// \brief Interface for collecting bloom filter of a parquet file.
///
/// ```
/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
/// for (int i = 0; i < num_row_groups; i++) {
/// bloom_filter_builder->AppendRowGroup();
/// auto* bloom_filter =
/// bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
/// // Add bloom filter entries.
/// // ...
/// }
/// bloom_filter_builder->WriteTo(sink, location);
/// ```
class PARQUET_EXPORT BloomFilterBuilder {
public:
/// \brief API convenience to create a BloomFilterBuilder.
Expand All @@ -36,7 +48,7 @@ class PARQUET_EXPORT BloomFilterBuilder {

/// Append a new row group to host all incoming bloom filters.
///
/// This method must be called before Finish.
/// This method must be called before WriteTo.
virtual void AppendRowGroup() = 0;

/// \brief Get the BloomFilter from column ordinal.
Expand All @@ -49,18 +61,13 @@ class PARQUET_EXPORT BloomFilterBuilder {

/// \brief Write the bloom filter to sink.
///
/// The bloom filter must have been finished first.
/// The bloom filter cannot be modified after this method is called.
///
/// \param[out] sink The output stream to write the bloom filter.
/// \param[out] location The location of all bloom filter to the start of sink.
/// \param[out] location The location of all bloom filter relative to the start of sink.
virtual void WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) = 0;

/// \brief Complete the bloom filter builder and no more write is allowed.
///
/// This method must be called before WriteTo.
virtual void Finish() = 0;

virtual ~BloomFilterBuilder() = default;
};

Expand Down
6 changes: 2 additions & 4 deletions cpp/src/parquet/bloom_filter_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) {
for (uint64_t hash : insert_hashes) {
bloom_filter->InsertHash(hash);
}
builder->Finish();
auto sink = CreateOutputStream();
BloomFilterLocation location;
builder->WriteTo(sink.get(), &location);
Expand Down Expand Up @@ -140,11 +139,10 @@ TEST(BloomFilterBuilderTest, InvalidOperations) {
builder->GetOrCreateBloomFilter(0);
auto sink = CreateOutputStream();
BloomFilterLocation location;
// WriteTo() before Finish() expect throw.
ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException);
builder->Finish();
builder->WriteTo(sink.get(), &location);
EXPECT_EQ(1, location.bloom_filter_location.size());
// Multiple WriteTo() expect throw.
ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException);
}

} // namespace parquet::test
1 change: 0 additions & 1 deletion cpp/src/parquet/file_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,6 @@ class FileSerializer : public ParquetFileWriter::Contents {
// Serialize page index after all row groups have been written and report
// location to the file metadata.
BloomFilterLocation bloom_filter_location;
bloom_filter_builder_->Finish();
bloom_filter_builder_->WriteTo(sink_.get(), &bloom_filter_location);
metadata_->SetBloomFilterLocation(bloom_filter_location);
// Release the memory for BloomFilter.
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1837,12 +1837,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
const auto& row_group_bloom_filter_location = iter->second;
for (size_t i = 0; i < row_group_bloom_filter_location.size(); ++i) {
DCHECK(i < row_group_metadata.columns.size());
auto& column = row_group_metadata.columns.at(i);
auto& column = row_group_metadata.columns[i];
auto& column_metadata = column.meta_data;
const auto& bloom_filter_location = row_group_bloom_filter_location.at(i);
const auto& bloom_filter_location = row_group_bloom_filter_location[i];
if (bloom_filter_location.has_value()) {
column_metadata.__set_bloom_filter_offset(bloom_filter_location->offset);
// TODO(mwish): Allow this after Parquet 2.10 is released.
// TODO(mwish): GH-38181: Allow this after Parquet 2.10 is released.
// column_metadata.__set_bloom_filter_length(bloom_filter_location->length);
}
}
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,9 @@ static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024;
static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05;

struct PARQUET_EXPORT BloomFilterOptions {
/// The number of distinct values to expect to be inserted into the bloom.
/// Expected number of distinct values to be inserted into the bloom filter.
int32_t ndv = DEFAULT_BLOOM_FILTER_NDV;
/// The false positive probability expected from the bloom.
/// False-positive probability of the bloom filter.
double fpp = DEFAULT_BLOOM_FILTER_FPP;
};

Expand Down Expand Up @@ -199,7 +199,7 @@ class PARQUET_EXPORT ColumnProperties {
if (bloom_filter_options) {
if (bloom_filter_options->fpp > 1.0 || bloom_filter_options->fpp < 0.0) {
throw ParquetException(
"Bloom Filter False positive probability must be between 0.0 and 1.0, got " +
"Bloom filter false-positive probability must fall in [0.0, 1.0], got " +
std::to_string(bloom_filter_options->fpp));
}
}
Expand Down Expand Up @@ -582,7 +582,7 @@ class PARQUET_EXPORT WriterProperties {
/// Default disabled.
///
/// Note: Currently we don't support bloom filter for boolean columns,
/// so if enable bloom filter for boolean columns, it will be ignored.
/// so it will be ignored if the column is of boolean type.
Builder* enable_bloom_filter_options(BloomFilterOptions bloom_filter_options,
const std::string& path) {
bloom_filter_options_[path] = bloom_filter_options;
Expand All @@ -594,7 +594,7 @@ class PARQUET_EXPORT WriterProperties {
/// Default disabled.
///
/// Note: Currently we don't support bloom filter for boolean columns,
/// so if enable bloom filter for boolean columns, it will be ignored.
/// so it will be ignored if the column is of boolean type.
Builder* enable_bloom_filter_options(
BloomFilterOptions bloom_filter_options,
const std::shared_ptr<schema::ColumnPath>& path) {
Expand Down Expand Up @@ -835,7 +835,7 @@ class PARQUET_EXPORT WriterProperties {
// default_column_properties_.bloom_filter_enabled is always false and
// cannot be altered by user. Thus we can safely skip checking it here.
return std::any_of(column_properties_.begin(), column_properties_.end(),
[](auto& p) { return p.second.bloom_filter_enabled(); });
[](const auto& p) { return p.second.bloom_filter_enabled(); });
}

std::optional<BloomFilterOptions> bloom_filter_options(
Expand Down

0 comments on commit feccee9

Please sign in to comment.