Skip to content

Commit

Permalink
resolve comment
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Sep 11, 2023
1 parent f689716 commit 8e9cb16
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 20 deletions.
14 changes: 7 additions & 7 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5649,13 +5649,13 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) {
auto schema = ::arrow::schema(
{::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())});
auto table = ::arrow::TableFromJSON(schema, {R"([
[1, "a" ],
[2, "b" ],
[3, "c" ],
[null, "d"],
[5, null],
[6, "f" ]
])"});
[1, "a"],
[2, "b"],
[3, "c"],
[null, "d"],
[5, null],
[6, "f"]
])"});
WriteFile(writer_properties, table);

ReadBloomFilters(/*expect_num_row_groups=*/2);
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/parquet/file_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,9 @@ class FileSerializer : public ParquetFileWriter::Contents {
// In Parquet standard, the Bloom filter data can be stored before the page indexes
// after all row groups or stored between row groups. We choose to store it before
// the page indexes after all row groups.
// Also, Putting all bloom filters together may provide a good chance to coalesce
// I/Os of different bloom filters. Especially when only one column has enabled it,
// which is the common case.
WriteBloomFilter();
WritePageIndex();

Expand Down
4 changes: 1 addition & 3 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1835,9 +1835,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
if (iter != file_bloom_filter_location.cend()) {
const auto& row_group_bloom_filter_location = iter->second;
for (size_t i = 0; i < row_group_bloom_filter_location.size(); ++i) {
if (i >= row_group_metadata.columns.size()) {
throw ParquetException("Cannot find metadata for column ordinal ", i);
}
DCHECK(i < row_group_metadata.columns.size());
auto& column = row_group_metadata.columns.at(i);
auto& column_metadata = column.meta_data;
const auto& bloom_filter_location = row_group_bloom_filter_location.at(i);
Expand Down
17 changes: 7 additions & 10 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,26 +142,21 @@ static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false;
static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024;
static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05;

struct BloomFilterOptions {
struct PARQUET_EXPORT BloomFilterOptions {
/// The number of distinct values to expect to be inserted into the bloom.
int32_t ndv = DEFAULT_BLOOM_FILTER_NDV;
/// The false positive probability expected from the bloom.
double fpp = DEFAULT_BLOOM_FILTER_FPP;
};

static constexpr std::optional<BloomFilterOptions> DEFAULT_IS_BLOOM_FILTER_OPTIONS =
std::nullopt;

class PARQUET_EXPORT ColumnProperties {
public:
ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
Compression::type codec = DEFAULT_COMPRESSION_TYPE,
bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE,
bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED,
std::optional<BloomFilterOptions> bloom_filter_options =
DEFAULT_IS_BLOOM_FILTER_OPTIONS)
bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED)
: encoding_(encoding),
codec_(codec),
dictionary_enabled_(dictionary_enabled),
Expand Down Expand Up @@ -204,7 +199,8 @@ class PARQUET_EXPORT ColumnProperties {
if (bloom_filter_options) {
if (bloom_filter_options->fpp > 1.0 || bloom_filter_options->fpp < 0.0) {
throw ParquetException(
"Bloom Filter False positive probability must be between 0.0 and 1.0");
"Bloom Filter False positive probability must be between 0.0 and 1.0, got " +
std::to_string(bloom_filter_options->fpp));
}
}
bloom_filter_options_ = bloom_filter_options;
Expand Down Expand Up @@ -830,8 +826,9 @@ class PARQUET_EXPORT WriterProperties {
}

bool bloom_filter_enabled() const {
// Note: we disallow enable bloom filter by default for now.
// So no need to check `default_column_properties_.bloom_filter_enabled()`.
// Note: We do not encourage enabling bloom filter for all columns. So
// default_column_properties_.bloom_filter_enabled is always false and
// cannot be altered by user. Thus we can safely skip checking it here.
return std::any_of(column_properties_.begin(), column_properties_.end(),
[](auto& p) { return p.second.bloom_filter_enabled(); });
}
Expand Down

0 comments on commit 8e9cb16

Please sign in to comment.