From a266f8c8cf8481c1ed34288724bbbb750033b311 Mon Sep 17 00:00:00 2001 From: Nikhil Benesch Date: Mon, 18 Jun 2018 16:42:52 -0400 Subject: [PATCH] Add table property tracking number of range deletions Add a new table property, rocksdb.num.range-deletions, which tracks the number of range deletions in a block-based table. Range deletions are no longer counted in rocksdb.num.entries; as discovered in PR #3778, there are various code paths that implicitly assume that rocksdb.num.entries counts only true keys, not range deletions. --- HISTORY.md | 2 ++ db/builder.cc | 5 +-- db/compaction_job.cc | 11 +++--- db/db_properties_test.cc | 56 ++++++++++++++++++++++++------ include/rocksdb/table_properties.h | 3 ++ table/block_based_table_builder.cc | 3 +- table/meta_blocks.cc | 3 ++ table/table_properties.cc | 5 +++ 8 files changed, 69 insertions(+), 19 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c633014f6ed..7e9382f04db 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,9 +3,11 @@ ### Public API Change * For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed. * With LRUCache, when high_pri_pool_ratio > 0, midpoint insertion strategy will be enabled to put low-pri items to the tail of low-pri list (the midpoint) when they first inserted into the cache. This is to make cache entries never get hit age out faster, improving cache efficiency when large background scan presents. +* The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries. ### New Features * Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used. 
+* Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table. ### Bug Fixes * fix deadlock with enable_pipelined_write=true and max_successive_merges > 0 diff --git a/db/builder.cc b/db/builder.cc index e57ad0208be..d592e201237 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -159,7 +159,8 @@ Status BuildTable( nullptr /* upper_bound */, meta); // Finish and check for builder errors - bool empty = builder->NumEntries() == 0; + tp = builder->GetTableProperties(); + bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0; s = c_iter.status(); if (!s.ok() || empty) { builder->Abandon(); @@ -172,7 +173,7 @@ Status BuildTable( meta->fd.file_size = file_size; meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); - tp = builder->GetTableProperties(); + tp = builder->GetTableProperties(); // refresh now that builder is finished if (table_properties) { *table_properties = tp; } diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 8b9fe7a9723..8fd0c00aa13 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1217,7 +1217,12 @@ Status CompactionJob::FinishCompactionOutputFile( } sub_compact->outfile.reset(); - if (s.ok() && current_entries == 0) { + TableProperties tp; + if (s.ok()) { + tp = sub_compact->builder->GetTableProperties(); + } + + if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) { // If there is nothing to output, no necessary to generate a sst file. // This happens when the output level is bottom level, at the same time // the sub_compact output nothing. @@ -1236,10 +1241,8 @@ Status CompactionJob::FinishCompactionOutputFile( } ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); - TableProperties tp; - if (s.ok() && current_entries > 0) { + if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { // Output to event logger and fire events. 
- tp = sub_compact->builder->GetTableProperties(); sub_compact->current_output()->table_properties = std::make_shared<TableProperties>(tp); ROCKS_LOG_INFO(db_options_.info_log, diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index b38fe0352b8..819758e3f81 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -170,6 +170,7 @@ void ResetTableProperties(TableProperties* tp) { tp->raw_value_size = 0; tp->num_data_blocks = 0; tp->num_entries = 0; + tp->num_range_deletions = 0; } void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { @@ -178,15 +179,18 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { std::replace(tp_string.begin(), tp_string.end(), '=', ' '); ResetTableProperties(tp); sscanf(tp_string.c_str(), - "# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64 + "# data blocks %" SCNu64 " # entries %" SCNu64 + " # range deletions %" SCNu64 + " raw key size %" SCNu64 " raw average key size %lf " " raw value size %" SCNu64 " raw average value size %lf " " data block size %" SCNu64 " index block size (user-key? 
%" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, - &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size, - &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, - &tp->index_key_is_user_key, &tp->index_size, &tp->filter_size); + &tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions, + &tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double, + &tp->data_size, &tp->index_key_is_user_key, &tp->index_size, + &tp->filter_size); } void VerifySimilar(uint64_t a, uint64_t b, double bias) { @@ -217,20 +221,25 @@ void VerifyTableProperties(const TableProperties& base_tp, ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size); ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size); ASSERT_EQ(base_tp.num_entries, new_tp.num_entries); + ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions); } void GetExpectedTableProperties(TableProperties* expected_tp, const int kKeySize, const int kValueSize, - const int kKeysPerTable, const int kTableCount, + const int kKeysPerTable, + const int kRangeDeletionsPerTable, + const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize, const bool index_key_is_user_key) { const int kKeyCount = kTableCount * kKeysPerTable; + const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; const int kAvgSuccessorSize = kKeySize / 5; const int kEncodingSavePerKey = kKeySize / 4; - expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); - expected_tp->raw_value_size = kKeyCount * kValueSize; + expected_tp->raw_key_size = (kKeyCount + kRangeDeletionCount) * (kKeySize + 8); + expected_tp->raw_value_size = (kKeyCount + kRangeDeletionCount) * kValueSize; expected_tp->num_entries = kKeyCount; + expected_tp->num_range_deletions = kRangeDeletionCount; expected_tp->num_data_blocks = kTableCount * (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / @@ -291,6 +300,7 @@ TEST_F(DBPropertiesTest, ValidateSampleNumber) { TEST_F(DBPropertiesTest, 
AggregatedTableProperties) { for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) { + const int kRangeDeletionsPerTable = 5; const int kKeysPerTable = 100; const int kKeySize = 80; const int kValueSize = 200; @@ -309,12 +319,22 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { DestroyAndReopen(options); + // Hold open a snapshot to prevent range tombstones from being compacted + // away. + ManagedSnapshot snapshot(db_); + Random rnd(5632); for (int table = 1; table <= kTableCount; ++table) { for (int i = 0; i < kKeysPerTable; ++i) { db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); } + for (int i = 0; i < kRangeDeletionsPerTable; i++) { + std::string start = RandomString(&rnd, kKeySize); + std::string end = start; + end.resize(kValueSize); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + } db_->Flush(FlushOptions()); } std::string property; @@ -325,7 +345,8 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { TableProperties expected_tp; GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, - kKeysPerTable, kTableCount, kBloomBitsPerKey, + kKeysPerTable, kRangeDeletionsPerTable, + kTableCount, kBloomBitsPerKey, table_options.block_size, index_key_is_user_key); VerifyTableProperties(expected_tp, output_tp); @@ -448,6 +469,7 @@ TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) { TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { const int kTableCount = 100; + const int kRangeDeletionsPerTable = 2; const int kKeysPerTable = 10; const int kKeySize = 50; const int kValueSize = 400; @@ -473,6 +495,9 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { DestroyAndReopen(options); + // Hold open a snapshot to prevent range tombstones from being compacted away. 
+ ManagedSnapshot snapshot(db_); + std::string level_tp_strings[kMaxLevel]; std::string tp_string; TableProperties level_tps[kMaxLevel]; @@ -482,6 +507,12 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); } + for (int i = 0; i < kRangeDeletionsPerTable; i++) { + std::string start = RandomString(&rnd, kKeySize); + std::string end = start; + end.resize(kValueSize); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + } db_->Flush(FlushOptions()); db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); ResetTableProperties(&sum_tp); @@ -497,6 +528,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { sum_tp.raw_value_size += level_tps[level].raw_value_size; sum_tp.num_data_blocks += level_tps[level].num_data_blocks; sum_tp.num_entries += level_tps[level].num_entries; + sum_tp.num_range_deletions += level_tps[level].num_range_deletions; } db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); ParseTablePropertiesString(tp_string, &tp); @@ -508,13 +540,15 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size); ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); ASSERT_EQ(sum_tp.num_entries, tp.num_entries); + ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions); if (table > 3) { GetExpectedTableProperties( - &expected_tp, kKeySize, kValueSize, kKeysPerTable, table, - kBloomBitsPerKey, table_options.block_size, index_key_is_user_key); + &expected_tp, kKeySize, kValueSize, kKeysPerTable, + kRangeDeletionsPerTable, table, kBloomBitsPerKey, + table_options.block_size, index_key_is_user_key); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. 
- VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25); + VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25); } } } diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 18165922a49..32ddb6c9837 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -39,6 +39,7 @@ struct TablePropertiesNames { static const std::string kRawValueSize; static const std::string kNumDataBlocks; static const std::string kNumEntries; + static const std::string kNumRangeDeletions; static const std::string kFormatVersion; static const std::string kFixedKeyLen; static const std::string kFilterPolicy; @@ -148,6 +149,8 @@ struct TableProperties { uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // the number of range deletions in this table + uint64_t num_range_deletions = 0; // format version, reserved for backward compatibility uint64_t format_version = 0; // If 0, key is variable length. Otherwise number of bytes for each key. 
diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 030e2fb2fc9..7d7faf60d34 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -444,9 +444,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->ioptions.info_log); } else if (value_type == kTypeRangeDeletion) { - // TODO(wanning&andrewkr) add num_tomestone to table properties r->range_del_block.Add(key, value); - ++r->props.num_entries; + ++r->props.num_range_deletions; r->props.raw_key_size += key.size(); r->props.raw_value_size += value.size(); NotifyCollectTableCollectorsOnAdd(key, value, r->offset, diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index ce508ce1f87..d6076affa9e 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -77,6 +77,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { } Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); Add(TablePropertiesNames::kFilterSize, props.filter_size); Add(TablePropertiesNames::kFormatVersion, props.format_version); @@ -224,6 +225,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, {TablePropertiesNames::kNumDataBlocks, &new_table_properties->num_data_blocks}, {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kNumRangeDeletions, + &new_table_properties->num_range_deletions}, {TablePropertiesNames::kFormatVersion, &new_table_properties->format_version}, {TablePropertiesNames::kFixedKeyLen, diff --git a/table/table_properties.cc b/table/table_properties.cc index 4d75abdb315..9c1c4bd8eb4 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -78,6 +78,8 @@ std::string TableProperties::ToString( 
AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, kv_delim); AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + AppendProperty(result, "# range deletions", num_range_deletions, prop_delim, + kv_delim); AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); AppendProperty(result, "raw average key size", @@ -166,6 +168,7 @@ void TableProperties::Add(const TableProperties& tp) { raw_value_size += tp.raw_value_size; num_data_blocks += tp.num_data_blocks; num_entries += tp.num_entries; + num_range_deletions += tp.num_range_deletions; } const std::string TablePropertiesNames::kDataSize = @@ -188,6 +191,8 @@ const std::string TablePropertiesNames::kNumDataBlocks = "rocksdb.num.data.blocks"; const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kNumRangeDeletions = + "rocksdb.num.range-deletions"; const std::string TablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy"; const std::string TablePropertiesNames::kFormatVersion =