Skip to content

Commit

Permalink
Add table property tracking number of range deletions
Browse files Browse the repository at this point in the history
Summary:
Add a new table property, rocksdb.num.range-deletions, which tracks the
number of range deletions in a block-based table. Range deletions are no
longer counted in rocksdb.num.entries; as discovered in PR facebook#3778, there
are various code paths that implicitly assume that rocksdb.num.entries
counts only true keys, not range deletions.

/cc ajkr nvanbenschoten
Closes facebook#4016
  • Loading branch information
benesch authored and nvanbenschoten committed Jul 11, 2018
1 parent 05b7615 commit 187ebf6
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 15 deletions.
5 changes: 3 additions & 2 deletions db/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ Status BuildTable(
nullptr /* upper_bound */, meta);

// Finish and check for builder errors
bool empty = builder->NumEntries() == 0;
tp = builder->GetTableProperties();
bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0;
s = c_iter.status();
if (!s.ok() || empty) {
builder->Abandon();
Expand All @@ -170,7 +171,7 @@ Status BuildTable(
meta->fd.file_size = file_size;
meta->marked_for_compaction = builder->NeedCompact();
assert(meta->fd.GetFileSize() > 0);
tp = builder->GetTableProperties();
tp = builder->GetTableProperties(); // refresh now that builder is finished
if (table_properties) {
*table_properties = tp;
}
Expand Down
10 changes: 7 additions & 3 deletions db/compaction_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1107,7 +1107,12 @@ Status CompactionJob::FinishCompactionOutputFile(
}
sub_compact->outfile.reset();

if (s.ok() && current_entries == 0) {
TableProperties tp;
if (s.ok()) {
tp = sub_compact->builder->GetTableProperties();
}

if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
// If there is nothing to output, it is not necessary to generate an SST file.
// This happens when the output level is the bottom level and, at the same
// time, the sub_compact outputs nothing.
Expand All @@ -1125,8 +1130,7 @@ Status CompactionJob::FinishCompactionOutputFile(
}

ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
TableProperties tp;
if (s.ok() && current_entries > 0) {
if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
// Verify that the table is usable
// We set for_compaction to false and don't OptimizeForCompactionTableRead
// here because this is a special case after we finish the table building
Expand Down
47 changes: 39 additions & 8 deletions db/db_properties_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ void ResetTableProperties(TableProperties* tp) {
tp->raw_value_size = 0;
tp->num_data_blocks = 0;
tp->num_entries = 0;
tp->num_range_deletions = 0;
}

void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
Expand All @@ -179,13 +180,15 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
ResetTableProperties(tp);

sscanf(tp_string.c_str(),
"# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64
"# data blocks %" SCNu64 " # entries %" SCNu64
" # range deletions %" SCNu64
" raw key size %" SCNu64
" raw average key size %lf "
" raw value size %" SCNu64
" raw average value size %lf "
" data block size %" SCNu64 " index block size %" SCNu64
" filter block size %" SCNu64,
&tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size,
&tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions, &tp->raw_key_size,
&dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
&tp->index_size, &tp->filter_size);
}
Expand Down Expand Up @@ -218,19 +221,24 @@ void VerifyTableProperties(const TableProperties& base_tp,
ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size);
ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size);
ASSERT_EQ(base_tp.num_entries, new_tp.num_entries);
ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions);
}

void GetExpectedTableProperties(TableProperties* expected_tp,
const int kKeySize, const int kValueSize,
const int kKeysPerTable, const int kTableCount,
const int kKeysPerTable,
const int kRangeDeletionsPerTable,
const int kTableCount,
const int kBloomBitsPerKey,
const size_t kBlockSize) {
const int kKeyCount = kTableCount * kKeysPerTable;
const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
const int kAvgSuccessorSize = kKeySize / 5;
const int kEncodingSavePerKey = kKeySize / 4;
expected_tp->raw_key_size = kKeyCount * (kKeySize + 8);
expected_tp->raw_value_size = kKeyCount * kValueSize;
expected_tp->raw_key_size = (kKeyCount + kRangeDeletionCount) * (kKeySize + 8);
expected_tp->raw_value_size = (kKeyCount + kRangeDeletionCount) * kValueSize;
expected_tp->num_entries = kKeyCount;
expected_tp->num_range_deletions = kRangeDeletionCount;
expected_tp->num_data_blocks =
kTableCount *
(kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) /
Expand Down Expand Up @@ -287,6 +295,7 @@ TEST_F(DBPropertiesTest, ValidateSampleNumber) {

TEST_F(DBPropertiesTest, AggregatedTableProperties) {
for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
const int kRangeDeletionsPerTable = 5;
const int kKeysPerTable = 100;
const int kKeySize = 80;
const int kValueSize = 200;
Expand All @@ -305,20 +314,30 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {

DestroyAndReopen(options);

// Hold open a snapshot to prevent range tombstones from being compacted
// away.
ManagedSnapshot snapshot(db_);

Random rnd(5632);
for (int table = 1; table <= kTableCount; ++table) {
for (int i = 0; i < kKeysPerTable; ++i) {
db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
RandomString(&rnd, kValueSize));
}
for (int i = 0; i < kRangeDeletionsPerTable; i++) {
std::string start = RandomString(&rnd, kKeySize);
std::string end = start;
end.resize(kValueSize);
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end);
}
db_->Flush(FlushOptions());
}
std::string property;
db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);

TableProperties expected_tp;
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, kTableCount, kBloomBitsPerKey,
kKeysPerTable, kRangeDeletionsPerTable, kTableCount, kBloomBitsPerKey,
table_options.block_size);

TableProperties output_tp;
Expand Down Expand Up @@ -437,6 +456,7 @@ TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) {

TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
const int kTableCount = 100;
const int kRangeDeletionsPerTable = 2;
const int kKeysPerTable = 10;
const int kKeySize = 50;
const int kValueSize = 400;
Expand All @@ -462,6 +482,9 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {

DestroyAndReopen(options);

// Hold open a snapshot to prevent range tombstones from being compacted away.
ManagedSnapshot snapshot(db_);

std::string level_tp_strings[kMaxLevel];
std::string tp_string;
TableProperties level_tps[kMaxLevel];
Expand All @@ -471,6 +494,12 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
RandomString(&rnd, kValueSize));
}
for (int i = 0; i < kRangeDeletionsPerTable; i++) {
std::string start = RandomString(&rnd, kKeySize);
std::string end = start;
end.resize(kValueSize);
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end);
}
db_->Flush(FlushOptions());
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ResetTableProperties(&sum_tp);
Expand All @@ -486,6 +515,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
sum_tp.raw_value_size += level_tps[level].raw_value_size;
sum_tp.num_data_blocks += level_tps[level].num_data_blocks;
sum_tp.num_entries += level_tps[level].num_entries;
sum_tp.num_range_deletions += level_tps[level].num_range_deletions;
}
db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
ParseTablePropertiesString(tp_string, &tp);
Expand All @@ -496,13 +526,14 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size);
ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
if (table > 3) {
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, table, kBloomBitsPerKey,
kKeysPerTable, kRangeDeletionsPerTable, table, kBloomBitsPerKey,
table_options.block_size);
// Allow a larger bias here, because the index block size, filter block size,
// and data block size are much harder to estimate in this test.
VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25);
VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25);
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions include/rocksdb/table_properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ struct TablePropertiesNames {
static const std::string kRawValueSize;
static const std::string kNumDataBlocks;
static const std::string kNumEntries;
static const std::string kNumRangeDeletions;
static const std::string kFormatVersion;
static const std::string kFixedKeyLen;
static const std::string kFilterPolicy;
Expand Down Expand Up @@ -144,6 +145,8 @@ struct TableProperties {
uint64_t num_data_blocks = 0;
// the number of entries in this table
uint64_t num_entries = 0;
// the number of range deletions in this table
uint64_t num_range_deletions = 0;
// format version, reserved for backward compatibility
uint64_t format_version = 0;
// If 0, key is variable length. Otherwise number of bytes for each key.
Expand Down
3 changes: 1 addition & 2 deletions table/block_based_table_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -429,9 +429,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
r->ioptions.info_log);

} else if (value_type == kTypeRangeDeletion) {
// TODO(wanning&andrewkr) add num_tomestone to table properties
r->range_del_block.Add(key, value);
++r->props.num_entries;
++r->props.num_range_deletions;
r->props.raw_key_size += key.size();
r->props.raw_value_size += value.size();
NotifyCollectTableCollectorsOnAdd(key, value, r->offset,
Expand Down
3 changes: 3 additions & 0 deletions table/meta_blocks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size);
}
Add(TablePropertiesNames::kNumEntries, props.num_entries);
Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions);
Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
Add(TablePropertiesNames::kFilterSize, props.filter_size);
Add(TablePropertiesNames::kFormatVersion, props.format_version);
Expand Down Expand Up @@ -210,6 +211,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
{TablePropertiesNames::kNumDataBlocks,
&new_table_properties->num_data_blocks},
{TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
{TablePropertiesNames::kNumRangeDeletions,
&new_table_properties->num_range_deletions},
{TablePropertiesNames::kFormatVersion,
&new_table_properties->format_version},
{TablePropertiesNames::kFixedKeyLen,
Expand Down
5 changes: 5 additions & 0 deletions table/table_properties.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ std::string TableProperties::ToString(
AppendProperty(result, "# data blocks", num_data_blocks, prop_delim,
kv_delim);
AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);
AppendProperty(result, "# range deletions", num_range_deletions, prop_delim,
kv_delim);

AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
AppendProperty(result, "raw average key size",
Expand Down Expand Up @@ -155,6 +157,7 @@ void TableProperties::Add(const TableProperties& tp) {
raw_value_size += tp.raw_value_size;
num_data_blocks += tp.num_data_blocks;
num_entries += tp.num_entries;
num_range_deletions += tp.num_range_deletions;
}

const std::string TablePropertiesNames::kDataSize =
Expand All @@ -175,6 +178,8 @@ const std::string TablePropertiesNames::kNumDataBlocks =
"rocksdb.num.data.blocks";
const std::string TablePropertiesNames::kNumEntries =
"rocksdb.num.entries";
const std::string TablePropertiesNames::kNumRangeDeletions =
"rocksdb.num.range-deletions";
const std::string TablePropertiesNames::kFilterPolicy =
"rocksdb.filter.policy";
const std::string TablePropertiesNames::kFormatVersion =
Expand Down

0 comments on commit 187ebf6

Please sign in to comment.