Skip to content

Commit 62fc15f

Browse files
cbi42 authored and facebook-github-bot committed
Block per key-value checksum (facebook#11287)
Summary: Add option `block_protection_bytes_per_key` and implementation for block per key-value checksum. The main changes are:
1. checksum construction and verification in block.cc/h
2. pass the option `block_protection_bytes_per_key` around (mainly for methods defined in table_cache.h)
3. unit tests/crash test updates

Tests:
* Added unit tests
* Crash test: `python3 tools/db_crashtest.py blackbox --simple --block_protection_bytes_per_key=1 --write_buffer_size=1048576`

Follow up (maybe as a separate PR): make sure corruption status returned from BlockIters is correctly handled.

Performance: Turning on block per KV protection has a non-trivial negative impact on read performance and costs additional memory. For memory, each block includes an additional 24 bytes for checksum-related states besides the checksum itself. For CPU, I set up a DB of size ~1.2GB with 5M keys (32 bytes key and 200 bytes value) which compacts to ~5 SST files (target file size 256 MB) in L6 without compression. I tested readrandom performance with various block cache sizes (to mimic various cache hit rates):

```
SETUP
make OPTIMIZE_LEVEL="-O3" USE_LTO=1 DEBUG_LEVEL=0 -j32 db_bench
./db_bench -benchmarks=fillseq,compact0,waitforcompaction,compact,waitforcompaction -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -target_file_size_base=268435456 --num=5000000 --key_size=32 --value_size=200 --compression_type=none

BENCHMARK
./db_bench --use_existing_db -benchmarks=readtocache,readrandom[-X10] --num=5000000 --key_size=32 --disable_auto_compactions --reads=1000000 --block_protection_bytes_per_key=[0|1] --cache_size=$CACHESIZE

The readrandom ops/sec looks like the following:
Block cache size:              2GB       1.2GB * 0.9   1.2GB * 0.8   1.2GB * 0.5   8MB
Main                           240805    223604        198176        161653        139040
PR prot_bytes=0                238691    226693        200127        161082        141153
PR prot_bytes=1                214983    193199        178532        137013        108211
prot_bytes=1 vs prot_bytes=0   -10%      -15%          -10.8%        -15%          -23%
```

The benchmark has a lot of variance, but there was a 5% to 25% regression in this benchmark with different cache hit rates.

Pull Request resolved: facebook#11287
Reviewed By: ajkr
Differential Revision: D43970708
Pulled By: cbi42
fbshipit-source-id: ef98d898b71779846fa74212b9ec9e08b7183940
1 parent 40d69b5 commit 62fc15f

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

49 files changed

+1695
-229
lines changed

HISTORY.md

+1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# Rocksdb Change Log
22
## Unreleased
33
### New Features
4+
* Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287).
45

56
## 8.2.0 (04/24/2023)
67
### Public API Changes

db/builder.cc

+2-1
Original file line number | Diff line number | Diff line change
@@ -380,7 +380,8 @@ Status BuildTable(
380380
MaxFileSizeForL0MetaPin(mutable_cf_options),
381381
/*smallest_compaction_key=*/nullptr,
382382
/*largest_compaction_key*/ nullptr,
383-
/*allow_unprepared_value*/ false));
383+
/*allow_unprepared_value*/ false,
384+
mutable_cf_options.block_protection_bytes_per_key));
384385
s = it->status();
385386
if (s.ok() && paranoid_file_checks) {
386387
OutputValidator file_validator(tboptions.internal_comparator,

db/column_family.cc

+6
Original file line number | Diff line number | Diff line change
@@ -1428,6 +1428,12 @@ Status ColumnFamilyData::ValidateOptions(
14281428
"Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
14291429
"or 8 bytes per key.");
14301430
}
1431+
if (std::find(supported.begin(), supported.end(),
1432+
cf_options.block_protection_bytes_per_key) == supported.end()) {
1433+
return Status::NotSupported(
1434+
"Block per key-value checksum protection only supports 0, 1, 2, 4 "
1435+
"or 8 bytes per key.");
1436+
}
14311437
return s;
14321438
}
14331439

db/compaction/compaction_job.cc

+6-2
Original file line number | Diff line number | Diff line change
@@ -504,7 +504,9 @@ void CompactionJob::GenSubcompactionBoundaries() {
504504
FileMetaData* f = flevel->files[i].file_metadata;
505505
std::vector<TableReader::Anchor> my_anchors;
506506
Status s = cfd->table_cache()->ApproximateKeyAnchors(
507-
read_options, icomp, *f, my_anchors);
507+
read_options, icomp, *f,
508+
c->mutable_cf_options()->block_protection_bytes_per_key,
509+
my_anchors);
508510
if (!s.ok() || my_anchors.empty()) {
509511
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
510512
}
@@ -735,7 +737,9 @@ Status CompactionJob::Run() {
735737
*compact_->compaction->mutable_cf_options()),
736738
/*smallest_compaction_key=*/nullptr,
737739
/*largest_compaction_key=*/nullptr,
738-
/*allow_unprepared_value=*/false);
740+
/*allow_unprepared_value=*/false,
741+
compact_->compaction->mutable_cf_options()
742+
->block_protection_bytes_per_key);
739743
auto s = iter->status();
740744

741745
if (s.ok() && paranoid_file_checks_) {

db/compaction/compaction_job_test.cc

+2-1
Original file line number | Diff line number | Diff line change
@@ -454,7 +454,8 @@ class CompactionJobTestBase : public testing::Test {
454454
Status s = cf_options_.table_factory->NewTableReader(
455455
read_opts,
456456
TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
457-
cfd_->internal_comparator()),
457+
cfd_->internal_comparator(),
458+
0 /* block_protection_bytes_per_key */),
458459
std::move(freader), file_size, &table_reader, false);
459460
ASSERT_OK(s);
460461
assert(table_reader);

db/convenience.cc

+2-2
Original file line number | Diff line number | Diff line change
@@ -64,8 +64,8 @@ Status VerifySstFileChecksum(const Options& options,
6464
const bool kImmortal = true;
6565
auto reader_options = TableReaderOptions(
6666
ioptions, options.prefix_extractor, env_options, internal_comparator,
67-
false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */,
68-
-1 /* level */);
67+
options.block_protection_bytes_per_key, false /* skip_filters */,
68+
!kImmortal, false /* force_direct_prefetch */, -1 /* level */);
6969
reader_options.largest_seqno = largest_seqno;
7070
s = ioptions.table_factory->NewTableReader(
7171
reader_options, std::move(file_reader), file_size, &table_reader,

db/external_sst_file_ingestion_job.cc

+1
Original file line number | Diff line number | Diff line change
@@ -678,6 +678,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
678678
TableReaderOptions(
679679
*cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
680680
env_options_, cfd_->internal_comparator(),
681+
sv->mutable_cf_options.block_protection_bytes_per_key,
681682
/*skip_filters*/ false, /*immortal*/ false,
682683
/*force_direct_prefetch*/ false, /*level*/ -1,
683684
/*block_cache_tracer*/ nullptr,

db/forward_iterator.cc

+14-7
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ class ForwardLevelIterator : public InternalIterator {
3636
const ColumnFamilyData* const cfd, const ReadOptions& read_options,
3737
const std::vector<FileMetaData*>& files,
3838
const std::shared_ptr<const SliceTransform>& prefix_extractor,
39-
bool allow_unprepared_value)
39+
bool allow_unprepared_value, uint8_t block_protection_bytes_per_key)
4040
: cfd_(cfd),
4141
read_options_(read_options),
4242
files_(files),
@@ -45,7 +45,8 @@ class ForwardLevelIterator : public InternalIterator {
4545
file_iter_(nullptr),
4646
pinned_iters_mgr_(nullptr),
4747
prefix_extractor_(prefix_extractor),
48-
allow_unprepared_value_(allow_unprepared_value) {
48+
allow_unprepared_value_(allow_unprepared_value),
49+
block_protection_bytes_per_key_(block_protection_bytes_per_key) {
4950
status_.PermitUncheckedError(); // Allow uninitialized status through
5051
}
5152

@@ -87,7 +88,8 @@ class ForwardLevelIterator : public InternalIterator {
8788
/*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
8889
/*max_file_size_for_l0_meta_pin=*/0,
8990
/*smallest_compaction_key=*/nullptr,
90-
/*largest_compaction_key=*/nullptr, allow_unprepared_value_);
91+
/*largest_compaction_key=*/nullptr, allow_unprepared_value_,
92+
block_protection_bytes_per_key_);
9193
file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
9294
valid_ = false;
9395
if (!range_del_agg.IsEmpty()) {
@@ -211,6 +213,7 @@ class ForwardLevelIterator : public InternalIterator {
211213
// Kept alive by ForwardIterator::sv_->mutable_cf_options
212214
const std::shared_ptr<const SliceTransform>& prefix_extractor_;
213215
const bool allow_unprepared_value_;
216+
const uint8_t block_protection_bytes_per_key_;
214217
};
215218

216219
ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
@@ -738,7 +741,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
738741
/*skip_filters=*/false, /*level=*/-1,
739742
MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
740743
/*smallest_compaction_key=*/nullptr,
741-
/*largest_compaction_key=*/nullptr, allow_unprepared_value_));
744+
/*largest_compaction_key=*/nullptr, allow_unprepared_value_,
745+
sv_->mutable_cf_options.block_protection_bytes_per_key));
742746
}
743747
BuildLevelIterators(vstorage, sv_);
744748
current_ = nullptr;
@@ -819,7 +823,8 @@ void ForwardIterator::RenewIterators() {
819823
/*skip_filters=*/false, /*level=*/-1,
820824
MaxFileSizeForL0MetaPin(svnew->mutable_cf_options),
821825
/*smallest_compaction_key=*/nullptr,
822-
/*largest_compaction_key=*/nullptr, allow_unprepared_value_));
826+
/*largest_compaction_key=*/nullptr, allow_unprepared_value_,
827+
svnew->mutable_cf_options.block_protection_bytes_per_key));
823828
}
824829

825830
for (auto* f : l0_iters_) {
@@ -863,7 +868,8 @@ void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage,
863868
} else {
864869
level_iters_.push_back(new ForwardLevelIterator(
865870
cfd_, read_options_, level_files,
866-
sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_));
871+
sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_,
872+
sv->mutable_cf_options.block_protection_bytes_per_key));
867873
}
868874
}
869875
}
@@ -885,7 +891,8 @@ void ForwardIterator::ResetIncompleteIterators() {
885891
/*skip_filters=*/false, /*level=*/-1,
886892
MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
887893
/*smallest_compaction_key=*/nullptr,
888-
/*largest_compaction_key=*/nullptr, allow_unprepared_value_);
894+
/*largest_compaction_key=*/nullptr, allow_unprepared_value_,
895+
sv_->mutable_cf_options.block_protection_bytes_per_key);
889896
l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
890897
}
891898

db/import_column_family_job.cc

+1
Original file line number | Diff line number | Diff line change
@@ -250,6 +250,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo(
250250
TableReaderOptions(
251251
*cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
252252
env_options_, cfd_->internal_comparator(),
253+
sv->mutable_cf_options.block_protection_bytes_per_key,
253254
/*skip_filters*/ false, /*immortal*/ false,
254255
/*force_direct_prefetch*/ false, /*level*/ -1,
255256
/*block_cache_tracer*/ nullptr,

db/kv_checksum.h

+92-6
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,8 @@ template <typename T>
4646
class ProtectionInfoKVOC;
4747
template <typename T>
4848
class ProtectionInfoKVOS;
49+
template <typename T>
50+
class ProtectionInfoKV;
4951

5052
// Aliases for 64-bit protection infos.
5153
using ProtectionInfo64 = ProtectionInfo<uint64_t>;
@@ -64,13 +66,13 @@ class ProtectionInfo {
6466
ProtectionInfoKVO<T> ProtectKVO(const SliceParts& key,
6567
const SliceParts& value,
6668
ValueType op_type) const;
67-
68-
T GetVal() const { return val_; }
69+
ProtectionInfoKV<T> ProtectKV(const Slice& key, const Slice& value) const;
6970

7071
private:
7172
friend class ProtectionInfoKVO<T>;
7273
friend class ProtectionInfoKVOS<T>;
7374
friend class ProtectionInfoKVOC<T>;
75+
friend class ProtectionInfoKV<T>;
7476

7577
// Each field is hashed with an independent value so we can catch fields being
7678
// swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall,
@@ -89,8 +91,47 @@ class ProtectionInfo {
8991
static_assert(sizeof(ProtectionInfo<T>) == sizeof(T), "");
9092
}
9193

94+
T GetVal() const { return val_; }
9295
void SetVal(T val) { val_ = val; }
9396

97+
void Encode(uint8_t len, char* dst) const {
98+
assert(sizeof(val_) >= len);
99+
switch (len) {
100+
case 1:
101+
dst[0] = static_cast<uint8_t>(val_);
102+
break;
103+
case 2:
104+
EncodeFixed16(dst, static_cast<uint16_t>(val_));
105+
break;
106+
case 4:
107+
EncodeFixed32(dst, static_cast<uint32_t>(val_));
108+
break;
109+
case 8:
110+
EncodeFixed64(dst, static_cast<uint64_t>(val_));
111+
break;
112+
default:
113+
assert(false);
114+
}
115+
}
116+
117+
bool Verify(uint8_t len, const char* checksum_ptr) const {
118+
assert(sizeof(val_) >= len);
119+
switch (len) {
120+
case 1:
121+
return static_cast<uint8_t>(checksum_ptr[0]) ==
122+
static_cast<uint8_t>(val_);
123+
case 2:
124+
return DecodeFixed16(checksum_ptr) == static_cast<uint16_t>(val_);
125+
case 4:
126+
return DecodeFixed32(checksum_ptr) == static_cast<uint32_t>(val_);
127+
case 8:
128+
return DecodeFixed64(checksum_ptr) == static_cast<uint64_t>(val_);
129+
default:
130+
assert(false);
131+
return false;
132+
}
133+
}
134+
94135
T val_ = 0;
95136
};
96137

@@ -113,7 +154,14 @@ class ProtectionInfoKVO {
113154
void UpdateV(const SliceParts& old_value, const SliceParts& new_value);
114155
void UpdateO(ValueType old_op_type, ValueType new_op_type);
115156

116-
T GetVal() const { return info_.GetVal(); }
157+
// Encode this protection info into `len` bytes and stores them in `dst`.
158+
void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); }
159+
// Verify this protection info against the protection info encoded by Encode()
160+
// at the first `len` bytes of `checksum_ptr`.
161+
// Returns true iff the verification is successful.
162+
bool Verify(uint8_t len, const char* checksum_ptr) const {
163+
return info_.Verify(len, checksum_ptr);
164+
}
117165

118166
private:
119167
friend class ProtectionInfo<T>;
@@ -124,6 +172,7 @@ class ProtectionInfoKVO {
124172
static_assert(sizeof(ProtectionInfoKVO<T>) == sizeof(T), "");
125173
}
126174

175+
T GetVal() const { return info_.GetVal(); }
127176
void SetVal(T val) { info_.SetVal(val); }
128177

129178
ProtectionInfo<T> info_;
@@ -154,7 +203,10 @@ class ProtectionInfoKVOC {
154203
void UpdateC(ColumnFamilyId old_column_family_id,
155204
ColumnFamilyId new_column_family_id);
156205

157-
T GetVal() const { return kvo_.GetVal(); }
206+
void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); }
207+
bool Verify(uint8_t len, const char* checksum_ptr) const {
208+
return kvo_.Verify(len, checksum_ptr);
209+
}
158210

159211
private:
160212
friend class ProtectionInfoKVO<T>;
@@ -163,6 +215,7 @@ class ProtectionInfoKVOC {
163215
static_assert(sizeof(ProtectionInfoKVOC<T>) == sizeof(T), "");
164216
}
165217

218+
T GetVal() const { return kvo_.GetVal(); }
166219
void SetVal(T val) { kvo_.SetVal(val); }
167220

168221
ProtectionInfoKVO<T> kvo_;
@@ -193,7 +246,10 @@ class ProtectionInfoKVOS {
193246
void UpdateS(SequenceNumber old_sequence_number,
194247
SequenceNumber new_sequence_number);
195248

196-
T GetVal() const { return kvo_.GetVal(); }
249+
void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); }
250+
bool Verify(uint8_t len, const char* checksum_ptr) const {
251+
return kvo_.Verify(len, checksum_ptr);
252+
}
197253

198254
private:
199255
friend class ProtectionInfoKVO<T>;
@@ -202,11 +258,32 @@ class ProtectionInfoKVOS {
202258
static_assert(sizeof(ProtectionInfoKVOS<T>) == sizeof(T), "");
203259
}
204260

261+
T GetVal() const { return kvo_.GetVal(); }
205262
void SetVal(T val) { kvo_.SetVal(val); }
206263

207264
ProtectionInfoKVO<T> kvo_;
208265
};
209266

267+
template <typename T>
268+
class ProtectionInfoKV {
269+
public:
270+
ProtectionInfoKV() = default;
271+
272+
void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); }
273+
bool Verify(uint8_t len, const char* checksum_ptr) const {
274+
return info_.Verify(len, checksum_ptr);
275+
}
276+
277+
private:
278+
friend class ProtectionInfo<T>;
279+
280+
explicit ProtectionInfoKV(T val) : info_(val) {
281+
static_assert(sizeof(ProtectionInfoKV<T>) == sizeof(T));
282+
}
283+
284+
ProtectionInfo<T> info_;
285+
};
286+
210287
template <typename T>
211288
Status ProtectionInfo<T>::GetStatus() const {
212289
if (val_ != 0) {
@@ -244,6 +321,16 @@ ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const SliceParts& key,
244321
return ProtectionInfoKVO<T>(val);
245322
}
246323

324+
template <typename T>
325+
ProtectionInfoKV<T> ProtectionInfo<T>::ProtectKV(const Slice& key,
326+
const Slice& value) const {
327+
T val = GetVal();
328+
val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
329+
val =
330+
val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
331+
return ProtectionInfoKV<T>(val);
332+
}
333+
247334
template <typename T>
248335
void ProtectionInfoKVO<T>::UpdateK(const Slice& old_key, const Slice& new_key) {
249336
T val = GetVal();
@@ -394,5 +481,4 @@ void ProtectionInfoKVOS<T>::UpdateS(SequenceNumber old_sequence_number,
394481
sizeof(new_sequence_number), ProtectionInfo<T>::kSeedS));
395482
SetVal(val);
396483
}
397-
398484
} // namespace ROCKSDB_NAMESPACE

0 commit comments

Comments
 (0)