From 2546ba5aee291d11139560e2d016f23a41af9d1f Mon Sep 17 00:00:00 2001 From: Kezhu Wang Date: Wed, 18 May 2016 13:35:51 +0800 Subject: [PATCH] Rotate compaction output at user key boundary so that keys with same user key will not spread over multiple tables in same level, level 1 or above. Level compaction, except for level-0, don't pick overlapping tables. So if we produce overlapping tables in user key level, it is possible that file with newer version user key got picked and compacted to next level. Now, the invariant that Version::Get depends on is broken, Version::Get is broken. The vulnerability of this approach is that a malicious client can retain a snapshhot and do large number of writes to same key, which can cause huge sorted table generated. LevelDB is a library that embedded in other application. I think it should be the application's responsibility to keep snapshhot from malicious clients. Picking overlapping tables in all level compactions is another approach to solve this problem. I think this approach may violate the algorithm this implementation implies. --- db/db_impl.cc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 60f4e66e55..712f99c862 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -926,16 +926,10 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } Slice key = input->key(); - if (compact->compaction->ShouldStopBefore(key) && - compact->builder != NULL) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } // Handle key/value, add to state, etc. bool drop = false; + bool first_occurrence = false; if (!ParseInternalKey(key, &ikey)) { // Do not hide error keys current_user_key.clear(); @@ -947,6 +941,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { Slice(current_user_key)) != 0) { // First occurrence of this user key current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); + // Don't rotate at this key if last key is an error key. + first_occurrence = last_sequence_for_key != kMaxSequenceNumber; has_current_user_key = true; last_sequence_for_key = kMaxSequenceNumber; } @@ -980,6 +976,19 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { #endif if (!drop) { + // Close output file if it is overlapping too much grandparents + // or big enough. We never put keys with same user key to + // different files in same level in level compaction. + if (compact->builder != NULL && first_occurrence && + (compact->compaction->ShouldStopBefore(key) || + compact->builder->FileSize() >= + compact->compaction->MaxOutputFileSize())) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + // Open output file if necessary if (compact->builder == NULL) { status = OpenCompactionOutputFile(compact); @@ -992,15 +1001,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } compact->current_output()->largest.DecodeFrom(key); compact->builder->Add(key, input->value()); - - // Close output file if it is big enough - if (compact->builder->FileSize() >= - compact->compaction->MaxOutputFileSize()) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } } input->Next();