Skip to content

Commit ac1d69d

Browse files
ghemawat authored and cmumford committed
LevelDB now attempts to reuse the preceding MANIFEST and log file when re-opened.
(Based on a suggestion by cmumford.)

"open" benchmark on my workstation speeds up significantly since we can now avoid three fdatasync calls and a compaction per open:

  Before: ~80000 microseconds
  After:  ~130 microseconds

Details:

(1) Added Options::reuse_logs (currently defaults to false) to control new behavior. The intention is to change the default to true after some baking.
(2) Added Env::NewAppendableFile() whose default implementation returns a not-supported error.
(3) VersionSet::Recover attempts to reuse the MANIFEST from which it is recovering.
(4) DBImpl recovery attempts to reuse the last log file and memtable.
(5) db_test.cc now tests a new configuration that sets reuse_logs to true.
(6) fault_injection_test also tests a reuse_logs==true config.
(7) Added a new recovery_test.
1 parent 77948e7 commit ac1d69d

22 files changed

+707
-155
lines changed

Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ TESTS = \
5757
issue200_test \
5858
log_test \
5959
memenv_test \
60+
recovery_test \
6061
skiplist_test \
6162
table_test \
6263
version_edit_test \
@@ -177,6 +178,9 @@ issue200_test: issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS)
177178
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
178179
$(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
179180

181+
recovery_test: db/recovery_test.o $(LIBOBJECTS) $(TESTHARNESS)
182+
$(CXX) $(LDFLAGS) db/recovery_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
183+
180184
table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
181185
$(CXX) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
182186

db/corruption_test.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class CorruptionTest {
3636
tiny_cache_ = NewLRUCache(100);
3737
options_.env = &env_;
3838
options_.block_cache = tiny_cache_;
39-
dbname_ = test::TmpDir() + "/db_test";
39+
dbname_ = test::TmpDir() + "/corruption_test";
4040
DestroyDB(dbname_, options_);
4141

4242
db_ = NULL;

db/db_bench.cc

+7
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ static int FLAGS_bloom_bits = -1;
100100
// benchmark will fail.
101101
static bool FLAGS_use_existing_db = false;
102102

103+
// If true, reuse existing log/MANIFEST files when re-opening a database.
104+
static bool FLAGS_reuse_logs = false;
105+
103106
// Use the db with the following name.
104107
static const char* FLAGS_db = NULL;
105108

@@ -700,6 +703,7 @@ class Benchmark {
700703
options.write_buffer_size = FLAGS_write_buffer_size;
701704
options.max_open_files = FLAGS_open_files;
702705
options.filter_policy = filter_policy_;
706+
options.reuse_logs = FLAGS_reuse_logs;
703707
Status s = DB::Open(options, FLAGS_db, &db_);
704708
if (!s.ok()) {
705709
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@@ -954,6 +958,9 @@ int main(int argc, char** argv) {
954958
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
955959
(n == 0 || n == 1)) {
956960
FLAGS_use_existing_db = n;
961+
} else if (sscanf(argv[i], "--reuse_logs=%d%c", &n, &junk) == 1 &&
962+
(n == 0 || n == 1)) {
963+
FLAGS_reuse_logs = n;
957964
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
958965
FLAGS_num = n;
959966
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {

db/db_impl.cc

+109-69
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
125125
db_lock_(NULL),
126126
shutting_down_(NULL),
127127
bg_cv_(&mutex_),
128-
mem_(new MemTable(internal_comparator_)),
128+
mem_(NULL),
129129
imm_(NULL),
130130
logfile_(NULL),
131131
logfile_number_(0),
@@ -134,7 +134,6 @@ DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
134134
tmp_batch_(new WriteBatch),
135135
bg_compaction_scheduled_(false),
136136
manual_compaction_(NULL) {
137-
mem_->Ref();
138137
has_imm_.Release_Store(NULL);
139138

140139
// Reserve ten files or so for other uses and give the rest to TableCache.
@@ -271,7 +270,7 @@ void DBImpl::DeleteObsoleteFiles() {
271270
}
272271
}
273272

274-
Status DBImpl::Recover(VersionEdit* edit) {
273+
Status DBImpl::Recover(VersionEdit* edit, bool *save_manifest) {
275274
mutex_.AssertHeld();
276275

277276
// Ignore error from CreateDir since the creation of the DB is
@@ -301,66 +300,69 @@ Status DBImpl::Recover(VersionEdit* edit) {
301300
}
302301
}
303302

304-
s = versions_->Recover();
305-
if (s.ok()) {
306-
SequenceNumber max_sequence(0);
307-
308-
// Recover from all newer log files than the ones named in the
309-
// descriptor (new log files may have been added by the previous
310-
// incarnation without registering them in the descriptor).
311-
//
312-
// Note that PrevLogNumber() is no longer used, but we pay
313-
// attention to it in case we are recovering a database
314-
// produced by an older version of leveldb.
315-
const uint64_t min_log = versions_->LogNumber();
316-
const uint64_t prev_log = versions_->PrevLogNumber();
317-
std::vector<std::string> filenames;
318-
s = env_->GetChildren(dbname_, &filenames);
303+
s = versions_->Recover(save_manifest);
304+
if (!s.ok()) {
305+
return s;
306+
}
307+
SequenceNumber max_sequence(0);
308+
309+
// Recover from all newer log files than the ones named in the
310+
// descriptor (new log files may have been added by the previous
311+
// incarnation without registering them in the descriptor).
312+
//
313+
// Note that PrevLogNumber() is no longer used, but we pay
314+
// attention to it in case we are recovering a database
315+
// produced by an older version of leveldb.
316+
const uint64_t min_log = versions_->LogNumber();
317+
const uint64_t prev_log = versions_->PrevLogNumber();
318+
std::vector<std::string> filenames;
319+
s = env_->GetChildren(dbname_, &filenames);
320+
if (!s.ok()) {
321+
return s;
322+
}
323+
std::set<uint64_t> expected;
324+
versions_->AddLiveFiles(&expected);
325+
uint64_t number;
326+
FileType type;
327+
std::vector<uint64_t> logs;
328+
for (size_t i = 0; i < filenames.size(); i++) {
329+
if (ParseFileName(filenames[i], &number, &type)) {
330+
expected.erase(number);
331+
if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
332+
logs.push_back(number);
333+
}
334+
}
335+
if (!expected.empty()) {
336+
char buf[50];
337+
snprintf(buf, sizeof(buf), "%d missing files; e.g.",
338+
static_cast<int>(expected.size()));
339+
return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
340+
}
341+
342+
// Recover in the order in which the logs were generated
343+
std::sort(logs.begin(), logs.end());
344+
for (size_t i = 0; i < logs.size(); i++) {
345+
s = RecoverLogFile(logs[i], (i == logs.size() - 1), save_manifest, edit,
346+
&max_sequence);
319347
if (!s.ok()) {
320348
return s;
321349
}
322-
std::set<uint64_t> expected;
323-
versions_->AddLiveFiles(&expected);
324-
uint64_t number;
325-
FileType type;
326-
std::vector<uint64_t> logs;
327-
for (size_t i = 0; i < filenames.size(); i++) {
328-
if (ParseFileName(filenames[i], &number, &type)) {
329-
expected.erase(number);
330-
if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
331-
logs.push_back(number);
332-
}
333-
}
334-
if (!expected.empty()) {
335-
char buf[50];
336-
snprintf(buf, sizeof(buf), "%d missing files; e.g.",
337-
static_cast<int>(expected.size()));
338-
return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
339-
}
340-
341-
// Recover in the order in which the logs were generated
342-
std::sort(logs.begin(), logs.end());
343-
for (size_t i = 0; i < logs.size(); i++) {
344-
s = RecoverLogFile(logs[i], edit, &max_sequence);
345350

346-
// The previous incarnation may not have written any MANIFEST
347-
// records after allocating this log number. So we manually
348-
// update the file number allocation counter in VersionSet.
349-
versions_->MarkFileNumberUsed(logs[i]);
350-
}
351+
// The previous incarnation may not have written any MANIFEST
352+
// records after allocating this log number. So we manually
353+
// update the file number allocation counter in VersionSet.
354+
versions_->MarkFileNumberUsed(logs[i]);
355+
}
351356

352-
if (s.ok()) {
353-
if (versions_->LastSequence() < max_sequence) {
354-
versions_->SetLastSequence(max_sequence);
355-
}
356-
}
357+
if (versions_->LastSequence() < max_sequence) {
358+
versions_->SetLastSequence(max_sequence);
357359
}
358360

359-
return s;
361+
return Status::OK();
360362
}
361363

362-
Status DBImpl::RecoverLogFile(uint64_t log_number,
363-
VersionEdit* edit,
364+
Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
365+
bool* save_manifest, VersionEdit* edit,
364366
SequenceNumber* max_sequence) {
365367
struct LogReporter : public log::Reader::Reporter {
366368
Env* env;
@@ -405,6 +407,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
405407
std::string scratch;
406408
Slice record;
407409
WriteBatch batch;
410+
int compactions = 0;
408411
MemTable* mem = NULL;
409412
while (reader.ReadRecord(&record, &scratch) &&
410413
status.ok()) {
@@ -432,25 +435,52 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
432435
}
433436

434437
if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
438+
compactions++;
439+
*save_manifest = true;
435440
status = WriteLevel0Table(mem, edit, NULL);
441+
mem->Unref();
442+
mem = NULL;
436443
if (!status.ok()) {
437444
// Reflect errors immediately so that conditions like full
438445
// file-systems cause the DB::Open() to fail.
439446
break;
440447
}
441-
mem->Unref();
442-
mem = NULL;
443448
}
444449
}
445450

446-
if (status.ok() && mem != NULL) {
447-
status = WriteLevel0Table(mem, edit, NULL);
448-
// Reflect errors immediately so that conditions like full
449-
// file-systems cause the DB::Open() to fail.
451+
delete file;
452+
453+
// See if we should keep reusing the last log file.
454+
if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
455+
assert(logfile_ == NULL);
456+
assert(log_ == NULL);
457+
assert(mem_ == NULL);
458+
uint64_t lfile_size;
459+
if (env_->GetFileSize(fname, &lfile_size).ok() &&
460+
env_->NewAppendableFile(fname, &logfile_).ok()) {
461+
Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
462+
log_ = new log::Writer(logfile_, lfile_size);
463+
logfile_number_ = log_number;
464+
if (mem != NULL) {
465+
mem_ = mem;
466+
mem = NULL;
467+
} else {
468+
// mem can be NULL if lognum exists but was empty.
469+
mem_ = new MemTable(internal_comparator_);
470+
mem_->Ref();
471+
}
472+
}
473+
}
474+
475+
if (mem != NULL) {
476+
// mem did not get reused; compact it.
477+
if (status.ok()) {
478+
*save_manifest = true;
479+
status = WriteLevel0Table(mem, edit, NULL);
480+
}
481+
mem->Unref();
450482
}
451483

452-
if (mem != NULL) mem->Unref();
453-
delete file;
454484
return status;
455485
}
456486

@@ -1449,8 +1479,11 @@ Status DB::Open(const Options& options, const std::string& dbname,
14491479
DBImpl* impl = new DBImpl(options, dbname);
14501480
impl->mutex_.Lock();
14511481
VersionEdit edit;
1452-
Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
1453-
if (s.ok()) {
1482+
// Recover handles create_if_missing, error_if_exists
1483+
bool save_manifest = false;
1484+
Status s = impl->Recover(&edit, &save_manifest);
1485+
if (s.ok() && impl->mem_ == NULL) {
1486+
// Create new log and a corresponding memtable.
14541487
uint64_t new_log_number = impl->versions_->NewFileNumber();
14551488
WritableFile* lfile;
14561489
s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
@@ -1460,15 +1493,22 @@ Status DB::Open(const Options& options, const std::string& dbname,
14601493
impl->logfile_ = lfile;
14611494
impl->logfile_number_ = new_log_number;
14621495
impl->log_ = new log::Writer(lfile);
1463-
s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
1464-
}
1465-
if (s.ok()) {
1466-
impl->DeleteObsoleteFiles();
1467-
impl->MaybeScheduleCompaction();
1496+
impl->mem_ = new MemTable(impl->internal_comparator_);
1497+
impl->mem_->Ref();
14681498
}
14691499
}
1500+
if (s.ok() && save_manifest) {
1501+
edit.SetPrevLogNumber(0); // No older logs needed after recovery.
1502+
edit.SetLogNumber(impl->logfile_number_);
1503+
s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
1504+
}
1505+
if (s.ok()) {
1506+
impl->DeleteObsoleteFiles();
1507+
impl->MaybeScheduleCompaction();
1508+
}
14701509
impl->mutex_.Unlock();
14711510
if (s.ok()) {
1511+
assert(impl->mem_ != NULL);
14721512
*dbptr = impl;
14731513
} else {
14741514
delete impl;

db/db_impl.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ class DBImpl : public DB {
7878
// Recover the descriptor from persistent storage. May do a significant
7979
// amount of work to recover recently logged updates. Any changes to
8080
// be made to the descriptor are added to *edit.
81-
Status Recover(VersionEdit* edit) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
81+
Status Recover(VersionEdit* edit, bool* save_manifest)
82+
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
8283

8384
void MaybeIgnoreError(Status* s) const;
8485

@@ -90,9 +91,8 @@ class DBImpl : public DB {
9091
// Errors are recorded in bg_error_.
9192
void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
9293

93-
Status RecoverLogFile(uint64_t log_number,
94-
VersionEdit* edit,
95-
SequenceNumber* max_sequence)
94+
Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest,
95+
VersionEdit* edit, SequenceNumber* max_sequence)
9696
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
9797

9898
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)

db/db_test.cc

+20-1
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ class DBTest {
193193
// Sequence of option configurations to try
194194
enum OptionConfig {
195195
kDefault,
196+
kReuse,
196197
kFilter,
197198
kUncompressed,
198199
kEnd
@@ -237,7 +238,11 @@ class DBTest {
237238
// Return the current option configuration.
238239
Options CurrentOptions() {
239240
Options options;
241+
options.reuse_logs = false;
240242
switch (option_config_) {
243+
case kReuse:
244+
options.reuse_logs = true;
245+
break;
241246
case kFilter:
242247
options.filter_policy = filter_policy_;
243248
break;
@@ -1080,6 +1085,14 @@ TEST(DBTest, ApproximateSizes) {
10801085
// 0 because GetApproximateSizes() does not account for memtable space
10811086
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
10821087

1088+
if (options.reuse_logs) {
1089+
// Recovery will reuse memtable, and GetApproximateSizes() does not
1090+
// account for memtable usage;
1091+
Reopen(&options);
1092+
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
1093+
continue;
1094+
}
1095+
10831096
// Check sizes across recovery by reopening a few times
10841097
for (int run = 0; run < 3; run++) {
10851098
Reopen(&options);
@@ -1123,6 +1136,11 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
11231136
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
11241137
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
11251138

1139+
if (options.reuse_logs) {
1140+
// Need to force a memtable compaction since recovery does not do so.
1141+
ASSERT_OK(dbfull()->TEST_CompactMemTable());
1142+
}
1143+
11261144
// Check sizes across recovery by reopening a few times
11271145
for (int run = 0; run < 3; run++) {
11281146
Reopen(&options);
@@ -2084,7 +2102,8 @@ void BM_LogAndApply(int iters, int num_base_files) {
20842102
InternalKeyComparator cmp(BytewiseComparator());
20852103
Options options;
20862104
VersionSet vset(dbname, &options, NULL, &cmp);
2087-
ASSERT_OK(vset.Recover());
2105+
bool save_manifest;
2106+
ASSERT_OK(vset.Recover(&save_manifest));
20882107
VersionEdit vbase;
20892108
uint64_t fnum = 1;
20902109
for (int i = 0; i < num_base_files; i++) {

0 commit comments

Comments
 (0)