From 6e7fa8f1576df6db9e2c302aabfac39ad3d8a3b1 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 28 Jan 2020 22:09:05 -0500 Subject: [PATCH 1/3] c-deps/libroach: run clang-format This commit commits the changes from `make c-deps-fmt`. This hadn't been done in a while. I don't want the diff leaking into the next commit. I was surprised to see that we don't lint against this. --- c-deps/libroach/batch.cc | 52 ++++++++++++---- c-deps/libroach/batch.h | 6 +- c-deps/libroach/ccl/db_test.cc | 6 +- c-deps/libroach/chunked_buffer.cc | 2 +- c-deps/libroach/chunked_buffer_test.cc | 8 +-- c-deps/libroach/comparator.cc | 13 ++-- c-deps/libroach/comparator_test.cc | 86 +++++++++++++------------- c-deps/libroach/db.cc | 10 +-- c-deps/libroach/engine.cc | 7 +-- c-deps/libroach/engine.h | 8 ++- c-deps/libroach/include/libroach.h | 10 +-- c-deps/libroach/iterator.cc | 10 ++- c-deps/libroach/mvcc.cc | 10 +-- c-deps/libroach/mvcc.h | 82 ++++++++++++------------ c-deps/libroach/options.cc | 54 ++++++++-------- c-deps/libroach/row_counter.cc | 1 - c-deps/libroach/snapshot.cc | 25 ++++++-- c-deps/libroach/snapshot.h | 3 +- c-deps/libroach/sst_dump.cc | 2 +- c-deps/libroach/table_props.cc | 7 +-- 20 files changed, 221 insertions(+), 181 deletions(-) diff --git a/c-deps/libroach/batch.cc b/c-deps/libroach/batch.cc index 4a62c7e697be..113c737f4312 100644 --- a/c-deps/libroach/batch.cc +++ b/c-deps/libroach/batch.cc @@ -587,14 +587,25 @@ DBStatus DBBatch::EnvDeleteDirAndFiles(DBSlice dir) { return FmtStatus("unsuppor DBStatus DBBatch::EnvLinkFile(DBSlice oldname, DBSlice newname) { return FmtStatus("unsupported"); } -DBStatus DBBatch::EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file) { return FmtStatus("unsupported"); } -DBStatus DBBatch::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, int* n) { return FmtStatus("unsupported"); } -DBStatus DBBatch::EnvCloseReadableFile(rocksdb::RandomAccessFile* file) { return FmtStatus("unsupported"); } -DBStatus DBBatch::EnvOpenDirectory(DBSlice path, rocksdb::Directory** file) { return FmtStatus("unsupported"); } +DBStatus DBBatch::EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file) { + return FmtStatus("unsupported"); +} +DBStatus DBBatch::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, + int* n) { + return FmtStatus("unsupported"); +} +DBStatus DBBatch::EnvCloseReadableFile(rocksdb::RandomAccessFile* file) { + return FmtStatus("unsupported"); +} +DBStatus DBBatch::EnvOpenDirectory(DBSlice path, rocksdb::Directory** file) { + return FmtStatus("unsupported"); +} DBStatus DBBatch::EnvSyncDirectory(rocksdb::Directory* file) { return FmtStatus("unsupported"); } DBStatus DBBatch::EnvCloseDirectory(rocksdb::Directory* file) { return FmtStatus("unsupported"); } -DBStatus DBBatch::EnvRenameFile(DBSlice oldname, DBSlice newname) { return FmtStatus("unsupported"); } - +DBStatus DBBatch::EnvRenameFile(DBSlice oldname, DBSlice newname) { + return FmtStatus("unsupported"); +} + DBWriteOnlyBatch::DBWriteOnlyBatch(DBEngine* db) : DBEngine(db->rep, db->iters), updates(0) {} DBWriteOnlyBatch::~DBWriteOnlyBatch() {} @@ -706,13 +717,28 @@ DBStatus DBWriteOnlyBatch::EnvLinkFile(DBSlice oldname, DBSlice newname) { return FmtStatus("unsupported"); } -DBStatus DBWriteOnlyBatch::EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file) { return FmtStatus("unsupported"); } -DBStatus DBWriteOnlyBatch::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, int* n) 
{ return FmtStatus("unsupported"); } -DBStatus DBWriteOnlyBatch::EnvCloseReadableFile(rocksdb::RandomAccessFile* file) { return FmtStatus("unsupported"); } -DBStatus DBWriteOnlyBatch::EnvOpenDirectory(DBSlice path, rocksdb::Directory** file) { return FmtStatus("unsupported"); } -DBStatus DBWriteOnlyBatch::EnvSyncDirectory(rocksdb::Directory* file) { return FmtStatus("unsupported"); } -DBStatus DBWriteOnlyBatch::EnvCloseDirectory(rocksdb::Directory* file) { return FmtStatus("unsupported"); } -DBStatus DBWriteOnlyBatch::EnvRenameFile(DBSlice oldname, DBSlice newname) { return FmtStatus("unsupported"); } +DBStatus DBWriteOnlyBatch::EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file) { + return FmtStatus("unsupported"); +} +DBStatus DBWriteOnlyBatch::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, + int64_t offset, int* n) { + return FmtStatus("unsupported"); +} +DBStatus DBWriteOnlyBatch::EnvCloseReadableFile(rocksdb::RandomAccessFile* file) { + return FmtStatus("unsupported"); +} +DBStatus DBWriteOnlyBatch::EnvOpenDirectory(DBSlice path, rocksdb::Directory** file) { + return FmtStatus("unsupported"); +} +DBStatus DBWriteOnlyBatch::EnvSyncDirectory(rocksdb::Directory* file) { + return FmtStatus("unsupported"); +} +DBStatus DBWriteOnlyBatch::EnvCloseDirectory(rocksdb::Directory* file) { + return FmtStatus("unsupported"); +} +DBStatus DBWriteOnlyBatch::EnvRenameFile(DBSlice oldname, DBSlice newname) { + return FmtStatus("unsupported"); +} rocksdb::WriteBatch::Handler* GetDBBatchInserter(::rocksdb::WriteBatchBase* batch) { return new DBBatchInserter(batch); diff --git a/c-deps/libroach/batch.h b/c-deps/libroach/batch.h index 4d159524f605..1f055ef0d088 100644 --- a/c-deps/libroach/batch.h +++ b/c-deps/libroach/batch.h @@ -50,7 +50,8 @@ struct DBBatch : public DBEngine { virtual DBStatus EnvDeleteDirAndFiles(DBSlice dir); virtual DBStatus EnvLinkFile(DBSlice oldname, DBSlice newname); virtual DBStatus EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file); - virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, int* n); + virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, + int* n); virtual DBStatus EnvCloseReadableFile(rocksdb::RandomAccessFile* file); virtual DBStatus EnvOpenDirectory(DBSlice path, rocksdb::Directory** file); virtual DBStatus EnvSyncDirectory(rocksdb::Directory* file); @@ -90,7 +91,8 @@ struct DBWriteOnlyBatch : public DBEngine { virtual DBStatus EnvDeleteDirAndFiles(DBSlice dir); virtual DBStatus EnvLinkFile(DBSlice oldname, DBSlice newname); virtual DBStatus EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file); - virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, int* n); + virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, + int* n); virtual DBStatus EnvCloseReadableFile(rocksdb::RandomAccessFile* file); virtual DBStatus EnvOpenDirectory(DBSlice path, rocksdb::Directory** file); virtual DBStatus EnvSyncDirectory(rocksdb::Directory* file); diff --git a/c-deps/libroach/ccl/db_test.cc b/c-deps/libroach/ccl/db_test.cc index 3cd7032a48f3..f4871c0c7f89 100644 --- a/c-deps/libroach/ccl/db_test.cc +++ b/c-deps/libroach/ccl/db_test.cc @@ -167,8 +167,7 @@ TEST_F(CCLTest, ReadOnly) { free(ro_value.data); // Try to write it again. 
auto ret = DBPut(db, ToDBKey("foo"), ToDBSlice("foo's value")); - EXPECT_EQ(ToString(ret), - "Not implemented: Not supported operation in read only mode."); + EXPECT_EQ(ToString(ret), "Not implemented: Not supported operation in read only mode."); free(ret.data); DBClose(db); @@ -192,8 +191,7 @@ TEST_F(CCLTest, ReadOnly) { free(ro_value.data); // Try to write it again. auto ret = DBPut(db, ToDBKey("foo"), ToDBSlice("foo's value")); - EXPECT_EQ(ToString(ret), - "Not implemented: Not supported operation in read only mode."); + EXPECT_EQ(ToString(ret), "Not implemented: Not supported operation in read only mode."); free(ret.data); DBClose(db); diff --git a/c-deps/libroach/chunked_buffer.cc b/c-deps/libroach/chunked_buffer.cc index fefe05716da7..aa580333b6dc 100644 --- a/c-deps/libroach/chunked_buffer.cc +++ b/c-deps/libroach/chunked_buffer.cc @@ -59,7 +59,7 @@ void chunkedBuffer::put(const char* data, int len, int next_size_hint) { data += avail; len -= avail; - const int max_size = 128 << 20; // 128 MB + const int max_size = 128 << 20; // 128 MB size_t new_size = bufs_.empty() ? 16 : bufs_.back().len * 2; for (; new_size < len + next_size_hint && new_size < max_size; new_size *= 2) { } diff --git a/c-deps/libroach/chunked_buffer_test.cc b/c-deps/libroach/chunked_buffer_test.cc index 16a1cb95c9d6..29fbea1f4b29 100644 --- a/c-deps/libroach/chunked_buffer_test.cc +++ b/c-deps/libroach/chunked_buffer_test.cc @@ -15,8 +15,8 @@ // writing in pieces that are smaller than the maximum chunk size (128 // MB). See #32896. TEST(ChunkedBuffer, PutSmall) { - const std::string data(1 << 20, '.'); // 1 MB - const int64_t total = 3LL << 30; // 3 GB + const std::string data(1 << 20, '.'); // 1 MB + const int64_t total = 3LL << 30; // 3 GB cockroach::chunkedBuffer buf; for (int64_t sum = 0; sum < total; sum += data.size()) { buf.Put(data, data); @@ -27,8 +27,8 @@ TEST(ChunkedBuffer, PutSmall) { // writing in pieces that are larger than the maximum chunk size (128 // MB). See #32896. TEST(ChunkedBuffer, PutLarge) { - const std::string data(256 << 20, '.'); // 256 MB - const int64_t total = 3LL << 30; // 3 GB + const std::string data(256 << 20, '.'); // 256 MB + const int64_t total = 3LL << 30; // 3 GB cockroach::chunkedBuffer buf; for (int64_t sum = 0; sum < total; sum += data.size()) { buf.Put(data, data); diff --git a/c-deps/libroach/comparator.cc b/c-deps/libroach/comparator.cc index 2f21fe2cb867..261bb24ab75b 100644 --- a/c-deps/libroach/comparator.cc +++ b/c-deps/libroach/comparator.cc @@ -41,14 +41,13 @@ bool DBComparator::Equal(const rocksdb::Slice& a, const rocksdb::Slice& b) const namespace { -void ShrinkSlice(rocksdb::Slice* a, size_t size) { - a->remove_suffix(a->size() - size); -} +void ShrinkSlice(rocksdb::Slice* a, size_t size) { a->remove_suffix(a->size() - size); } int SharedPrefixLen(const rocksdb::Slice& a, const rocksdb::Slice& b) { auto n = std::min(a.size(), b.size()); int i = 0; - for (; i < n && a[i] == b[i]; ++i) {} + for (; i < n && a[i] == b[i]; ++i) { + } return i; } @@ -68,7 +67,8 @@ bool FindSeparator(rocksdb::Slice* a, std::string* a_backing, const rocksdb::Sli // So b is smaller than a. return false; } - if ((prefix < b.size() - 1) || static_cast((*a)[prefix]) + 1 < static_cast(b[prefix])) { + if ((prefix < b.size() - 1) || + static_cast((*a)[prefix]) + 1 < static_cast(b[prefix])) { // a and b do not have consecutive characters at prefix. 
(*a_backing)[prefix]++; ShrinkSlice(a, prefix + 1); @@ -97,7 +97,8 @@ void DBComparator::FindShortestSeparator(std::string* start, const rocksdb::Slic return; } auto found = FindSeparator(&key_s, start, key_l); - if (!found) return; + if (!found) + return; start->resize(key_s.size() + 1); (*start)[key_s.size()] = 0x00; } diff --git a/c-deps/libroach/comparator_test.cc b/c-deps/libroach/comparator_test.cc index 284e89426be0..50691897ed74 100644 --- a/c-deps/libroach/comparator_test.cc +++ b/c-deps/libroach/comparator_test.cc @@ -40,39 +40,39 @@ DBKey makeKey(const char* s, int64_t wall_time = 0, int32_t logical = 0) { TEST(Libroach, Comparator) { DBComparator comp; std::vector sepCases = { - // Many cases here are adapted from a Pebble unit test. + // Many cases here are adapted from a Pebble unit test. - // Non-empty b values. - {makeKey("black"), makeKey("blue"), makeKey("blb")}, - {makeKey(""), makeKey("2"), makeKey("")}, - {makeKey("1"), makeKey("2"), makeKey("1")}, - {makeKey("1"), makeKey("29"), makeKey("2")}, - {makeKey("13"), makeKey("19"), makeKey("14")}, - {makeKey("13"), makeKey("99"), makeKey("2")}, - {makeKey("135"), makeKey("19"), makeKey("14")}, - {makeKey("1357"), makeKey("19"), makeKey("14")}, - {makeKey("1357"), makeKey("2"), makeKey("14")}, - {makeKey("13\xff"), makeKey("14"), makeKey("13\xff")}, - {makeKey("13\xff"), makeKey("19"), makeKey("14")}, - {makeKey("1\xff\xff"), makeKey("19"), makeKey("1\xff\xff")}, - {makeKey("1\xff\xff"), makeKey("2"), makeKey("1\xff\xff")}, - {makeKey("1\xff\xff"), makeKey("9"), makeKey("2")}, - {makeKey("1\xfd\xff"), makeKey("1\xff"), makeKey("1\xfe")}, - {makeKey("1\xff\xff", 20, 3), makeKey("9"), makeKey("2")}, - {makeKey("1\xff\xff", 20, 3), makeKey("19"), makeKey("1\xff\xff", 20, 3)}, + // Non-empty b values. 
+ {makeKey("black"), makeKey("blue"), makeKey("blb")}, + {makeKey(""), makeKey("2"), makeKey("")}, + {makeKey("1"), makeKey("2"), makeKey("1")}, + {makeKey("1"), makeKey("29"), makeKey("2")}, + {makeKey("13"), makeKey("19"), makeKey("14")}, + {makeKey("13"), makeKey("99"), makeKey("2")}, + {makeKey("135"), makeKey("19"), makeKey("14")}, + {makeKey("1357"), makeKey("19"), makeKey("14")}, + {makeKey("1357"), makeKey("2"), makeKey("14")}, + {makeKey("13\xff"), makeKey("14"), makeKey("13\xff")}, + {makeKey("13\xff"), makeKey("19"), makeKey("14")}, + {makeKey("1\xff\xff"), makeKey("19"), makeKey("1\xff\xff")}, + {makeKey("1\xff\xff"), makeKey("2"), makeKey("1\xff\xff")}, + {makeKey("1\xff\xff"), makeKey("9"), makeKey("2")}, + {makeKey("1\xfd\xff"), makeKey("1\xff"), makeKey("1\xfe")}, + {makeKey("1\xff\xff", 20, 3), makeKey("9"), makeKey("2")}, + {makeKey("1\xff\xff", 20, 3), makeKey("19"), makeKey("1\xff\xff", 20, 3)}, - // Empty b values - {makeKey(""), makeKey(""), makeKey("")}, - {makeKey("green"), makeKey(""), makeKey("green")}, - {makeKey("1"), makeKey(""), makeKey("1")}, - {makeKey("11\xff"), makeKey(""), makeKey("11\xff")}, - {makeKey("1\xff"), makeKey(""), makeKey("1\xff")}, - {makeKey("1\xff\xff"), makeKey(""), makeKey("1\xff\xff")}, - {makeKey("\xff"), makeKey(""), makeKey("\xff")}, - {makeKey("\xff\xff"), makeKey(""), makeKey("\xff\xff")}, + // Empty b values + {makeKey(""), makeKey(""), makeKey("")}, + {makeKey("green"), makeKey(""), makeKey("green")}, + {makeKey("1"), makeKey(""), makeKey("1")}, + {makeKey("11\xff"), makeKey(""), makeKey("11\xff")}, + {makeKey("1\xff"), makeKey(""), makeKey("1\xff")}, + {makeKey("1\xff\xff"), makeKey(""), makeKey("1\xff\xff")}, + {makeKey("\xff"), makeKey(""), makeKey("\xff")}, + {makeKey("\xff\xff"), makeKey(""), makeKey("\xff\xff")}, }; - - for (const auto& c: sepCases) { + + for (const auto& c : sepCases) { auto a_str = EncodeKey(c.a); auto b_str = EncodeKey(c.b); std::printf("a_str: %s, b_str: %s\n", a_str.c_str(), b_str.c_str()); @@ -81,21 +81,21 @@ TEST(Libroach, Comparator) { } std::vector succCases = { - {makeKey("black"), makeKey("c")}, - {makeKey("green"), makeKey("h")}, - {makeKey(""), makeKey("")}, - {makeKey("13"), makeKey("2")}, - {makeKey("135"), makeKey("2")}, - {makeKey("13\xff"), makeKey("2")}, - {makeKey("1\xff\xff", 20, 3), makeKey("2")}, - {makeKey("\xff"), makeKey("\xff")}, - {makeKey("\xff\xff"), makeKey("\xff\xff")}, - {makeKey("\xff\xff\xff"), makeKey("\xff\xff\xff")}, - {makeKey("\xfe\xff\xff"), makeKey("\xff")}, - {makeKey("\xff\xff", 20, 3), makeKey("\xff\xff", 20, 3)}, + {makeKey("black"), makeKey("c")}, + {makeKey("green"), makeKey("h")}, + {makeKey(""), makeKey("")}, + {makeKey("13"), makeKey("2")}, + {makeKey("135"), makeKey("2")}, + {makeKey("13\xff"), makeKey("2")}, + {makeKey("1\xff\xff", 20, 3), makeKey("2")}, + {makeKey("\xff"), makeKey("\xff")}, + {makeKey("\xff\xff"), makeKey("\xff\xff")}, + {makeKey("\xff\xff\xff"), makeKey("\xff\xff\xff")}, + {makeKey("\xfe\xff\xff"), makeKey("\xff")}, + {makeKey("\xff\xff", 20, 3), makeKey("\xff\xff", 20, 3)}, }; - for (const auto& c: succCases) { + for (const auto& c : succCases) { auto a_str = EncodeKey(c.a); std::printf("a_str: %s\n", a_str.c_str()); comp.FindShortSuccessor(&a_str); diff --git a/c-deps/libroach/db.cc b/c-deps/libroach/db.cc index 5c83ce9904b3..c4adecdabf12 100644 --- a/c-deps/libroach/db.cc +++ b/c-deps/libroach/db.cc @@ -940,7 +940,7 @@ DBSstFileWriter* DBSstFileWriterNew() { // This makes the sstables produced by Pebble and RocksDB byte-by-byte 
identical, which is // useful for testing. table_options.index_shortening = - rocksdb::BlockBasedTableOptions::IndexShorteningMode::kShortenSeparatorsAndSuccessor; + rocksdb::BlockBasedTableOptions::IndexShorteningMode::kShortenSeparatorsAndSuccessor; rocksdb::Options* options = new rocksdb::Options(); options->comparator = &kComparator; @@ -1101,7 +1101,7 @@ DBStatus DBExportToSst(DBKey start, DBKey end, bool export_all_revisions, uint64 std::string resume_key; // Seek to the MVCC metadata key for the provided start key and let the // incremental iterator find the appropriate version. - const DBKey seek_key = { .key = start.key }; + const DBKey seek_key = {.key = start.key}; for (state = iter.seek(seek_key);; state = iter.next(skip_current_key_versions)) { if (state.status.data != NULL) { DBSstFileWriterClose(writer); @@ -1128,7 +1128,8 @@ DBStatus DBExportToSst(DBKey start, DBKey end, bool export_all_revisions, uint64 // Skip tombstone (len=0) records when start time is zero (non-incremental) // and we are not exporting all versions. - const bool is_skipping_deletes = start.wall_time == 0 && start.logical == 0 && !export_all_revisions; + const bool is_skipping_deletes = + start.wall_time == 0 && start.logical == 0 && !export_all_revisions; if (is_skipping_deletes && iter.value().size() == 0) { continue; } @@ -1178,7 +1179,8 @@ DBStatus DBEnvOpenReadableFile(DBEngine* db, DBSlice path, DBReadableFile* file) return db->EnvOpenReadableFile(path, (rocksdb::RandomAccessFile**)file); } -DBStatus DBEnvReadAtFile(DBEngine* db, DBReadableFile file, DBSlice buffer, int64_t offset, int* n) { +DBStatus DBEnvReadAtFile(DBEngine* db, DBReadableFile file, DBSlice buffer, int64_t offset, + int* n) { return db->EnvReadAtFile((rocksdb::RandomAccessFile*)file, buffer, offset, n); } diff --git a/c-deps/libroach/engine.cc b/c-deps/libroach/engine.cc index a49167a98536..04f0fd77be5f 100644 --- a/c-deps/libroach/engine.cc +++ b/c-deps/libroach/engine.cc @@ -467,7 +467,8 @@ DBStatus DBImpl::EnvCloseReadableFile(rocksdb::RandomAccessFile* file) { return kSuccess; } -DBStatus DBImpl::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, int* n) { +DBStatus DBImpl::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, + int* n) { size_t max_bytes_to_read = buffer.len; char* scratch = buffer.data; rocksdb::Slice result; @@ -488,9 +489,7 @@ DBStatus DBImpl::EnvOpenDirectory(DBSlice path, rocksdb::Directory** file) { return kSuccess; } -DBStatus DBImpl::EnvSyncDirectory(rocksdb::Directory* file) { - return ToDBStatus(file->Fsync()); -} +DBStatus DBImpl::EnvSyncDirectory(rocksdb::Directory* file) { return ToDBStatus(file->Fsync()); } DBStatus DBImpl::EnvCloseDirectory(rocksdb::Directory* file) { delete file; diff --git a/c-deps/libroach/engine.h b/c-deps/libroach/engine.h index e8a5f5f19a4a..17295e8ef46d 100644 --- a/c-deps/libroach/engine.h +++ b/c-deps/libroach/engine.h @@ -51,13 +51,14 @@ struct DBEngine { virtual DBStatus EnvDeleteDirAndFiles(DBSlice dir) = 0; virtual DBStatus EnvLinkFile(DBSlice oldname, DBSlice newname) = 0; virtual DBStatus EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file) = 0; - virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, int* n) = 0; + virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, + int* n) = 0; virtual DBStatus EnvCloseReadableFile(rocksdb::RandomAccessFile* file) = 0; virtual DBStatus EnvOpenDirectory(DBSlice path, 
rocksdb::Directory** file) = 0; virtual DBStatus EnvSyncDirectory(rocksdb::Directory* file) = 0; virtual DBStatus EnvCloseDirectory(rocksdb::Directory* file) = 0; virtual DBStatus EnvRenameFile(DBSlice oldname, DBSlice newname) = 0; - + DBSSTable* GetSSTables(int* n); DBStatus GetSortedWALFiles(DBWALFile** out_files, int* n); DBString GetUserProperties(); @@ -107,7 +108,8 @@ struct DBImpl : public DBEngine { virtual DBStatus EnvDeleteDirAndFiles(DBSlice dir); virtual DBStatus EnvLinkFile(DBSlice oldname, DBSlice newname); virtual DBStatus EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file); - virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, int* n); + virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, + int* n); virtual DBStatus EnvCloseReadableFile(rocksdb::RandomAccessFile* file); virtual DBStatus EnvOpenDirectory(DBSlice path, rocksdb::Directory** file); virtual DBStatus EnvSyncDirectory(rocksdb::Directory* file); diff --git a/c-deps/libroach/include/libroach.h b/c-deps/libroach/include/libroach.h index ad916cac3893..ecb7ac3ad3f5 100644 --- a/c-deps/libroach/include/libroach.h +++ b/c-deps/libroach/include/libroach.h @@ -290,7 +290,6 @@ DBStatus DBMergeOne(DBSlice existing, DBSlice update, DBString* new_value); // merged with existing. This method is provided for invocation from Go code. DBStatus DBPartialMergeOne(DBSlice existing, DBSlice update, DBString* new_value); - // NB: The function (cStatsToGoStats) that converts these to the go // representation is unfortunately duplicated in engine and engineccl. If this // struct is changed, both places need to be updated. @@ -322,11 +321,12 @@ MVCCStatsResult MVCCComputeStats(DBIterator* iter, DBKey start, DBKey end, int64 // SST key is greater than or equal to the timestamp of the tombstone, then it // is not considered a collision and we continue iteration from the next key in // the existing data. -DBIterState DBCheckForKeyCollisions(DBIterator* existingIter, DBIterator* sstIter, MVCCStatsResult* skippedKVStats, DBString* write_intent); +DBIterState DBCheckForKeyCollisions(DBIterator* existingIter, DBIterator* sstIter, + MVCCStatsResult* skippedKVStats, DBString* write_intent); bool MVCCIsValidSplitKey(DBSlice key); -DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, - int64_t target_size, DBString* split_key); +DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, int64_t target_size, + DBString* split_key); // DBTxn contains the fields from a roachpb.Transaction that are // necessary for MVCC Get and Scan operations. Note that passing a @@ -501,7 +501,7 @@ DBStatus DBSstFileWriterDeleteRange(DBSstFileWriter* fw, DBKey start, DBKey end) // May be called multiple times. The returned data won't necessarily reflect // the latest writes, only the keys whose underlying RocksDB blocks have been // flushed. Close cannot have been called. -DBStatus DBSstFileWriterTruncate(DBSstFileWriter *fw, DBString* data); +DBStatus DBSstFileWriterTruncate(DBSstFileWriter* fw, DBString* data); // Finalizes the writer and stores the constructed file's contents in *data. At // least one kv entry must have been added. May only be called once. 
diff --git a/c-deps/libroach/iterator.cc b/c-deps/libroach/iterator.cc index 4894aa4fdec7..598d17b9d2c7 100644 --- a/c-deps/libroach/iterator.cc +++ b/c-deps/libroach/iterator.cc @@ -8,14 +8,15 @@ // by the Apache License, Version 2.0, included in the file // licenses/APL.txt. +#include "iterator.h" #include "chunked_buffer.h" #include "encoding.h" -#include "iterator.h" #include "keys.h" using namespace cockroach; -DBIterator::DBIterator(std::atomic* iters, DBIterOptions iter_options) : iters_count(iters) { +DBIterator::DBIterator(std::atomic* iters, DBIterOptions iter_options) + : iters_count(iters) { read_opts.prefix_same_as_start = iter_options.prefix; read_opts.total_order_seek = !iter_options.prefix; @@ -62,9 +63,7 @@ DBIterator::DBIterator(std::atomic* iters, DBIterOptions iter_options) ++(*iters_count); } -DBIterator::~DBIterator() { - --(*iters_count); -} +DBIterator::~DBIterator() { --(*iters_count); } void DBIterator::SetLowerBound(DBKey key) { if (key.key.data == NULL) { @@ -75,7 +74,6 @@ void DBIterator::SetLowerBound(DBKey key) { lower_bound = lower_bound_str; } - void DBIterator::SetUpperBound(DBKey key) { if (key.key.data == NULL) { upper_bound_str = kMaxKey.data(); diff --git a/c-deps/libroach/mvcc.cc b/c-deps/libroach/mvcc.cc index f96cd5d3983b..c838387ff9f6 100644 --- a/c-deps/libroach/mvcc.cc +++ b/c-deps/libroach/mvcc.cc @@ -213,8 +213,8 @@ MVCCStatsResult MVCCComputeStats(DBIterator* iter, DBKey start, DBKey end, int64 bool MVCCIsValidSplitKey(DBSlice key) { return IsValidSplitKey(ToSlice(key)); } -DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, - int64_t target_size, DBString* split_key) { +DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, int64_t target_size, + DBString* split_key) { auto iter_rep = iter->rep.get(); const std::string start_key = EncodeKey(start); iter_rep->Seek(start_key); @@ -285,10 +285,12 @@ DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp bool tombstones) { ScopedStats scoped_iter(iter); if (reverse) { - mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent, tombstones); + mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent, + tombstones); return scanner.scan(); } else { - mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent, tombstones); + mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent, + tombstones); return scanner.scan(); } } diff --git a/c-deps/libroach/mvcc.h b/c-deps/libroach/mvcc.h index ba634dffbf04..3ba99cfa7e55 100644 --- a/c-deps/libroach/mvcc.h +++ b/c-deps/libroach/mvcc.h @@ -176,25 +176,25 @@ template class mvccScanner { // number are near the start. Until then, the current implementation is // simpler and correct. for (int i = txn_ignored_seqnums_.len - 1; i >= 0; i--) { - if (sequence < txn_ignored_seqnums_.ranges[i].start_seqnum) { - // The history entry's sequence number is lower/older than - // the current ignored range. Go to the previous range - // and try again. - continue; - } + if (sequence < txn_ignored_seqnums_.ranges[i].start_seqnum) { + // The history entry's sequence number is lower/older than + // the current ignored range. Go to the previous range + // and try again. + continue; + } - // Here we have a range where the start seqnum is lower than the current - // intent seqnum. Does it include it? 
- if (sequence > txn_ignored_seqnums_.ranges[i].end_seqnum) { - // Here we have a range where the current history entry's seqnum - // is higher than the range's end seqnum. Given that the - // ranges are storted, we're guaranteed that there won't - // be any further overlapping range at a lower value of i. - return false; - } - // Yes, it's included. We're going to skip over this - // intent seqnum and retry the search above. - return true; + // Here we have a range where the start seqnum is lower than the current + // intent seqnum. Does it include it? + if (sequence > txn_ignored_seqnums_.ranges[i].end_seqnum) { + // Here we have a range where the current history entry's seqnum + // is higher than the range's end seqnum. Given that the + // ranges are storted, we're guaranteed that there won't + // be any further overlapping range at a lower value of i. + return false; + } + // Yes, it's included. We're going to skip over this + // intent seqnum and retry the search above. + return true; } // Exhausted the ignore list. Not ignored. @@ -217,34 +217,34 @@ template class mvccScanner { meta_.intent_history().begin(), end, readIntent, [](const cockroach::storage::engine::enginepb::MVCCMetadata_SequencedIntent& a, const cockroach::storage::engine::enginepb::MVCCMetadata_SequencedIntent& b) -> bool { - return a.sequence() < b.sequence(); + return a.sequence() < b.sequence(); }); while (up != meta_.intent_history().begin()) { - const auto intent_pos = up - 1; - // Here we have found a history entry with the highest seqnum that's - // equal or lower to the txn seqnum. - // - // However this entry may also be part of an ignored range - // (partially rolled back). We'll check this next. If it is, - // we'll try the previous sequence in the intent history. - if (seqNumIsIgnored(intent_pos->sequence())) { - // This entry was part of an ignored range. Iterate back in intent - // history to the previous sequence, and check if that one is - // ignored. - up--; - continue; - } - // This history entry has not been ignored, so we're going to select - // this version. - intent = *intent_pos; - break; + const auto intent_pos = up - 1; + // Here we have found a history entry with the highest seqnum that's + // equal or lower to the txn seqnum. + // + // However this entry may also be part of an ignored range + // (partially rolled back). We'll check this next. If it is, + // we'll try the previous sequence in the intent history. + if (seqNumIsIgnored(intent_pos->sequence())) { + // This entry was part of an ignored range. Iterate back in intent + // history to the previous sequence, and check if that one is + // ignored. + up--; + continue; + } + // This history entry has not been ignored, so we're going to select + // this version. + intent = *intent_pos; + break; } if (up == meta_.intent_history().begin()) { - // It is possible that no intent exists such that the sequence is less - // than the read sequence. In this case, we cannot read a value from the - // intent history. - return false; + // It is possible that no intent exists such that the sequence is less + // than the read sequence. In this case, we cannot read a value from the + // intent history. 
+ return false; } rocksdb::Slice value = intent.value(); diff --git a/c-deps/libroach/options.cc b/c-deps/libroach/options.cc index 9396d1a67ded..366c141e73d4 100644 --- a/c-deps/libroach/options.cc +++ b/c-deps/libroach/options.cc @@ -13,9 +13,9 @@ #include #include #include -#include "db.h" #include "cache.h" #include "comparator.h" +#include "db.h" #include "encoding.h" #include "godefs.h" #include "merge.h" @@ -51,29 +51,29 @@ class DBLogger : public rocksdb::Logger { va_list ap) override { int go_log_level = util::log::Severity::UNKNOWN; // compiler tells us to initialize it switch (log_level) { - case rocksdb::DEBUG_LEVEL: - // There is no DEBUG severity. Just give it INFO severity, then. - go_log_level = util::log::Severity::INFO; - break; - case rocksdb::INFO_LEVEL: - go_log_level = util::log::Severity::INFO; - break; - case rocksdb::WARN_LEVEL: - go_log_level = util::log::Severity::WARNING; - break; - case rocksdb::ERROR_LEVEL: - go_log_level = util::log::Severity::ERROR; - break; - case rocksdb::FATAL_LEVEL: - go_log_level = util::log::Severity::FATAL; - break; - case rocksdb::HEADER_LEVEL: - // There is no HEADER severity. Just give it INFO severity, then. - go_log_level = util::log::Severity::INFO; - break; - case rocksdb::NUM_INFO_LOG_LEVELS: - assert(false); - return; + case rocksdb::DEBUG_LEVEL: + // There is no DEBUG severity. Just give it INFO severity, then. + go_log_level = util::log::Severity::INFO; + break; + case rocksdb::INFO_LEVEL: + go_log_level = util::log::Severity::INFO; + break; + case rocksdb::WARN_LEVEL: + go_log_level = util::log::Severity::WARNING; + break; + case rocksdb::ERROR_LEVEL: + go_log_level = util::log::Severity::ERROR; + break; + case rocksdb::FATAL_LEVEL: + go_log_level = util::log::Severity::FATAL; + break; + case rocksdb::HEADER_LEVEL: + // There is no HEADER severity. Just give it INFO severity, then. + go_log_level = util::log::Severity::INFO; + break; + case rocksdb::NUM_INFO_LOG_LEVELS: + assert(false); + return; } // First try with a small fixed size buffer. @@ -141,9 +141,7 @@ class DBLogger : public rocksdb::Logger { } // namespace -rocksdb::Logger* NewDBLogger(bool use_primary_log) { - return new DBLogger(use_primary_log); -} +rocksdb::Logger* NewDBLogger(bool use_primary_log) { return new DBLogger(use_primary_log); } rocksdb::Options DBMakeOptions(DBOptions db_opts) { // Use the rocksdb options builder to configure the base options @@ -168,7 +166,7 @@ rocksdb::Options DBMakeOptions(DBOptions db_opts) { // Periodically sync SST writes to smooth out disk usage. Not performing such // syncs can be faster but can cause performance blips when the OS decides it // needs to flush data. - options.bytes_per_sync = 512 << 10; // 512 KB + options.bytes_per_sync = 512 << 10; // 512 KB // Enabling `strict_bytes_per_sync` prevents the situation where an SST is // generated fast enough that the async writeback submissions fall behind. 
// It enforces we wait for any previous `bytes_per_sync` sync to finish before diff --git a/c-deps/libroach/row_counter.cc b/c-deps/libroach/row_counter.cc index 3b564cce2d8a..993290cbe2e8 100644 --- a/c-deps/libroach/row_counter.cc +++ b/c-deps/libroach/row_counter.cc @@ -92,7 +92,6 @@ bool RowCounter::Count(const rocksdb::Slice& key, cockroach::roachpb::BulkOpSumm return true; } - prev_key.assign(decoded_key.data(), decoded_key.size()); uint64_t tbl; diff --git a/c-deps/libroach/snapshot.cc b/c-deps/libroach/snapshot.cc index 8ffc431f8c9f..dc831ba1afd9 100644 --- a/c-deps/libroach/snapshot.cc +++ b/c-deps/libroach/snapshot.cc @@ -90,12 +90,25 @@ DBStatus DBSnapshot::EnvLinkFile(DBSlice oldname, DBSlice newname) { return FmtStatus("unsupported"); } -DBStatus DBSnapshot::EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file) { return FmtStatus("unsupported"); } -DBStatus DBSnapshot::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, int* n) { return FmtStatus("unsupported"); } -DBStatus DBSnapshot::EnvCloseReadableFile(rocksdb::RandomAccessFile* file) { return FmtStatus("unsupported"); } -DBStatus DBSnapshot::EnvOpenDirectory(DBSlice path, rocksdb::Directory** file) { return FmtStatus("unsupported"); } +DBStatus DBSnapshot::EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file) { + return FmtStatus("unsupported"); +} +DBStatus DBSnapshot::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, + int* n) { + return FmtStatus("unsupported"); +} +DBStatus DBSnapshot::EnvCloseReadableFile(rocksdb::RandomAccessFile* file) { + return FmtStatus("unsupported"); +} +DBStatus DBSnapshot::EnvOpenDirectory(DBSlice path, rocksdb::Directory** file) { + return FmtStatus("unsupported"); +} DBStatus DBSnapshot::EnvSyncDirectory(rocksdb::Directory* file) { return FmtStatus("unsupported"); } -DBStatus DBSnapshot::EnvCloseDirectory(rocksdb::Directory* file) { return FmtStatus("unsupported"); } -DBStatus DBSnapshot::EnvRenameFile(DBSlice oldname, DBSlice newname) { return FmtStatus("unsupported"); } +DBStatus DBSnapshot::EnvCloseDirectory(rocksdb::Directory* file) { + return FmtStatus("unsupported"); +} +DBStatus DBSnapshot::EnvRenameFile(DBSlice oldname, DBSlice newname) { + return FmtStatus("unsupported"); +} } // namespace cockroach diff --git a/c-deps/libroach/snapshot.h b/c-deps/libroach/snapshot.h index 87735fbaa943..5ed47d852d1d 100644 --- a/c-deps/libroach/snapshot.h +++ b/c-deps/libroach/snapshot.h @@ -47,7 +47,8 @@ struct DBSnapshot : public DBEngine { virtual DBStatus EnvDeleteDirAndFiles(DBSlice dir); virtual DBStatus EnvLinkFile(DBSlice oldname, DBSlice newname); virtual DBStatus EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file); - virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, int* n); + virtual DBStatus EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, + int* n); virtual DBStatus EnvCloseReadableFile(rocksdb::RandomAccessFile* file); virtual DBStatus EnvOpenDirectory(DBSlice path, rocksdb::Directory** file); virtual DBStatus EnvSyncDirectory(rocksdb::Directory* file); diff --git a/c-deps/libroach/sst_dump.cc b/c-deps/libroach/sst_dump.cc index 6913219d0cf2..46582ba6238a 100644 --- a/c-deps/libroach/sst_dump.cc +++ b/c-deps/libroach/sst_dump.cc @@ -8,8 +8,8 @@ // by the Apache License, Version 2.0, included in the file // licenses/APL.txt. 
-#include #include +#include void DBRunSSTDump(int argc, char** argv) { rocksdb::SSTDumpTool tool; diff --git a/c-deps/libroach/table_props.cc b/c-deps/libroach/table_props.cc index e0aedc14eff1..a57f60f525f9 100644 --- a/c-deps/libroach/table_props.cc +++ b/c-deps/libroach/table_props.cc @@ -9,9 +9,9 @@ // licenses/APL.txt. #include "table_props.h" -#include #include #include +#include #include #include "encoding.h" @@ -109,9 +109,8 @@ class DeleteRangeTblPropCollector : public rocksdb::TablePropertiesCollector { return rocksdb::Status::OK(); } - rocksdb::Status AddUserKey(const rocksdb::Slice&, const rocksdb::Slice&, - rocksdb::EntryType type, rocksdb::SequenceNumber, - uint64_t) override { + rocksdb::Status AddUserKey(const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::EntryType type, + rocksdb::SequenceNumber, uint64_t) override { if (type == rocksdb::kEntryRangeDeletion) { ntombstones_++; } From a25a9c92b4aa4dab7067077e894776bcdd3470da Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Wed, 29 Jan 2020 00:40:09 -0500 Subject: [PATCH 2/3] storage/engine: expose FailOnMoreRecent MVCC{Get/Scan}Option Relates to #40205. This change introduces a new option to `MVCCGetOptions` and `MVCCScanOptions` that causes reads to throw an error if they observe an MVCC version at a timestamp above their read timestamp. Specifically, when the option is enabled, a `WriteTooOldError` will be returned if a scan observes an MVCC version with a timestamp above the read. Similarly, a `WriteIntentError` will be returned if a scan observes another transaction's intent, even if that intent has a timestamp above the scan's read timestamp. This option will be used in the future by `ScanRequests` and `ReverseScanRequests` that intend to acquire unreplicated key-level locks, which will power `SELECT FOR UPDATE`. These scans do not want to ignore existing MVCC versions at higher timestamps like traditional scans do. Instead, they want to throw errors and force their transaction to increase its timestamp through either a refresh or a retry. This was previously prototyped in 4a8e8dc. Interestingly, this is not new logic to the MVCC layer. This behavior is exactly the same as that of the initial key-value lookup performed during MVCC writes (see `mvccPutInternal`). It's fitting that behavior needed for `SELECT FOR UPDATE` would mirror that already exhibited by the read portion of read-write operations. This also hints at an opportunity to potentially use this option to merge the two implementations and get rid of custom logic like `mvccGetInternal` that only exists on the write path. We'd need to be careful about doing so though, as this code is heavily tuned. 
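For illustration only (not part of this patch): a minimal sketch of how a locking read, e.g. one backing `SELECT FOR UPDATE`, might opt in to the new behavior. It relies on the `MVCCGet` signature and the `MVCCGetOptions.FailOnMoreRecent` field introduced below; the wrapper function, its parameters, and the error handling are hypothetical and assume the code lives alongside the `storage/engine` package.

```go
// Sketch: a locking read that fails instead of silently reading below a
// more recent write. All names other than MVCCGet/MVCCGetOptions and the
// roachpb error types are illustrative.
func getForUpdate(
	ctx context.Context, reader Reader, key roachpb.Key,
	readTS hlc.Timestamp, txn *roachpb.Transaction,
) (*roachpb.Value, error) {
	val, _, err := MVCCGet(ctx, reader, key, readTS, MVCCGetOptions{
		Txn:              txn,
		FailOnMoreRecent: true,
	})
	switch err.(type) {
	case nil:
		return val, nil
	case *roachpb.WriteTooOldError:
		// A committed version exists above readTS: the txn must refresh or
		// retry at a higher timestamp before it can lock the key.
		return nil, err
	case *roachpb.WriteIntentError:
		// Another txn's intent was observed, even if it sits above readTS;
		// it is handled like any other conflicting intent.
		return nil, err
	default:
		return nil, err
	}
}
```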
Release note: None --- c-deps/libroach/include/libroach.h | 5 +- c-deps/libroach/mvcc.cc | 14 +- c-deps/libroach/mvcc.h | 60 +++++-- pkg/roachpb/errors.go | 11 ++ pkg/storage/engine/mvcc.go | 98 ++++++---- pkg/storage/engine/mvcc_history_test.go | 10 +- pkg/storage/engine/pebble_mvcc_scanner.go | 57 ++++-- pkg/storage/engine/rocksdb.go | 38 +++- .../mvcc_histories/read_fail_on_more_recent | 168 ++++++++++++++++++ 9 files changed, 376 insertions(+), 85 deletions(-) create mode 100644 pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent diff --git a/c-deps/libroach/include/libroach.h b/c-deps/libroach/include/libroach.h index ecb7ac3ad3f5..18bb4ec8719c 100644 --- a/c-deps/libroach/include/libroach.h +++ b/c-deps/libroach/include/libroach.h @@ -358,15 +358,16 @@ typedef struct { DBStatus status; DBChunkedBuffer data; DBSlice intents; + DBTimestamp write_too_old_timestamp; DBTimestamp uncertainty_timestamp; DBSlice resume_key; } DBScanResults; DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn, - bool inconsistent, bool tombstones); + bool inconsistent, bool tombstones, bool fail_on_more_recent); DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, int64_t max_keys, DBTxn txn, bool inconsistent, bool reverse, - bool tombstones); + bool tombstones, bool fail_on_more_recent); // DBStatsResult contains various runtime stats for RocksDB. typedef struct { diff --git a/c-deps/libroach/mvcc.cc b/c-deps/libroach/mvcc.cc index c838387ff9f6..4518f250ea07 100644 --- a/c-deps/libroach/mvcc.cc +++ b/c-deps/libroach/mvcc.cc @@ -269,28 +269,28 @@ DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, int64_ } DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn, - bool inconsistent, bool tombstones) { + bool inconsistent, bool tombstones, bool fail_on_more_recent) { // Get is implemented as a scan where we retrieve a single key. We specify an // empty key for the end key which will ensure we don't retrieve a key // different than the start key. This is a bit of a hack. 
const DBSlice end = {0, 0}; ScopedStats scoped_iter(iter); mvccForwardScanner scanner(iter, key, end, timestamp, 1 /* max_keys */, txn, inconsistent, - tombstones); + tombstones, fail_on_more_recent); return scanner.get(); } DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, int64_t max_keys, DBTxn txn, bool inconsistent, bool reverse, - bool tombstones) { + bool tombstones, bool fail_on_more_recent) { ScopedStats scoped_iter(iter); if (reverse) { - mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent, - tombstones); + mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent, tombstones, + fail_on_more_recent); return scanner.scan(); } else { - mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent, - tombstones); + mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent, tombstones, + fail_on_more_recent); return scanner.scan(); } } diff --git a/c-deps/libroach/mvcc.h b/c-deps/libroach/mvcc.h index 3ba99cfa7e55..870393f47bea 100644 --- a/c-deps/libroach/mvcc.h +++ b/c-deps/libroach/mvcc.h @@ -48,7 +48,7 @@ static const int kMaxItersBeforeSeek = 10; template class mvccScanner { public: mvccScanner(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, int64_t max_keys, - DBTxn txn, bool inconsistent, bool tombstones) + DBTxn txn, bool inconsistent, bool tombstones, bool fail_on_more_recent) : iter_(iter), iter_rep_(iter->rep.get()), start_key_(ToSlice(start)), @@ -62,6 +62,7 @@ template class mvccScanner { txn_ignored_seqnums_(txn.ignored_seqnums), inconsistent_(inconsistent), tombstones_(tombstones), + fail_on_more_recent_(fail_on_more_recent), check_uncertainty_(timestamp < txn.max_timestamp), kvs_(new chunkedBuffer), intents_(new rocksdb::WriteBatch), @@ -254,6 +255,13 @@ template class mvccScanner { return true; } + bool writeTooOldError(DBTimestamp ts) { + results_.write_too_old_timestamp = ts; + kvs_->Clear(); + intents_->Clear(); + return false; + } + bool uncertaintyError(DBTimestamp ts) { results_.uncertainty_timestamp = ts; kvs_->Clear(); @@ -276,8 +284,15 @@ template class mvccScanner { return addAndAdvance(cur_value_); } + if (fail_on_more_recent_) { + // 2. Our txn's read timestamp is less than the most recent + // version's timestamp and the scanner has been configured + // to throw a write too old error on more recent versions. + return writeTooOldError(cur_timestamp_); + } + if (check_uncertainty_) { - // 2. Our txn's read timestamp is less than the max timestamp + // 3. Our txn's read timestamp is less than the max timestamp // seen by the txn. We need to check for clock uncertainty // errors. if (txn_max_timestamp_ >= cur_timestamp_) { @@ -288,7 +303,7 @@ template class mvccScanner { return seekVersion(txn_max_timestamp_, true); } - // 3. Our txn's read timestamp is greater than or equal to the + // 4. Our txn's read timestamp is greater than or equal to the // max timestamp seen by the txn so clock uncertainty checks are // unnecessary. We need to seek to the desired version of the // value (i.e. one with a timestamp earlier than our read @@ -305,7 +320,7 @@ template class mvccScanner { } if (meta_.has_raw_bytes()) { - // 4. Emit immediately if the value is inline. + // 5. Emit immediately if the value is inline. 
return addAndAdvance(meta_.raw_bytes()); } @@ -326,8 +341,12 @@ template class mvccScanner { // Intents for other transactions are visible at or below: // max(txn.max_timestamp, read_timestamp) const DBTimestamp max_visible_timestamp = check_uncertainty_ ? txn_max_timestamp_ : timestamp_; - if (max_visible_timestamp < meta_timestamp && !own_intent) { - // 5. The key contains an intent, but we're reading before the + // ... unless we're intending on failing on more recent writes, + // in which case other transaction's intents are always visible. + const bool other_intent_visible = + max_visible_timestamp >= meta_timestamp || fail_on_more_recent_; + if (!own_intent && !other_intent_visible) { + // 6. The key contains an intent, but we're reading before the // intent. Seek to the desired version. Note that if we own the // intent (i.e. we're reading transactionally) we want to read // the intent regardless of our read timestamp and fall into @@ -336,7 +355,7 @@ template class mvccScanner { } if (inconsistent_) { - // 6. The key contains an intent and we're doing an inconsistent + // 7. The key contains an intent and we're doing an inconsistent // read at a timestamp newer than the intent. We ignore the // intent by insisting that the timestamp we're reading at is a // historical timestamp < the intent timestamp. However, we @@ -354,24 +373,30 @@ template class mvccScanner { } if (!own_intent) { - // 7. The key contains an intent which was not written by our - // transaction and our read timestamp is newer than that of the - // intent. Note that this will trigger an error on the Go - // side. We continue scanning so that we can return all of the - // intents in the scan range. + // 8. The key contains an intent which was not written by our + // transaction and either: + // - our read timestamp is equal to or newer than that of the + // intent + // - our read timestamp is older than that of the intent but + // the intent is in our transaction's uncertainty interval + // - our read timestamp is older than that of the intent but + // we want to fail on more recent writes + // Note that this will trigger an error on the Go side. We + // continue scanning so that we can return all of the intents + // in the scan range. intents_->Put(cur_raw_key_, cur_value_); return advanceKey(); } if (txn_epoch_ == meta_.txn().epoch()) { if (txn_sequence_ >= meta_.txn().sequence() && !seqNumIsIgnored(meta_.txn().sequence())) { - // 8. We're reading our own txn's intent at an equal or higher sequence. + // 9. We're reading our own txn's intent at an equal or higher sequence. // Note that we read at the intent timestamp, not at our read timestamp // as the intent timestamp may have been pushed forward by another // transaction. Txn's always need to read their own writes. return seekVersion(meta_timestamp, false); } else { - // 9. We're reading our own txn's intent at a lower sequence than is + // 10. We're reading our own txn's intent at a lower sequence than is // currently present in the intent. This means the intent we're seeing // was written at a higher sequence than the read and that there may or // may not be earlier versions of the intent (with lower sequence @@ -382,7 +407,7 @@ template class mvccScanner { if (found) { return advanceKey(); } - // 10. If no value in the intent history has a sequence number equal to + // 11. If no value in the intent history has a sequence number equal to // or less than the read, we must ignore the intents laid down by the // transaction all together. 
We ignore the intent by insisting that the // timestamp we're reading at is a historical timestamp < the intent @@ -392,7 +417,7 @@ template class mvccScanner { } if (txn_epoch_ < meta_.txn().epoch()) { - // 11. We're reading our own txn's intent but the current txn has + // 12. We're reading our own txn's intent but the current txn has // an earlier epoch than the intent. Return an error so that the // earlier incarnation of our transaction aborts (presumably // this is some operation that was retried). @@ -400,7 +425,7 @@ template class mvccScanner { txn_epoch_, meta_.txn().epoch())); } - // 12. We're reading our own txn's intent but the current txn has a + // 13. We're reading our own txn's intent but the current txn has a // later epoch than the intent. This can happen if the txn was // restarted and an earlier iteration wrote the value we're now // reading. In this case, we ignore the intent and read the @@ -729,6 +754,7 @@ template class mvccScanner { const DBIgnoredSeqNums txn_ignored_seqnums_; const bool inconsistent_; const bool tombstones_; + const bool fail_on_more_recent_; const bool check_uncertainty_; DBScanResults results_; std::unique_ptr kvs_; diff --git a/pkg/roachpb/errors.go b/pkg/roachpb/errors.go index eb43fd94f4ac..22b6fd3e0163 100644 --- a/pkg/roachpb/errors.go +++ b/pkg/roachpb/errors.go @@ -588,6 +588,17 @@ func (e *WriteIntentError) message(_ *Error) string { var _ ErrorDetailInterface = &WriteIntentError{} +// NewWriteTooOldError creates a new write too old error. The function accepts +// the timestamp of the operation that hit the error, along with the timestamp +// immediately after the existing write which had a higher timestamp and which +// caused the error. +func NewWriteTooOldError(operationTS, actualTS hlc.Timestamp) *WriteTooOldError { + return &WriteTooOldError{ + Timestamp: operationTS, + ActualTimestamp: actualTS, + } +} + func (e *WriteTooOldError) Error() string { return e.message(nil) } diff --git a/pkg/storage/engine/mvcc.go b/pkg/storage/engine/mvcc.go index f419cd7cda3c..971fab2056a8 100644 --- a/pkg/storage/engine/mvcc.go +++ b/pkg/storage/engine/mvcc.go @@ -707,9 +707,20 @@ func (b *getBuffer) release() { // MVCCGetOptions bundles options for the MVCCGet family of functions. type MVCCGetOptions struct { // See the documentation for MVCCGet for information on these parameters. - Inconsistent bool - Tombstones bool - Txn *roachpb.Transaction + Inconsistent bool + Tombstones bool + FailOnMoreRecent bool + Txn *roachpb.Transaction +} + +func (opts *MVCCGetOptions) validate() error { + if opts.Inconsistent && opts.Txn != nil { + return errors.Errorf("cannot allow inconsistent reads within a transaction") + } + if opts.Inconsistent && opts.FailOnMoreRecent { + return errors.Errorf("cannot allow inconsistent reads with fail on more recent option") + } + return nil } // MVCCGet returns the most recent value for the specified key whose timestamp @@ -727,6 +738,12 @@ type MVCCGetOptions struct { // // Note that transactional gets must be consistent. Put another way, only // non-transactional gets may be inconsistent. +// +// When reading in "fail on more recent" mode, a WriteTooOldError will be +// returned if the read observes a version with a timestamp above the read +// timestamp. Similarly, a WriteIntentError will be returned if the read +// observes another transaction's intent, even if it has a timestamp above +// the read timestamp. 
func MVCCGet( ctx context.Context, reader Reader, key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions, ) (*roachpb.Value, *roachpb.Intent, error) { @@ -738,14 +755,14 @@ func MVCCGet( func mvccGet( ctx context.Context, iter Iterator, key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions, ) (value *roachpb.Value, intent *roachpb.Intent, err error) { + if len(key) == 0 { + return nil, nil, emptyKeyError() + } if timestamp.WallTime < 0 { return nil, nil, errors.Errorf("cannot write to %q at timestamp %s", key, timestamp) } - if opts.Inconsistent && opts.Txn != nil { - return nil, nil, errors.Errorf("cannot allow inconsistent reads within a transaction") - } - if len(key) == 0 { - return nil, nil, emptyKeyError() + if err := opts.validate(); err != nil { + return nil, nil, err } // If the iterator has a specialized implementation, defer to that. @@ -760,12 +777,13 @@ func mvccGet( // specify an empty key for the end key which will ensure we don't retrieve a // key different than the start key. This is a bit of a hack. *mvccScanner = pebbleMVCCScanner{ - parent: iter, - start: key, - ts: timestamp, - maxKeys: 1, - inconsistent: opts.Inconsistent, - tombstones: opts.Tombstones, + parent: iter, + start: key, + ts: timestamp, + maxKeys: 1, + inconsistent: opts.Inconsistent, + tombstones: opts.Tombstones, + failOnMoreRecent: opts.FailOnMoreRecent, } mvccScanner.init(opts.Txn) @@ -1662,9 +1680,7 @@ func mvccPutInternal( // instead of allowing their transactions to continue and be retried // before committing. writeTimestamp.Forward(metaTimestamp.Next()) - maybeTooOldErr = &roachpb.WriteTooOldError{ - Timestamp: readTimestamp, ActualTimestamp: writeTimestamp, - } + maybeTooOldErr = roachpb.NewWriteTooOldError(readTimestamp, writeTimestamp) // If we're in a transaction, always get the value at the orig // timestamp. if txn != nil { @@ -2290,12 +2306,12 @@ func mvccScanToBytes( timestamp hlc.Timestamp, opts MVCCScanOptions, ) (kvData [][]byte, numKVs int64, resumeSpan *roachpb.Span, intents []roachpb.Intent, err error) { - if opts.Inconsistent && opts.Txn != nil { - return nil, 0, nil, nil, errors.Errorf("cannot allow inconsistent reads within a transaction") - } if len(endKey) == 0 { return nil, 0, nil, nil, emptyKeyError() } + if err := opts.validate(); err != nil { + return nil, 0, nil, nil, err + } if max == 0 { resumeSpan = &roachpb.Span{Key: key, EndKey: endKey} return nil, 0, resumeSpan, nil, nil @@ -2310,14 +2326,15 @@ func mvccScanToBytes( defer pebbleMVCCScannerPool.Put(mvccScanner) *mvccScanner = pebbleMVCCScanner{ - parent: iter, - reverse: opts.Reverse, - start: key, - end: endKey, - ts: timestamp, - maxKeys: max, - inconsistent: opts.Inconsistent, - tombstones: opts.Tombstones, + parent: iter, + reverse: opts.Reverse, + start: key, + end: endKey, + ts: timestamp, + maxKeys: max, + inconsistent: opts.Inconsistent, + tombstones: opts.Tombstones, + failOnMoreRecent: opts.FailOnMoreRecent, } mvccScanner.init(opts.Txn) @@ -2411,10 +2428,21 @@ type MVCCScanOptions struct { // to return no results. // See the documentation for MVCCScan for information on these parameters. 
- Inconsistent bool - Tombstones bool - Reverse bool - Txn *roachpb.Transaction + Inconsistent bool + Tombstones bool + Reverse bool + FailOnMoreRecent bool + Txn *roachpb.Transaction +} + +func (opts *MVCCScanOptions) validate() error { + if opts.Inconsistent && opts.Txn != nil { + return errors.Errorf("cannot allow inconsistent reads within a transaction") + } + if opts.Inconsistent && opts.FailOnMoreRecent { + return errors.Errorf("cannot allow inconsistent reads with fail on more recent option") + } + return nil } // MVCCScan scans the key range [key, endKey) in the provided reader up to some @@ -2448,6 +2476,12 @@ type MVCCScanOptions struct { // // Note that transactional scans must be consistent. Put another way, only // non-transactional scans may be inconsistent. +// +// When scanning in "fail on more recent" mode, a WriteTooOldError will be +// returned if the scan observes a version with a timestamp above the read +// timestamp. Similarly, a WriteIntentError will be returned if the scan +// observes another transaction's intent, even if it has a timestamp above +// the read timestamp. func MVCCScan( ctx context.Context, reader Reader, diff --git a/pkg/storage/engine/mvcc_history_test.go b/pkg/storage/engine/mvcc_history_test.go index fd4dee347eb0..3570d1c986ab 100644 --- a/pkg/storage/engine/mvcc_history_test.go +++ b/pkg/storage/engine/mvcc_history_test.go @@ -48,10 +48,10 @@ import ( // // cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] // del [t=] [ts=[,]] [resolve [status=]] k= -// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] +// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] // increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] // put [t=] [ts=[,]] [resolve [status=]] k= v= [raw] -// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] +// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] // // merge [ts=[,]] k= v= [raw] // @@ -598,6 +598,9 @@ func cmdGet(e *evalCtx) error { if e.hasArg("tombstones") { opts.Tombstones = true } + if e.hasArg("failOnMoreRecent") { + opts.FailOnMoreRecent = true + } val, intent, err := MVCCGet(e.ctx, e.engine, key, ts, opts) if err != nil { return err @@ -691,6 +694,9 @@ func cmdScan(e *evalCtx) error { if e.hasArg("reverse") { opts.Reverse = true } + if e.hasArg("failOnMoreRecent") { + opts.FailOnMoreRecent = true + } max := int64(-1) if e.hasArg("max") { var imax int diff --git a/pkg/storage/engine/pebble_mvcc_scanner.go b/pkg/storage/engine/pebble_mvcc_scanner.go index 0c4cdb207fb4..98678270b065 100644 --- a/pkg/storage/engine/pebble_mvcc_scanner.go +++ b/pkg/storage/engine/pebble_mvcc_scanner.go @@ -108,6 +108,7 @@ type pebbleMVCCScanner struct { // Bools copied over from MVCC{Scan,Get}Options. See the comment on the // package level MVCCScan for what these mean. inconsistent, tombstones bool + failOnMoreRecent bool checkUncertainty bool keyBuf []byte savedBuf []byte @@ -239,6 +240,16 @@ func (p *pebbleMVCCScanner) getFromIntentHistory() bool { return true } +// Returns a write too old error with the specified timestamp. +func (p *pebbleMVCCScanner) writeTooOldError(ts hlc.Timestamp) bool { + // The txn can't write at the existing timestamp, so we provide the error + // with the timestamp immediately after it. 
+ p.err = roachpb.NewWriteTooOldError(p.ts, ts.Next()) + p.results.clear() + p.intents.Reset() + return false +} + // Returns an uncertainty error with the specified timestamp and p.txn. func (p *pebbleMVCCScanner) uncertaintyError(ts hlc.Timestamp) bool { p.err = roachpb.NewReadWithinUncertaintyIntervalError(p.ts, ts, p.txn) @@ -258,8 +269,15 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { return p.addAndAdvance(p.curValue) } + if p.failOnMoreRecent { + // 2. Our txn's read timestamp is less than the most recent + // version's timestamp and the scanner has been configured + // to throw a write too old error on more recent versions. + return p.writeTooOldError(p.curTS) + } + if p.checkUncertainty { - // 2. Our txn's read timestamp is less than the max timestamp + // 3. Our txn's read timestamp is less than the max timestamp // seen by the txn. We need to check for clock uncertainty // errors. if p.curTS.LessEq(p.txn.MaxTimestamp) { @@ -269,7 +287,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { return p.seekVersion(p.txn.MaxTimestamp, true) } - // 3. Our txn's read timestamp is greater than or equal to the + // 4. Our txn's read timestamp is greater than or equal to the // max timestamp seen by the txn so clock uncertainty checks are // unnecessary. We need to seek to the desired version of the // value (i.e. one with a timestamp earlier than our read @@ -287,7 +305,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { return false } if len(p.meta.RawBytes) != 0 { - // 4. Emit immediately if the value is inline. + // 5. Emit immediately if the value is inline. return p.addAndAdvance(p.meta.RawBytes) } @@ -313,9 +331,10 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { if p.checkUncertainty { maxVisibleTS = p.txn.MaxTimestamp } + otherIntentVisible := metaTS.LessEq(maxVisibleTS) || p.failOnMoreRecent - if maxVisibleTS.Less(metaTS) && !ownIntent { - // 5. The key contains an intent, but we're reading before the + if !ownIntent && !otherIntentVisible { + // 6. The key contains an intent, but we're reading before the // intent. Seek to the desired version. Note that if we own the // intent (i.e. we're reading transactionally) we want to read // the intent regardless of our read timestamp and fall into @@ -324,7 +343,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { } if p.inconsistent { - // 6. The key contains an intent and we're doing an inconsistent + // 7. The key contains an intent and we're doing an inconsistent // read at a timestamp newer than the intent. We ignore the // intent by insisting that the timestamp we're reading at is a // historical timestamp < the intent timestamp. However, we @@ -347,11 +366,17 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { } if !ownIntent { - // 7. The key contains an intent which was not written by our - // transaction and our read timestamp is newer than that of the - // intent. Note that this will trigger an error on the Go - // side. We continue scanning so that we can return all of the - // intents in the scan range. + // 8. The key contains an intent which was not written by our + // transaction and either: + // - our read timestamp is equal to or newer than that of the + // intent + // - our read timestamp is older than that of the intent but + // the intent is in our transaction's uncertainty interval + // - our read timestamp is older than that of the intent but + // we want to fail on more recent writes + // Note that this will trigger an error higher up the stack. 
We + // continue scanning so that we can return all of the intents + // in the scan range. p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], p.curMVCCKey()) p.err = p.intents.Set(p.keyBuf, p.curValue, nil) if p.err != nil { @@ -362,14 +387,14 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { if p.txnEpoch == p.meta.Txn.Epoch { if p.txnSequence >= p.meta.Txn.Sequence && !enginepb.TxnSeqIsIgnored(p.meta.Txn.Sequence, p.txnIgnoredSeqNums) { - // 8. We're reading our own txn's intent at an equal or higher sequence. + // 9. We're reading our own txn's intent at an equal or higher sequence. // Note that we read at the intent timestamp, not at our read timestamp // as the intent timestamp may have been pushed forward by another // transaction. Txn's always need to read their own writes. return p.seekVersion(metaTS, false) } - // 9. We're reading our own txn's intent at a lower sequence than is + // 10. We're reading our own txn's intent at a lower sequence than is // currently present in the intent. This means the intent we're seeing // was written at a higher sequence than the read and that there may or // may not be earlier versions of the intent (with lower sequence @@ -382,7 +407,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { } return p.advanceKey() } - // 10. If no value in the intent history has a sequence number equal to + // 11. If no value in the intent history has a sequence number equal to // or less than the read, we must ignore the intents laid down by the // transaction all together. We ignore the intent by insisting that the // timestamp we're reading at is a historical timestamp < the intent @@ -391,7 +416,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { } if p.txnEpoch < p.meta.Txn.Epoch { - // 11. We're reading our own txn's intent but the current txn has + // 12. We're reading our own txn's intent but the current txn has // an earlier epoch than the intent. Return an error so that the // earlier incarnation of our transaction aborts (presumably // this is some operation that was retried). @@ -400,7 +425,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { return false } - // 12. We're reading our own txn's intent but the current txn has a + // 13. We're reading our own txn's intent but the current txn has a // later epoch than the intent. This can happen if the txn was // restarted and an earlier iteration wrote the value we're now // reading. 
In this case, we ignore the intent and read the diff --git a/pkg/storage/engine/rocksdb.go b/pkg/storage/engine/rocksdb.go index 87266e0f1093..c4bf78ad71af 100644 --- a/pkg/storage/engine/rocksdb.go +++ b/pkg/storage/engine/rocksdb.go @@ -2415,12 +2415,15 @@ func (r *rocksDBIterator) MVCCGet( r.clearState() state := C.MVCCGet( r.iter, goToCSlice(key), goToCTimestamp(timestamp), goToCTxn(opts.Txn), - C.bool(opts.Inconsistent), C.bool(opts.Tombstones), + C.bool(opts.Inconsistent), C.bool(opts.Tombstones), C.bool(opts.FailOnMoreRecent), ) if err := statusToError(state.status); err != nil { return nil, nil, err } + if err := writeTooOldToError(timestamp, state.write_too_old_timestamp); err != nil { + return nil, nil, err + } if err := uncertaintyToError(timestamp, state.uncertainty_timestamp, opts.Txn); err != nil { return nil, nil, err } @@ -2484,11 +2487,15 @@ func (r *rocksDBIterator) MVCCScan( goToCTimestamp(timestamp), C.int64_t(max), goToCTxn(opts.Txn), C.bool(opts.Inconsistent), C.bool(opts.Reverse), C.bool(opts.Tombstones), + C.bool(opts.FailOnMoreRecent), ) if err := statusToError(state.status); err != nil { return nil, 0, nil, nil, err } + if err := writeTooOldToError(timestamp, state.write_too_old_timestamp); err != nil { + return nil, 0, nil, nil, err + } if err := uncertaintyToError(timestamp, state.uncertainty_timestamp, opts.Txn); err != nil { return nil, 0, nil, nil, err } @@ -2715,6 +2722,13 @@ func goToCTimestamp(ts hlc.Timestamp) C.DBTimestamp { } } +func cToGoTimestamp(ts C.DBTimestamp) hlc.Timestamp { + return hlc.Timestamp{ + WallTime: int64(ts.wall_time), + Logical: int32(ts.logical), + } +} + func goToCTxn(txn *roachpb.Transaction) C.DBTxn { var r C.DBTxn if txn != nil { @@ -2745,16 +2759,22 @@ func statusToError(s C.DBStatus) error { return &Error{msg: cStringToGoString(s)} } +func writeTooOldToError(readTS hlc.Timestamp, existingCTS C.DBTimestamp) error { + existingTS := cToGoTimestamp(existingCTS) + if !existingTS.IsEmpty() { + // The txn can't write at the existing timestamp, so we provide the + // error with the timestamp immediately after it. 
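+	// Editor's note (illustrative, not in the original change): the C side
+	// reports a conflict by populating write_too_old_timestamp and leaves it
+	// zero otherwise, so the IsEmpty check above keeps the common,
+	// conflict-free case from allocating an error. existingTS.Next() mirrors
+	// the Go scanner: the error advertises the first timestamp above the
+	// conflicting version.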
+ return roachpb.NewWriteTooOldError(readTS, existingTS.Next()) + } + return nil +} + func uncertaintyToError( - readTS hlc.Timestamp, existingTS C.DBTimestamp, txn *roachpb.Transaction, + readTS hlc.Timestamp, existingCTS C.DBTimestamp, txn *roachpb.Transaction, ) error { - if existingTS.wall_time != 0 || existingTS.logical != 0 { - return roachpb.NewReadWithinUncertaintyIntervalError( - readTS, hlc.Timestamp{ - WallTime: int64(existingTS.wall_time), - Logical: int32(existingTS.logical), - }, - txn) + existingTS := cToGoTimestamp(existingCTS) + if !existingTS.IsEmpty() { + return roachpb.NewReadWithinUncertaintyIntervalError(readTS, existingTS, txn) } return nil } diff --git a/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent b/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent new file mode 100644 index 000000000000..3bbeec50dd5e --- /dev/null +++ b/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent @@ -0,0 +1,168 @@ +# Setup: +# k1: value @ ts 10 +# k2: intent @ ts 10 + +run ok +put k=k1 v=v ts=10,0 +---- +>> at end: +data: "k1"/0.000000010,0 -> /BYTES/v + +run ok +with t=A + txn_begin ts=10,0 + put k=k2 v=v +---- +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=0.000000010,0 min=0,0 seq=0} rw=true stat=PENDING rts=0.000000010,0 wto=false max=0,0 +data: "k1"/0.000000010,0 -> /BYTES/v +meta: "k2"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=0.000000010,0 min=0,0 seq=0} ts=0.000000010,0 del=false klen=12 vlen=6 +data: "k2"/0.000000010,0 -> /BYTES/v + +# Test cases: +# +# for k in (k1, k2): +# for op in (get, scan): +# for ts in (9, 10, 11): +# for failOnMoreRecent in (false, true): +# testCase() +# + +run ok +get k=k1 ts=9,0 +---- +get: "k1" -> + +run error +get k=k1 ts=9,0 failOnMoreRecent +---- +error: (*roachpb.WriteTooOldError:) WriteTooOldError: write at timestamp 0.000000009,0 too old; wrote at 0.000000010,1 + +run ok +get k=k1 ts=10,0 +---- +get: "k1" -> /BYTES/v @0.000000010,0 + +run ok +get k=k1 ts=10,0 failOnMoreRecent +---- +get: "k1" -> /BYTES/v @0.000000010,0 + +run ok +get k=k1 ts=11,0 +---- +get: "k1" -> /BYTES/v @0.000000010,0 + +run ok +get k=k1 ts=11,0 failOnMoreRecent +---- +get: "k1" -> /BYTES/v @0.000000010,0 + +run ok +scan k=k1 end=k2 ts=9,0 +---- +scan: "k1"-"k2" -> + +run error +scan k=k1 end=k2 ts=9,0 failOnMoreRecent +---- +scan: "k1"-"k2" -> +error: (*roachpb.WriteTooOldError:) WriteTooOldError: write at timestamp 0.000000009,0 too old; wrote at 0.000000010,1 + +run ok +scan k=k1 end=k2 ts=10,0 +---- +scan: "k1" -> /BYTES/v @0.000000010,0 + +run ok +scan k=k1 end=k2 ts=10,0 failOnMoreRecent +---- +scan: "k1" -> /BYTES/v @0.000000010,0 + +run ok +scan k=k1 end=k2 ts=11,0 +---- +scan: "k1" -> /BYTES/v @0.000000010,0 + +run ok +scan k=k1 end=k2 ts=11,0 failOnMoreRecent +---- +scan: "k1" -> /BYTES/v @0.000000010,0 + +run ok +get k=k2 ts=9,0 +---- +get: "k2" -> + +run error +get k=k2 ts=9,0 failOnMoreRecent +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +get k=k2 ts=10,0 +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +get k=k2 ts=10,0 failOnMoreRecent +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +get k=k2 ts=11,0 +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +get k=k2 ts=11,0 failOnMoreRecent +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run ok +scan k=k2 end=k3 ts=9,0 +---- +scan: "k2"-"k3" -> + +run 
error +scan k=k2 end=k3 ts=9,0 failOnMoreRecent +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +scan k=k2 end=k3 ts=10,0 +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +scan k=k2 end=k3 ts=10,0 failOnMoreRecent +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +scan k=k2 end=k3 ts=11,0 +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +scan k=k2 end=k3 ts=11,0 failOnMoreRecent +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +# The failOnMoreRecent and inconsistent options cannot be used together. + +run error +get k=k1 ts=9,0 inconsistent failOnMoreRecent +---- +error: (*withstack.withStack:) cannot allow inconsistent reads with fail on more recent option + +run error +scan k=k1 end=k2 ts=9,0 inconsistent failOnMoreRecent +---- +scan: "k1"-"k2" -> +error: (*withstack.withStack:) cannot allow inconsistent reads with fail on more recent option From 775986821fa21d93d7bd1a7b71883fd908f91c2b Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Thu, 30 Jan 2020 23:57:49 -0500 Subject: [PATCH 3/3] storage/engine: output result even on cmdGet error cmdScan was already doing this, so we should be consistent or it becomes confusing why scan results show up on errors cases but not get results. --- pkg/storage/engine/mvcc_history_test.go | 8 ++++---- pkg/storage/engine/testdata/mvcc_histories/empty_key | 1 + .../engine/testdata/mvcc_histories/get_negative_timestamp | 1 + .../testdata/mvcc_histories/read_fail_on_more_recent | 7 +++++++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pkg/storage/engine/mvcc_history_test.go b/pkg/storage/engine/mvcc_history_test.go index 3570d1c986ab..4eeffa1a3908 100644 --- a/pkg/storage/engine/mvcc_history_test.go +++ b/pkg/storage/engine/mvcc_history_test.go @@ -602,9 +602,9 @@ func cmdGet(e *evalCtx) error { opts.FailOnMoreRecent = true } val, intent, err := MVCCGet(e.ctx, e.engine, key, ts, opts) - if err != nil { - return err - } + // NB: the error is returned below. This ensures the test can + // ascertain no result is populated in the intent when an error + // occurs. 
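[Editorial aside] The NB comment above describes the harness convention this commit adopts: buffer whatever result the command produced, and only then hand the error back to the datadriven runner, so the expectations can show both (as in the updated testdata below). A minimal self-contained sketch of that shape, using toy stand-ins rather than the real evalCtx and MVCCGet:

    package main

    import (
        "bytes"
        "errors"
        "fmt"
    )

    // lookup stands in for MVCCGet: it can yield an error while the command
    // still has something to report (here, simply the absence of a value).
    func lookup(key string) (string, error) {
        if key == "k2" {
            return "", errors.New(`conflicting intents on "k2"`)
        }
        return "v", nil
    }

    // cmdGet mirrors the harness shape: write the result line first, return
    // the error last, so a failing command still contributes output.
    func cmdGet(buf *bytes.Buffer, key string) error {
        val, err := lookup(key)
        if val != "" {
            fmt.Fprintf(buf, "get: %q -> %s\n", key, val)
        } else {
            fmt.Fprintf(buf, "get: %q ->\n", key)
        }
        return err
    }

    func main() {
        var buf bytes.Buffer
        err := cmdGet(&buf, "k2")
        fmt.Print(buf.String())
        if err != nil {
            fmt.Println("error:", err)
        }
    }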
if intent != nil { fmt.Fprintf(e.results.buf, "get: %v -> intent {%s} %s\n", key, intent.Txn, intent.Status) } @@ -613,7 +613,7 @@ func cmdGet(e *evalCtx) error { } else { fmt.Fprintf(e.results.buf, "get: %v -> \n", key) } - return nil + return err } func cmdIncrement(e *evalCtx) error { diff --git a/pkg/storage/engine/testdata/mvcc_histories/empty_key b/pkg/storage/engine/testdata/mvcc_histories/empty_key index 6e4f937b85b1..08a7960ffe49 100644 --- a/pkg/storage/engine/testdata/mvcc_histories/empty_key +++ b/pkg/storage/engine/testdata/mvcc_histories/empty_key @@ -1,6 +1,7 @@ run error get ts=0,1 k= ---- +get: /Min -> error: (*errors.fundamental:) attempted access to empty key run error diff --git a/pkg/storage/engine/testdata/mvcc_histories/get_negative_timestamp b/pkg/storage/engine/testdata/mvcc_histories/get_negative_timestamp index 268e039dea2d..ffcd2ce7744b 100644 --- a/pkg/storage/engine/testdata/mvcc_histories/get_negative_timestamp +++ b/pkg/storage/engine/testdata/mvcc_histories/get_negative_timestamp @@ -7,6 +7,7 @@ data: "k"/0.000000001,0 -> /BYTES/v run error get k=k ts=-1 ---- +get: "k" -> error: (*withstack.withStack:) cannot write to "k" at timestamp -0.000000001,0 diff --git a/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent b/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent index 3bbeec50dd5e..2a7b73bb956d 100644 --- a/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent +++ b/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent @@ -36,6 +36,7 @@ get: "k1" -> run error get k=k1 ts=9,0 failOnMoreRecent ---- +get: "k1" -> error: (*roachpb.WriteTooOldError:) WriteTooOldError: write at timestamp 0.000000009,0 too old; wrote at 0.000000010,1 run ok @@ -97,26 +98,31 @@ get: "k2" -> run error get k=k2 ts=9,0 failOnMoreRecent ---- +get: "k2" -> error: (*roachpb.WriteIntentError:) conflicting intents on "k2" run error get k=k2 ts=10,0 ---- +get: "k2" -> error: (*roachpb.WriteIntentError:) conflicting intents on "k2" run error get k=k2 ts=10,0 failOnMoreRecent ---- +get: "k2" -> error: (*roachpb.WriteIntentError:) conflicting intents on "k2" run error get k=k2 ts=11,0 ---- +get: "k2" -> error: (*roachpb.WriteIntentError:) conflicting intents on "k2" run error get k=k2 ts=11,0 failOnMoreRecent ---- +get: "k2" -> error: (*roachpb.WriteIntentError:) conflicting intents on "k2" run ok @@ -159,6 +165,7 @@ error: (*roachpb.WriteIntentError:) conflicting intents on "k2" run error get k=k1 ts=9,0 inconsistent failOnMoreRecent ---- +get: "k1" -> error: (*withstack.withStack:) cannot allow inconsistent reads with fail on more recent option run error