Skip to content

Commit

Permalink
KVStore: pick flexible region format to release-7.5 (#8643)
Browse files Browse the repository at this point in the history
close #8647
  • Loading branch information
CalvinNeo authored Jan 3, 2024
1 parent 4c122ae commit 452bf83
Show file tree
Hide file tree
Showing 9 changed files with 519 additions and 116 deletions.
1 change: 0 additions & 1 deletion dbms/src/Common/FailPoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ namespace DB
M(force_fail_to_create_etcd_session) \
M(force_remote_read_for_batch_cop_once) \
M(exception_new_dynamic_thread) \
M(force_region_persist_version) \
M(force_wait_index_timeout)

#define APPLY_FOR_FAILPOINTS(M) \
Expand Down
18 changes: 13 additions & 5 deletions dbms/src/Storages/KVStore/MultiRaft/PrehandleSnapshot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,13 @@ static inline std::tuple<ReadFromStreamResult, PrehandleResult> executeTransform
auto flag = std::any_cast<std::shared_ptr<std::atomic_uint64_t>>(v.value());
if (flag->load() == 1)
{
LOG_INFO(log, "Throw fake exception once");
flag->store(0);
throw Exception("fake exception once", ErrorCodes::REGION_DATA_SCHEMA_UPDATED);
}
else if (flag->load() == 2)
{
LOG_INFO(log, "Throw fake exception always");
throw Exception("fake exception", ErrorCodes::REGION_DATA_SCHEMA_UPDATED);
}
}
Expand Down Expand Up @@ -367,7 +369,7 @@ static void runInParallel(
= executeTransform(log, part_new_region, prehandle_task, job_type, dm_storage, part_sst_stream, opt, tmt);
LOG_INFO(
log,
"Finished extra parallel prehandle task limit {} write cf {} lock cf {} default cf {} dmfiles {} error {}, "
"Finished extra parallel prehandle task limit {} write_cf={} lock_cf={} default_cf={} dmfiles={} error={}, "
"split_id={} region_id={}",
limit_tag,
part_prehandle_result.stats.write_cf_keys,
Expand Down Expand Up @@ -474,8 +476,8 @@ void executeParallelTransform(
= executeTransform(log, new_region, prehandle_task, job_type, storage, sst_stream, opt, tmt);
LOG_INFO(
log,
"Finished extra parallel prehandle task limit {} write cf {} lock cf {} default cf {} dmfiles {} "
"error {}, split_id={}, "
"Finished extra parallel prehandle task limit={} write_cf {} lock_cf={} default_cf={} dmfiles={} "
"error={}, split_id={}, "
"region_id={}",
sst_stream->getSoftLimit()->toDebugString(),
head_prehandle_result.stats.write_cf_keys,
Expand Down Expand Up @@ -518,6 +520,7 @@ void executeParallelTransform(
{
// Once a prehandle has non-ok result, we quit further loop
result = ctx->gather_res[extra_id];
result.extra_msg = fmt::format(", from {}", extra_id);
break;
}
}
Expand All @@ -533,6 +536,7 @@ void executeParallelTransform(
{
// Otherwise, fallback to error handling or exception handling.
result = head_result;
result.extra_msg = fmt::format(", from {}", DM::SSTScanSoftLimit::HEAD_OR_ONLY_SPLIT);
}
}

Expand Down Expand Up @@ -585,7 +589,7 @@ PrehandleResult KVStore::preHandleSSTsToDTFiles(
if (unlikely(storage == nullptr))
{
// The storage must be physically dropped, throw exception and do cleanup.
throw Exception("", ErrorCodes::TABLE_IS_DROPPED);
throw Exception(ErrorCodes::TABLE_IS_DROPPED, "Can't get table");
}

// Get a gc safe point for compacting
Expand Down Expand Up @@ -660,7 +664,9 @@ PrehandleResult KVStore::preHandleSSTsToDTFiles(
if (force_decode)
{
// Can not decode data with `force_decode == true`, must be something wrong
throw Exception(result.extra_msg, ErrorCodes::REGION_DATA_SCHEMA_UPDATED);
throw Exception(
ErrorCodes::REGION_DATA_SCHEMA_UPDATED,
fmt::format("Force decode failed {}", result.extra_msg));
}

// Update schema and try to decode again
Expand All @@ -673,6 +679,8 @@ PrehandleResult KVStore::preHandleSSTsToDTFiles(
tmt.getSchemaSyncerManager()->syncTableSchema(context, keyspace_id, physical_table_id);
// Next time should force_decode
force_decode = true;
prehandle_result = PrehandleResult{};
prehandle_task->abort_flag.store(false);

continue;
}
Expand Down
168 changes: 168 additions & 0 deletions dbms/src/Storages/KVStore/MultiRaft/RegionSerde.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <Common/FailPoint.h>
#include <Storages/KVStore/Region.h>
#include <Storages/KVStore/Utils/SerializationHelper.h>

#include <memory>
#include <utility>

namespace DB
{

// Persist-format version used by default: `Region::serialize` writes it as the leading version tag,
// and `Region::deserialize` passes it to `deserializeImpl` as the version this binary understands.
constexpr UInt32 Region::CURRENT_VERSION = static_cast<UInt32>(RegionPersistVersion::V2);

/// Reads the header of one persisted extension from `buf`.
/// Returns {extension type tag, payload length}. The payload itself is left unread in `buf`.
std::pair<MaybeRegionPersistExtension, UInt32> getPersistExtensionTypeAndLength(ReadBuffer & buf)
{
    // Wire order is fixed: the type tag comes first, then the payload length.
    const auto type_tag = readBinary2<MaybeRegionPersistExtension>(buf);
    const auto payload_length = readBinary2<UInt32>(buf);
    // Note: `type_tag` is a raw integer and may not be a valid `RegionPersistExtension` value.
    return {type_tag, payload_length};
}

/// Appends one extension record to `wb`: type tag, payload length, then `size` bytes from `data`.
/// `cnt` is incremented by one after the record has been written.
/// Returns the number of bytes written for this record (tag + length field + payload).
size_t Region::writePersistExtension(
    UInt32 & cnt,
    WriteBuffer & wb,
    MaybeRegionPersistExtension ext_type,
    const char * data,
    UInt32 size)
{
    // Record header: type tag followed by payload length.
    auto header_bytes = writeBinary2(ext_type, wb);
    header_bytes += writeBinary2(size, wb);

    // Record payload.
    wb.write(data, size);

    // Only count the extension once it is fully written.
    ++cnt;
    return header_bytes + size;
}

/// Serializes this region into `buf` using `Region::CURRENT_VERSION` and no extensions.
/// Returns whatever `serializeImpl` returns: {total bytes written, applied index}.
std::tuple<size_t, UInt64> Region::serialize(WriteBuffer & buf) const
{
    // No extensions are emitted here: the handler writes nothing and leaves the counter untouched.
    auto write_no_extensions = [](UInt32 &, WriteBuffer &) -> size_t { return 0; };
    constexpr UInt32 no_extension_expected = 0;
    return serializeImpl(Region::CURRENT_VERSION, no_extension_expected, write_no_extensions, buf);
}

/// Writes the region to `buf` in this order: version tag, meta, (for version >= 2) flags word and
/// eager truncated index, extension records produced by `extra_handler`, then region data.
///
/// `binary_version`            version tag to write.
/// `expected_extension_count`  number of extensions announced in the flags word.
/// `extra_handler`             writes the extension records; it must bump its counter argument once per
///                             record and return the number of bytes it wrote.
///
/// Returns {total bytes written, applied index reported by `meta.serialize`}.
/// Fails a RUNTIME_CHECK if the handler did not write exactly `expected_extension_count` extensions.
std::tuple<size_t, UInt64> Region::serializeImpl(
    UInt32 binary_version,
    UInt32 expected_extension_count,
    std::function<size_t(UInt32 &, WriteBuffer &)> extra_handler,
    WriteBuffer & buf) const
{
    // The version tag is emitted before the shared lock on the region is acquired.
    size_t bytes_written = writeBinary2(binary_version, buf);

    std::shared_lock<std::shared_mutex> region_guard(mutex);

    // Meta section
    const auto [meta_bytes, applied_index] = meta.serialize(buf);
    bytes_written += meta_bytes;

    // Flags section, only present from version 2 onwards
    const bool has_flags_section = binary_version >= 2;
    if (has_flags_section)
    {
        static_assert(sizeof(eager_truncated_index) == sizeof(UInt64));
        // The upper 31 bits hold the number of extensions, the lowest bit is the eager gc flag.
        // NOTE(review): a count >= 2^31 would lose its top bit in this shift.
        const UInt32 extension_count_bits = expected_extension_count << 1;
        const UInt32 flags = extension_count_bits | RegionPersistFormat::HAS_EAGER_TRUNCATE_INDEX;
        bytes_written += writeBinary2(flags, buf);
        bytes_written += writeBinary2(eager_truncated_index, buf);
    }

    // Extension section: the handler reports how many records it produced through the counter.
    UInt32 actual_extension_count = 0;
    bytes_written += extra_handler(actual_extension_count, buf);
    RUNTIME_CHECK(expected_extension_count == actual_extension_count, expected_extension_count, actual_extension_count);

    // Data section
    bytes_written += data.serialize(buf);

    return {bytes_written, applied_index};
}

/// Restores a region from `buf`, treating `Region::CURRENT_VERSION` as the version this binary understands.
/// No extension is consumed here: the handler declines every extension it is offered.
RegionPtr Region::deserialize(ReadBuffer & buf, const TiFlashRaftProxyHelper * proxy_helper)
{
    auto decline_every_extension = [](UInt32, ReadBuffer &, UInt32) { return false; };
    return Region::deserializeImpl(Region::CURRENT_VERSION, decline_every_extension, buf, proxy_helper);
}

/// Restores a region from `buf`, reading in this order: version tag, meta, (for version >= 2) the flags
/// word with optional eager truncated index and extension records, then region data.
///
/// `current_version`  the version this binary is considered to be running.
/// `extra_handler`    called first for every extension as (type, buf, length); returning true means the
///                    handler took care of the extension (presumably it consumed `length` bytes of `buf`).
/// `buf`              source buffer positioned at the version tag.
/// `proxy_helper`     forwarded to the `Region` constructor.
///
/// Throws LOGICAL_ERROR when `current_version` <= 1 and the stored version is newer.
/// Fails a RUNTIME_CHECK on an extension whose type is below `MaxKnownFlag` but was not taken by `extra_handler`.
///
/// Currently supports:
/// 1. Vx -> Vy where x >= 2, y >= 3
/// 2. Vx -> V2 where x >= 2, in later 7.5
/// 3. V2(7.5.x) -> V2(7.5.0), if no extensions. V2 may inherit some extensions from upper version, and failed to clean it before downgrade to 7.5.0.
RegionPtr Region::deserializeImpl(
UInt32 current_version,
std::function<bool(UInt32, ReadBuffer &, UInt32)> extra_handler,
ReadBuffer & buf,
const TiFlashRaftProxyHelper * proxy_helper)
{
// The version tag is the first field of the persisted region.
const auto binary_version = readBinary2<UInt32>(buf);
if (current_version <= 1 && binary_version > current_version)
{
// Conform to https://github.com/pingcap/tiflash/blob/43f809fffde22d0af4c519be4546a5bf4dde30a2/dbms/src/Storages/KVStore/Region.cpp#L197
// When downgrade from x(where x > 1) -> 1, the old version will throw with "unexpected version".
// So we will also throw here.
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Don't support downgrading from {} to {}",
binary_version,
current_version);
}
// A version tag that is not a `RegionPersistVersion` enumerator is not an error: it is only logged,
// and parsing continues.
const auto binary_version_decoded = magic_enum::enum_cast<RegionPersistVersion>(binary_version);
if (!binary_version_decoded.has_value())
{
LOG_DEBUG(DB::Logger::get(), "Maybe downgrade from {} to {}", binary_version, current_version);
}

// Deserialize meta
RegionPtr region = std::make_shared<Region>(RegionMeta::deserialize(buf), proxy_helper);

// Try deserialize flag
if (binary_version >= 2)
{
auto flags = readBinary2<UInt32>(buf);
// The eager truncated index is only read when the lowest flag bit announces it.
if ((flags & RegionPersistFormat::HAS_EAGER_TRUNCATE_INDEX) != 0)
{
region->eager_truncated_index = readBinary2<UInt64>(buf);
}
// The remaining upper bits of the flags word hold the number of extension records that follow.
UInt32 extension_cnt = flags >> 1;
for (UInt32 i = 0; i < extension_cnt; i++)
{
auto [extension_type, length] = getPersistExtensionTypeAndLength(buf);
// Used in tests.
if (extra_handler(extension_type, buf, length))
continue;
// Throw away unknown extension data
if (extension_type >= magic_enum::enum_underlying(RegionPersistExtension::MaxKnownFlag))
{
buf.ignore(length);
continue;
}

// Reaching here means the type is below `MaxKnownFlag` but nothing handled it.
RUNTIME_CHECK_MSG(false, "Unhandled extension, type={} length={}", extension_type, length);
}
}

// deserialize data
RegionData::deserialize(buf, region->data);

// restore other var according to meta
region->last_restart_log_applied = region->appliedIndex();
region->setLastCompactLogApplied(region->appliedIndex());
return region;
}

} // namespace DB
66 changes: 66 additions & 0 deletions dbms/src/Storages/KVStore/MultiRaft/RegionSerde.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

namespace DB
{
// Raw integer form of an extension type tag.
// A value of this type is not guaranteed to be a valid `RegionPersistExtension` enumerator.
using MaybeRegionPersistExtension = UInt32;
/// Versions of the persisted region format.
/// `Region::CURRENT_VERSION` is derived from the numeric value of one of these enumerators.
enum class RegionPersistVersion
{
    V1 = 1,
    V2 = 2, // For eager gc
};

// Bit definitions for the UInt32 flags word of the persisted region format.
namespace RegionPersistFormat
{
// Lowest bit of the flags word: when set, a UInt64 eager truncated index follows the flags word.
static constexpr UInt32 HAS_EAGER_TRUNCATE_INDEX = 0x01;
// The upper bits are used to store the number of extensions. DO NOT USE!
} // namespace RegionPersistFormat

// The RegionPersistExtension has nothing to do with `version`.
// No matter upgrading or downgrading, we parse a `MaybeRegionPersistExtension` if we KNOW this field.
// We KNOW this field if it is LESS THAN `MaxKnownFlag`, so there should be NO hole before `MaxKnownFlag`.
// Once an extension is registered, what it stands for must not be changed. E.g. if Ext1 is assigned to 10,
// then in any older or newer version, we can't assign another Ext2 to 10.
enum class RegionPersistExtension : MaybeRegionPersistExtension
{
ReservedForTest = 1,
// It should always be equal to the maximum supported type + 1
MaxKnownFlag = 2,
};

/// The flexible pattern
/// The `payload 1` is of length defined by `length 1`
/// |--------- 32 bits ----------|
/// |- 31b exts -|- 1b eager gc -|
/// |--------- eager gc ---------|
/// |--------- eager gc ---------|
/// |-------- ext type 1 --------|
/// |--------- length 1 ---------|
/// |--------- payload 1 --------|
/// |--------- ......... --------|
/// |-------- ext type n --------|
/// |--------- length n ---------|
/// |--------- payload n --------|

// An extension type tag that is guaranteed (by the assertion below) not to be a registered extension.
constexpr MaybeRegionPersistExtension UNUSED_EXTENSION_NUMBER_FOR_TEST = UINT32_MAX / 2;
static_assert(!magic_enum::enum_contains<RegionPersistExtension>(UNUSED_EXTENSION_NUMBER_FOR_TEST));
// The raw tag type is pinned to UInt32.
static_assert(std::is_same_v<MaybeRegionPersistExtension, UInt32>);
// Registered extension types must stay at or below UINT32_MAX / 2
// (presumably so that `UNUSED_EXTENSION_NUMBER_FOR_TEST` is never below `MaxKnownFlag` -- confirm).
static_assert(magic_enum::enum_underlying(RegionPersistExtension::MaxKnownFlag) <= UINT32_MAX / 2);
// The number of enumerators (including `MaxKnownFlag` itself) must equal the value of `MaxKnownFlag`.
static_assert(
magic_enum::enum_count<RegionPersistExtension>()
== magic_enum::enum_underlying(RegionPersistExtension::MaxKnownFlag));
// The eager gc flag must stay in the lowest bit of the flags word.
static_assert(RegionPersistFormat::HAS_EAGER_TRUNCATE_INDEX == 0x01);

} // namespace DB
Loading

0 comments on commit 452bf83

Please sign in to comment.