Skip to content

Commit

Permalink
Fix license resource check when reduce the LM capacity (vesoft-inc#2644)
Browse files Browse the repository at this point in the history
* Fix license resource check when reduce the LM capacity

* Rename graph to query in log

* update check period

---------

Co-authored-by: Sophie <84560950+Sophie-Xie@users.noreply.github.com>
  • Loading branch information
Aiee and Sophie-Xie authored Apr 18, 2023
1 parent d7031e5 commit 3cbf5ce
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 24 deletions.
46 changes: 25 additions & 21 deletions src/common/encryption/LicenseManagerConnector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,10 @@ void LicenseManagerConnector::threadFunc() {
auto rse = getLMId();
if (!rse.ok()) {
LOG(ERROR) << "[License Manager] failed to get license manager ID, error: " << rse.toString();
nextCheckPeriod = kRetryPeriodInSec + folly::Random::rand32(5 * 60);
// nextCheckPeriod = kRetryPeriodInSec + folly::Random::rand32(5 * 60);
// TODO(Aiee) for test only
nextCheckPeriod = 5;
DLOG(INFO) << "[License Manager] next check period: " << nextCheckPeriod;
setRetryFlag();
} else {
auto lmStatus = validateLicense();
Expand Down Expand Up @@ -175,7 +178,10 @@ void LicenseManagerConnector::threadFunc() {
dropAllHosts_ = true;
}
setRetryFlag();
nextCheckPeriod = kRetryPeriodInSec + folly::Random::rand32(5 * 60);
// nextCheckPeriod = kRetryPeriodInSec + folly::Random::rand32(5 * 60);
// TODO(Aiee) for test only
nextCheckPeriod = 10;
DLOG(INFO) << "[License Manager] next check period: " << nextCheckPeriod;
}
}
}
Expand All @@ -186,9 +192,6 @@ void LicenseManagerConnector::threadFunc() {
LOG(ERROR) << "[License Manager] Failed to validate license with license manager, retry "
"timeout, all hosts will be shut down";
}

// TODO(Aiee) for test only, should be removed before official v3.5 release
nextCheckPeriod = 10;
}

StatusOr<std::string> LicenseManagerConnector::buildValidateRequest(const std::string& lmId,
Expand Down Expand Up @@ -225,14 +228,15 @@ ErrorOr<nebula::cpp2::ErrorCode, std::string> LicenseManagerConnector::buildRawR
if (!nebula::ok(res)) {
return nebula::error(res);
}
auto graphNode = nebula::value(res).size();
auto queryNode = nebula::value(res).size();

// Get total graph cpu
auto graphCPU = 0;
// Get total query CPU
auto queryCPU = 0;
for (auto& host : nebula::value(res)) {
graphCPU += host.cpuNum_;
queryCPU += host.cpuNum_;
}
VLOG(2) << "Total graph node number: " << graphNode << ", graph CPU cores: " << graphCPU;
VLOG(2) << "[License Manager] Total query node number: " << queryNode
<< ", query CPU cores: " << queryCPU;

// Get active storage hosts
res = meta::ActiveHostsMan::getHostInfoByRole(kvstore, meta::cpp2::HostRole::STORAGE);
Expand All @@ -246,17 +250,18 @@ ErrorOr<nebula::cpp2::ErrorCode, std::string> LicenseManagerConnector::buildRawR
for (auto& host : nebula::value(res)) {
storageCPU += host.cpuNum_;
}
VLOG(2) << "Total storage node number: " << storageNode << ", storage CPU cores: " << storageCPU;
VLOG(2) << "[License Manager] Total storage node number: " << storageNode
<< ", storage CPU cores: " << storageCPU;

folly::dynamic request = folly::dynamic::object();
auto timestamp = time::WallClock::fastNowInSec();

request["timestamp"] = timestamp;

folly::dynamic apply = folly::dynamic::object();
apply["graphCPU"] = graphCPU;
apply["queryCPU"] = queryCPU;
apply["storageCPU"] = storageCPU;
apply["graphNode"] = graphNode;
apply["queryNode"] = queryNode;
apply["storageNode"] = storageNode;
request["apply"] = apply;
DLOG(INFO) << "Request body in string: \n" << folly::toJson(request);
Expand Down Expand Up @@ -496,10 +501,9 @@ LMStatus LicenseManagerConnector::checkRawResponse(const std::string& rawResp) {
LOG(INFO) << "[License Manager] License validation request at " << localTimeStamp;

if (std::abs(timestamp - localTimeStamp) > 21600) {
LOG(ERROR) << "[License Manager] Invalid response from license manager, the timestamps from "
"the License manager "
"is is "
<< timestamp << ", NebulaGraph timestamp is " << localTimeStamp;
LOG(ERROR) << "[License Manager] Expired response from license manager, the timestamps from "
"the License manager is "
<< timestamp << ", query timestamp is " << localTimeStamp;
return LMStatus::ErrRequestExpired;
}

Expand All @@ -510,8 +514,8 @@ LMStatus LicenseManagerConnector::checkRawResponse(const std::string& rawResp) {
resourceUsage_.storageQuota = rawInfo["quota"]["storage"].asInt();
}

VLOG(2) << "Quota resource type: " << resourceUsage_.type << "\n"
<< "max graph quota: " << resourceUsage_.graphQuota << "\n"
VLOG(2) << "[License Manager] Quota resource type: " << resourceUsage_.type << "\n"
<< "max query quota: " << resourceUsage_.graphQuota << "\n"
<< "max storage quota: " << resourceUsage_.storageQuota;
auto overflowFlag = rawInfo["overflow"].asBool();
auto lmStatus = handleResponseStatus(status, overflowFlag);
Expand Down Expand Up @@ -539,12 +543,12 @@ LMStatus LicenseManagerConnector::handleResponseStatus(const std::string& messag
LOG(WARNING) << "[License Manager] Resource usage has exceeded the license limit";
return LMStatus::Overflow;
}
LOG(ERROR) << "[License Manager] License has expired, all graph and storage services will be "
LOG(ERROR) << "[License Manager] License has expired, all query and storage services will be "
"terminated soon, "
"please contact your administrator to renew the license";
return LMStatus::Expired;
} else if (message == "Terminated") {
LOG(ERROR) << "[License Manager] The license has expired, all graph and storage services are "
LOG(ERROR) << "[License Manager] The license has expired, all query and storage services are "
"terminated, "
"please contact your administrator to renew the license";
return LMStatus::Terminated;
Expand Down
4 changes: 2 additions & 2 deletions src/common/encryption/LicenseManagerConnector.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ class LicenseManagerConnector final {
// {
// timestamp: 111111111111,
// apply: {
// graphCPU: 100,
// queryCPU: 100,
// storageCPU: 100,
// graphNode: 10,
// queryNode: 10,
// storageNode: 10
// }
// }
Expand Down
2 changes: 2 additions & 0 deletions src/meta/ActiveHostsMan.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,8 @@ class ActiveHostsMan final {

/**
* @brief Get all alive host info by given host role
* This is used in the enterprise version to fetch the current cluster status, and
* the ttl is set to 1 heartbeat interval.
*
* @param kv From where to get
* @param hostRole
Expand Down
34 changes: 33 additions & 1 deletion src/meta/processors/admin/HBProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ nebula::cpp2::ErrorCode HBProcessor::checkNodeNumber(const cpp2::HostRole role,
}

if (!pass) {
// Remove the host from active hosts
auto removeHostRet = removeHost(host);
if (removeHostRet != nebula::cpp2::ErrorCode::SUCCEEDED) {
LOG(ERROR) << fmt::format("Remove host {} failed, error code: {}",
host.toString(),
apache::thrift::util::enumNameSafe(removeHostRet));
}

LOG(ERROR) << fmt::format(
"The number of {} node has reached the maximum, the max number of {} node in the cluster "
"is {}, heartbeat from {} is rejected",
Expand Down Expand Up @@ -237,6 +245,14 @@ nebula::cpp2::ErrorCode HBProcessor::checkNodeCpu(const cpp2::HostRole role,
}

if (!pass) {
// Remove the host from active hosts
auto removeHostRet = removeHost(host);
if (removeHostRet != nebula::cpp2::ErrorCode::SUCCEEDED) {
LOG(ERROR) << fmt::format("Remove host {} failed, error code: {}",
host.toString(),
apache::thrift::util::enumNameSafe(removeHostRet));
}

LOG(ERROR) << fmt::format(
"[License Manager] The number of {} node CPU cores has reached the maximum, CPU core "
"maximum: {}, heartbeat from {} is rejected",
Expand Down Expand Up @@ -294,7 +310,7 @@ nebula::cpp2::ErrorCode HBProcessor::checkResourceUsage(const cpp2::HostRole rol

auto resourceType = LMCIns->resourceUsage_.type;
DLOG(INFO) << "[License Manager] Resource type: " << resourceType
<< ", graph resource cap: " << LMCIns->resourceUsage_.graphQuota
<< ", query resource cap: " << LMCIns->resourceUsage_.graphQuota
<< ", storage resource cap: " << LMCIns->resourceUsage_.storageQuota;

if (resourceType == "CPU") {
Expand All @@ -309,6 +325,22 @@ nebula::cpp2::ErrorCode HBProcessor::checkResourceUsage(const cpp2::HostRole rol
return nebula::cpp2::ErrorCode::SUCCEEDED;
}

// Enterprise only.
// Removes the host key of `host` (built with MetaKeyUtils::hostKey) from the
// default space/part of the kvstore and blocks until the removal callback has run.
//
// @param host  Address whose host key is removed.
// @return      The error code delivered to the asyncRemove callback.
nebula::cpp2::ErrorCode HBProcessor::removeHost(const HostAddr& host) {
  // Assigned inside the callback before `done` is posted; read only after wait().
  nebula::cpp2::ErrorCode removeCode;
  folly::Baton<true, std::atomic> done;

  auto key = MetaKeyUtils::hostKey(host.host, host.port);
  kvstore_->asyncRemove(
      kDefaultSpaceId,
      kDefaultPartId,
      key,
      [this, &done, &removeCode](nebula::cpp2::ErrorCode rc) {
        // Hand the outcome to handleErrorCode first, then publish it to the waiter.
        // NOTE(review): handleErrorCode is defined outside this view — presumably it
        // records the code on the response; confirm.
        this->handleErrorCode(rc);
        removeCode = rc;
        done.post();
      });

  // The removal is asynchronous; wait here so the result can be returned directly.
  done.wait();
  return removeCode;
}

void HBProcessor::setLeaderInfo() {
auto leaderRet = kvstore_->partLeader(kDefaultSpaceId, kDefaultPartId);
if (ok(leaderRet)) {
Expand Down
4 changes: 4 additions & 0 deletions src/meta/processors/admin/HBProcessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ class HBProcessor : public BaseProcessor<cpp2::HBResp> {

void setLeaderInfo();

// enterprise only
// Delete the host key from meta data to remove the host from active hosts
nebula::cpp2::ErrorCode removeHost(const HostAddr& host);

ClusterID clusterId_{0};
const HBCounters* counters_{nullptr};
static std::atomic<int64_t> metaVersion_;
Expand Down

0 comments on commit 3cbf5ce

Please sign in to comment.