diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h index 00bf99f296d0..8a585cac2ab6 100644 --- a/ydb/core/mind/hive/hive.h +++ b/ydb/core/mind/hive/hive.h @@ -256,6 +256,10 @@ struct THiveSharedSettings { TDuration GetStoragePoolFreshPeriod() const { return TDuration::MilliSeconds(CurrentConfig.GetStoragePoolFreshPeriod()); } + + double GetMinGroupUsageToBalance() const { + return CurrentConfig.GetMinGroupUsageToBalance(); + } }; struct TDrainSettings { @@ -276,7 +280,7 @@ struct TBalancerSettings { struct TStorageBalancerSettings { ui64 NumReassigns; - ui64 MaxInFlight; + ui64 MaxInFlight = 1; TString StoragePool; }; diff --git a/ydb/core/mind/hive/hive_events.h b/ydb/core/mind/hive/hive_events.h index 875bf731bec9..a644b793a7cb 100644 --- a/ydb/core/mind/hive/hive_events.h +++ b/ydb/core/mind/hive/hive_events.h @@ -29,6 +29,8 @@ struct TEvPrivate { EvLogTabletMoves, EvStartStorageBalancer, EvRestartCancelled, + EvProcessStorageBalancer, + EvStorageBalancerOut, EvEnd }; @@ -104,6 +106,10 @@ struct TEvPrivate { TEvRestartCancelled(TFullTabletId tabletId) : TabletId(tabletId) {} }; + + struct TEvProcessStorageBalancer : TEventLocal {}; + + struct TEvStorageBalancerOut : TEventLocal {}; }; } // NHive diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 997f381ae19d..76fdcdeb1c92 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -426,6 +426,9 @@ void THive::Handle(TEvBlobStorage::TEvControllerSelectGroupsResult::TPtr& ev) { Execute(CreateUpdateTabletGroups(tabletId)); } } + if (tablets.empty()) { + ProcessStorageBalancer(); + } } else { BLOG_ERROR("THive::Handle TEvControllerSelectGroupsResult: obsolete BSC response"); } @@ -2051,12 +2054,12 @@ void THive::Handle(TEvHive::TEvRequestHiveStorageStats::TPtr& ev) { auto& pbGroup = *pbPool.AddGroups(); pbGroup.SetGroupID(id); pbGroup.SetAcquiredUnits(group.Units.size()); - pbGroup.SetAcquiredIOPS(group.AcquiredIOPS); - pbGroup.SetAcquiredThroughput(group.AcquiredThroughput); - pbGroup.SetAcquiredSize(group.AcquiredSize); - pbGroup.SetMaximumIOPS(group.MaximumIOPS); - pbGroup.SetMaximumThroughput(group.MaximumThroughput); - pbGroup.SetMaximumSize(group.MaximumSize); + pbGroup.SetAcquiredIOPS(group.AcquiredResources.IOPS); + pbGroup.SetAcquiredThroughput(group.AcquiredResources.Throughput); + pbGroup.SetAcquiredSize(group.AcquiredResources.Size); + pbGroup.SetMaximumIOPS(group.MaximumResources.IOPS); + pbGroup.SetMaximumThroughput(group.MaximumResources.Throughput); + pbGroup.SetMaximumSize(group.MaximumResources.Size); pbGroup.SetAllocatedSize(group.GroupParameters.GetAllocatedSize()); pbGroup.SetAvailableSize(group.GroupParameters.GetAvailableSize()); } @@ -2171,11 +2174,18 @@ TResourceRawValues THive::GetDefaultResourceInitialMaximumValues() { void THive::ProcessTabletBalancer() { if (!ProcessTabletBalancerScheduled && !ProcessTabletBalancerPostponed && BootQueue.BootQueue.empty()) { - Schedule(GetBalancerCooldown(), new TEvPrivate::TEvProcessTabletBalancer()); + Schedule(GetBalancerCooldown(LastBalancerTrigger), new TEvPrivate::TEvProcessTabletBalancer()); ProcessTabletBalancerScheduled = true; } } +void THive::ProcessStorageBalancer() { + if (!ProcessStorageBalancerScheduled && BootQueue.BootQueue.empty()) { + Schedule(GetBalancerCooldown(EBalancerType::Storage), new TEvPrivate::TEvProcessStorageBalancer()); + ProcessStorageBalancerScheduled = true; + } +} + THive::THiveStats THive::GetStats() const { THiveStats stats = {}; stats.Values.reserve(Nodes.size()); @@ -2203,7 +2213,6 @@ THive::THiveStats THive::GetStats() const { maxValues = piecewise_max(maxValues, stats.Values[i].ResourceNormValues); } - auto minValuesToBalance = GetMinNodeUsageToBalance(); maxValues = piecewise_max(maxValues, minValuesToBalance); minValues = piecewise_max(minValues, minValuesToBalance); @@ -2359,6 +2368,38 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) { Send(SelfId(), new TEvPrivate::TEvBalancerOut()); } +void THive::Handle(TEvPrivate::TEvProcessStorageBalancer::TPtr&) { + ProcessStorageBalancerScheduled = false; + if (StoragePools.empty()) { + return; + } + using TPoolStat = std::pair; + std::vector poolStats; + poolStats.reserve(StoragePools.size()); + for (const auto& [name, pool] : StoragePools) { + poolStats.emplace_back(pool.GetStats(), pool); + } + auto& [stats, pool] = *std::max_element(poolStats.begin(), poolStats.end(), [](const TPoolStat& lhs, const TPoolStat& rhs) { + return lhs.first.Scatter < rhs.first.Scatter; + }); + if (stats.Scatter > GetMinStorageScatterToBalance()) { + BLOG_D("Storage Scatter = " << stats.Scatter << " in pool " << pool.Name << ", starting StorageBalancer"); + ui64 numReassigns = 1; + auto it = pool.Groups.find(stats.MaxUsageGroupId); + if (it != pool.Groups.end()) { + // We want a ballpark estimate of how many reassigns it would take to balance the pool + // Using the number of units in the most loaded group ensures we won't reassign the whole pool on a whim, + // while also giving the balancer some room to work. + // Note that the balancer is not actually required to do that many reassigns, but will never do more + numReassigns = it->second.Units.size(); + } + StartHiveStorageBalancer({ + .NumReassigns = numReassigns, + .StoragePool = pool.Name + }); + } +} + void THive::UpdateTotalResourceValues( const TNodeInfo* node, const TTabletInfo* tablet, @@ -2660,8 +2701,8 @@ void THive::UpdateTabletFollowersNumber(TLeaderTabletInfo& tablet, NIceDb::TNice } } -TDuration THive::GetBalancerCooldown() const { - switch(LastBalancerTrigger) { +TDuration THive::GetBalancerCooldown(EBalancerType balancerType) const { + switch(balancerType) { case EBalancerType::Scatter: case EBalancerType::ScatterCounter: case EBalancerType::ScatterCPU: @@ -2787,7 +2828,7 @@ void THive::RequestPoolsInformation() { } SendToBSControllerPipe(ev.Release()); } - Schedule(TDuration::Minutes(10), new TEvPrivate::TEvRefreshStorageInfo()); + Schedule(GetStorageInfoRefreshFrequency(), new TEvPrivate::TEvRefreshStorageInfo()); } ui32 THive::GetEventPriority(IEventHandle* ev) { @@ -2880,6 +2921,7 @@ void THive::ProcessEvent(std::unique_ptr event) { hFunc(TEvPrivate::TEvRefreshStorageInfo, Handle); hFunc(TEvPrivate::TEvLogTabletMoves, Handle); hFunc(TEvPrivate::TEvStartStorageBalancer, Handle); + hFunc(TEvPrivate::TEvProcessStorageBalancer, Handle); hFunc(TEvHive::TEvUpdateDomain, Handle); } } @@ -2980,6 +3022,7 @@ STFUNC(THive::StateWork) { fFunc(TEvPrivate::TEvLogTabletMoves::EventType, EnqueueIncomingEvent); fFunc(TEvPrivate::TEvStartStorageBalancer::EventType, EnqueueIncomingEvent); fFunc(TEvHive::TEvUpdateDomain::EventType, EnqueueIncomingEvent); + fFunc(TEvPrivate::TEvProcessStorageBalancer::EventType, EnqueueIncomingEvent); hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle); default: if (!HandleDefaultEvents(ev, SelfId())) { diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index 14bd3f51056f..33da8f8f31e8 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -392,6 +392,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar bool ProcessTabletBalancerPostponed = false; bool ProcessPendingOperationsScheduled = false; bool LogTabletMovesScheduled = false; + bool ProcessStorageBalancerScheduled = false; TResourceRawValues TotalRawResourceValues = {}; TResourceNormalizedValues TotalNormalizedResourceValues = {}; TInstant LastResourceChangeReaction; @@ -556,6 +557,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar void Handle(TEvPrivate::TEvRefreshStorageInfo::TPtr& ev); void Handle(TEvPrivate::TEvLogTabletMoves::TPtr& ev); void Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev); + void Handle(TEvPrivate::TEvProcessStorageBalancer::TPtr& ev); void Handle(TEvPrivate::TEvProcessIncomingEvent::TPtr& ev); void Handle(TEvHive::TEvUpdateDomain::TPtr& ev); @@ -653,6 +655,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar void PostponeProcessBootQueue(TDuration after); void ProcessPendingOperations(); void ProcessTabletBalancer(); + void ProcessStorageBalancer(); const TVector& GetTabletTypeAllowedMetricIds(TTabletTypes::EType type) const; static const TVector& GetDefaultAllowedMetricIdsForType(TTabletTypes::EType type); static bool IsValidMetrics(const NKikimrTabletBase::TMetrics& metrics); @@ -681,7 +684,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar void StopTablet(const TActorId& local, TFullTabletId tabletId); void ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffects); void UpdateTabletFollowersNumber(TLeaderTabletInfo& tablet, NIceDb::TNiceDb& db, TSideEffects& sideEffects); - TDuration GetBalancerCooldown() const; + TDuration GetBalancerCooldown(EBalancerType balancerType) const; void UpdateObjectCount(const TLeaderTabletInfo& tablet, const TNodeInfo& node, i64 diff); ui64 GetObjectImbalance(TFullObjectId object); @@ -914,6 +917,14 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar return CurrentConfig.GetMaxChannelHistorySize(); } + TDuration GetStorageInfoRefreshFrequency() const { + return TDuration::MilliSeconds(CurrentConfig.GetStorageInfoRefreshFrequency()); + } + + double GetMinStorageScatterToBalance() const { + return CurrentConfig.GetMinStorageScatterToBalance(); + } + static void ActualizeRestartStatistics(google::protobuf::RepeatedField& restartTimestamps, ui64 barrier); static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField& restartTimestamps, ui64 barrier); static bool IsSystemTablet(TTabletTypes::EType type); diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index b28ee8db4b96..e7dc6c4b8080 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -2785,6 +2785,7 @@ Y_UNIT_TEST_SUITE(THiveTest) { TTestBasicRuntime runtime(1, false); Setup(runtime, true, 2, [](TAppPrepare& app) { app.HiveConfig.SetMinPeriodBetweenReassign(0); + app.HiveConfig.SetStorageInfoRefreshFrequency(200); }); const ui64 hiveTablet = MakeDefaultHiveID(0); const ui64 testerTablet = MakeDefaultHiveID(1); @@ -2830,41 +2831,33 @@ Y_UNIT_TEST_SUITE(THiveTest) { } // If assured space is not set, usage is always set to 1 - auto groupMetricsExchange = MakeHolder(); - for (const auto& [group, tablets] : groupToTablets) { - NKikimrBlobStorage::TGroupMetrics* metrics = groupMetricsExchange->Record.AddGroupMetrics(); + auto updateDiskStatus = MakeHolder(); - metrics->SetGroupId(group); - metrics->MutableGroupParameters()->SetGroupID(group); - metrics->MutableGroupParameters()->SetStoragePoolName("def1"); - metrics->MutableGroupParameters()->MutableAssuredResources()->SetSpace(300'000'000); - } + for (ui32 groupId = 0x80000000; groupId < 0x8000000a; ++groupId) { + NKikimrBlobStorage::TVDiskMetrics* vdiskMetrics = updateDiskStatus->Record.AddVDisksMetrics(); + + vdiskMetrics->MutableVDiskId()->SetGroupID(groupId); + vdiskMetrics->MutableVDiskId()->SetGroupGeneration(1); + vdiskMetrics->MutableVDiskId()->SetRing(0); + vdiskMetrics->MutableVDiskId()->SetDomain(0); + vdiskMetrics->MutableVDiskId()->SetVDisk(0); + vdiskMetrics->SetAvailableSize(30'000'000); - runtime.SendToPipe(MakeBSControllerID(0), sender, groupMetricsExchange.Release(), 0, GetPipeConfigWithRetries()); - { - TDispatchOptions options; - options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerGroupMetricsExchange)); - runtime.DispatchEvents(options); } + runtime.SendToPipe(MakeBSControllerID(0), sender, updateDiskStatus.Release(), 0, GetPipeConfigWithRetries()); + TChannelsBindings channels = BINDED_CHANNELS; - for (auto& bind : channels) { - bind.SetSize(200'000'000); - } + channels[0].SetSize(500'000'000); for (auto tablet : {tabletA, tabletB}) { TAutoPtr updateTablet(new TEvHive::TEvCreateTablet(testerTablet, 100500 + (tablet - tabletBase), tabletType, channels)); SendCreateTestTablet(runtime, hiveTablet, testerTablet, updateTablet, 0, true); } - runtime.SendToPipe(hiveTablet, sender, new NHive::TEvPrivate::TEvStartStorageBalancer({ - .NumReassigns = 100, - .MaxInFlight = 1, - .StoragePool = "def1", - })); { TDispatchOptions options; - options.FinalEvents.emplace_back(NHive::TEvPrivate::EvRestartComplete, 4); // should actually be less than 4 - runtime.DispatchEvents(options, TDuration::Seconds(10)); + options.FinalEvents.emplace_back(NHive::TEvPrivate::EvStorageBalancerOut); + runtime.DispatchEvents(options, TDuration::Minutes(1)); } UNIT_ASSERT_VALUES_UNEQUAL(getGroup(tabletA), getGroup(tabletB)); diff --git a/ydb/core/mind/hive/leader_tablet_info.cpp b/ydb/core/mind/hive/leader_tablet_info.cpp index 43a6e260eb25..e4f151ad7cb1 100644 --- a/ydb/core/mind/hive/leader_tablet_info.cpp +++ b/ydb/core/mind/hive/leader_tablet_info.cpp @@ -262,9 +262,14 @@ const NKikimrBlobStorage::TEvControllerSelectGroupsResult::TGroupParameters* TLe break; } case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_BALANCE: { - return storagePool->FindFreeAllocationUnit([¶ms](const TStorageGroupInfo& newGroup) -> bool { + auto channel = GetChannel(channelId); + auto filter = [¶ms](const TStorageGroupInfo& newGroup) -> bool { return newGroup.IsMatchesParameters(*params); - }); + }; + auto calculateUsageWithTablet = [&channel](const TStorageGroupInfo* newGroup) -> double { + return newGroup->GetUsageForChannel(channel); + }; + return storagePool->FindFreeAllocationUnit(filter, calculateUsageWithTablet); break; } case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_SPACE: { diff --git a/ydb/core/mind/hive/leader_tablet_info.h b/ydb/core/mind/hive/leader_tablet_info.h index dba2b570d58d..6f0aa980dfeb 100644 --- a/ydb/core/mind/hive/leader_tablet_info.h +++ b/ydb/core/mind/hive/leader_tablet_info.h @@ -43,6 +43,10 @@ struct TLeaderTabletInfo : TTabletInfo { return ChannelInfo->GetSize(); } } + + bool operator==(const TChannel& other) const { + return TabletId == other.TabletId && ChannelId == other.ChannelId; + } }; TTabletId Id; diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index f7a27621cc4b..dec113c569ad 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -3933,8 +3933,8 @@ class TTxMonEvent_Storage : public TTransactionBase { using TKindMap = THashMap; void GetUnitKinds(const TStorageGroupInfo& group, TKindMap& kinds) { - for (const auto& [tablet, channel] : group.Units) { - const auto& boundChannels(tablet->BoundChannels[channel]); + for (const auto& channel : group.Units) { + const auto& boundChannels(*channel.ChannelInfo); kinds[TUnitKind{boundChannels.GetIOPS(), boundChannels.GetThroughput(), boundChannels.GetSize()}]++; } } @@ -3970,11 +3970,11 @@ class TTxMonEvent_Storage : public TTransactionBase { out << "" << kind.ToString() << ""; out << "" << units << ""; out << "" << Sprintf("%.2f", kind.IOPS * units) << ""; - out << "" << Sprintf("%.2f", prStorageGroup.second.MaximumIOPS) << ""; + out << "" << Sprintf("%.2f", prStorageGroup.second.MaximumResources.IOPS) << ""; out << "" << kind.Size * units << ""; - out << "" << prStorageGroup.second.MaximumSize << ""; + out << "" << prStorageGroup.second.MaximumResources.Size << ""; out << "" << kind.Throughput * units << ""; - out << "" << prStorageGroup.second.MaximumThroughput << ""; + out << "" << prStorageGroup.second.MaximumResources.Throughput << ""; out << "" << Sprintf("%.2f", prStorageGroup.second.StoragePool.GetOvercommit()) << ""; out << "" << Sprintf("%.2f", prStorageGroup.second.GetUsage()) << ""; out << ""; @@ -3985,12 +3985,12 @@ class TTxMonEvent_Storage : public TTransactionBase { out << "" << prStoragePool.second.Name << ""; out << "" << group.Id << ""; out << "" << group.Units.size() << ""; - out << "" << Sprintf("%.2f", group.AcquiredIOPS) << ""; - out << "" << Sprintf("%.2f", group.MaximumIOPS) << ""; - out << "" << group.AcquiredSize << ""; - out << "" << group.MaximumSize << ""; - out << "" << group.AcquiredThroughput << ""; - out << "" << group.MaximumThroughput << ""; + out << "" << Sprintf("%.2f", group.AcquiredResources.IOPS) << ""; + out << "" << Sprintf("%.2f", group.MaximumResources.IOPS) << ""; + out << "" << group.AcquiredResources.Size << ""; + out << "" << group.MaximumResources.Size << ""; + out << "" << group.AcquiredResources.Throughput << ""; + out << "" << group.MaximumResources.Throughput << ""; out << "" << group.GroupParameters.GetAllocatedSize() << ""; out << "" << group.GroupParameters.GetAvailableSize() << ""; out << "" << Sprintf("%.2f", group.StoragePool.GetOvercommit()) << ""; diff --git a/ydb/core/mind/hive/storage_balancer.cpp b/ydb/core/mind/hive/storage_balancer.cpp index a7590e420a9b..ee112566ac55 100644 --- a/ydb/core/mind/hive/storage_balancer.cpp +++ b/ydb/core/mind/hive/storage_balancer.cpp @@ -66,6 +66,7 @@ class THiveStorageBalancer : public NActors::TActorBootstrappedRemoveSubActor(this); + Send(Hive->SelfId(), new TEvPrivate::TEvStorageBalancerOut()); return IActor::PassAway(); } diff --git a/ydb/core/mind/hive/storage_group_info.cpp b/ydb/core/mind/hive/storage_group_info.cpp index 9a0eb2cd1a32..c2ad1e3a69a7 100644 --- a/ydb/core/mind/hive/storage_group_info.cpp +++ b/ydb/core/mind/hive/storage_group_info.cpp @@ -9,37 +9,33 @@ TStorageGroupInfo::TStorageGroupInfo(const TStoragePoolInfo& storagePool, TStora , Id(id) {} -bool TStorageGroupInfo::AcquireAllocationUnit(const TLeaderTabletInfo* tablet, ui32 channel) { - Y_ABORT_UNLESS(tablet->BoundChannels.size() > channel); - bool acquired = Units.insert({tablet, channel}).second; +bool TStorageGroupInfo::AcquireAllocationUnit(const TLeaderTabletInfo::TChannel& channel) { + Y_ABORT_UNLESS(channel.ChannelInfo); + bool acquired = Units.insert(channel).second; if (acquired) { - AcquiredIOPS += tablet->BoundChannels[channel].GetIOPS(); - AcquiredThroughput += tablet->BoundChannels[channel].GetThroughput(); - AcquiredSize += tablet->BoundChannels[channel].GetSize(); + AcquiredResources.Add(channel); } return acquired; } -bool TStorageGroupInfo::ReleaseAllocationUnit(const TLeaderTabletInfo* tablet, ui32 channel) { - Y_ABORT_UNLESS(tablet->BoundChannels.size() > channel); - bool released = Units.erase({tablet, channel}) != 0; +bool TStorageGroupInfo::ReleaseAllocationUnit(const TLeaderTabletInfo::TChannel& channel) { + Y_ABORT_UNLESS(channel.ChannelInfo); + bool released = Units.erase(channel) != 0; if (released) { - AcquiredIOPS -= tablet->BoundChannels[channel].GetIOPS(); - AcquiredThroughput -= tablet->BoundChannels[channel].GetThroughput(); - AcquiredSize -= tablet->BoundChannels[channel].GetSize(); + AcquiredResources.Subtract(channel); } return released; } void TStorageGroupInfo::UpdateStorageGroup(const TEvControllerSelectGroupsResult::TGroupParameters& groupParameters) { if (groupParameters.GetAssuredResources().HasIOPS()) { - MaximumIOPS = groupParameters.GetAssuredResources().GetIOPS(); + MaximumResources.IOPS = groupParameters.GetAssuredResources().GetIOPS(); } if (groupParameters.GetAssuredResources().HasReadThroughput() || groupParameters.GetAssuredResources().HasWriteThroughput()) { - MaximumThroughput = groupParameters.GetAssuredResources().GetReadThroughput() + groupParameters.GetAssuredResources().GetWriteThroughput(); + MaximumResources.Throughput = groupParameters.GetAssuredResources().GetReadThroughput() + groupParameters.GetAssuredResources().GetWriteThroughput(); } if (groupParameters.GetAssuredResources().HasSpace()) { - MaximumSize = groupParameters.GetAssuredResources().GetSpace(); + MaximumResources.Size = groupParameters.GetAssuredResources().GetSpace(); } GroupParameters.CopyFrom(groupParameters); } @@ -58,31 +54,31 @@ bool TStorageGroupInfo::IsMatchesParameters(const TGroupFilter& filter) const { if (StoragePool.GetSafeMode()) { return true; } - if (IsBalanceByIOPS() && groupParameters.HasRequiredIOPS() && groupParameters.GetRequiredIOPS() + AcquiredIOPS > GetMaximumIOPS()) { + if (IsBalanceByIOPS() && groupParameters.HasRequiredIOPS() && groupParameters.GetRequiredIOPS() + AcquiredResources.IOPS > GetMaximumIOPS()) { return false; } - if (IsBalanceByThroughput() && groupParameters.HasRequiredThroughput() && groupParameters.GetRequiredThroughput() + AcquiredThroughput > GetMaximumThroughput()) { + if (IsBalanceByThroughput() && groupParameters.HasRequiredThroughput() && groupParameters.GetRequiredThroughput() + AcquiredResources.Throughput > GetMaximumThroughput()) { return false; } - if (IsBalanceBySize() && groupParameters.HasRequiredDataSize() && groupParameters.GetRequiredDataSize() + AcquiredSize > GetMaximumSize()) { + if (IsBalanceBySize() && groupParameters.HasRequiredDataSize() && groupParameters.GetRequiredDataSize() + AcquiredResources.Size > GetMaximumSize()) { return false; } return true; } -double TStorageGroupInfo::GetUsage() const { +double TStorageGroupInfo::GetUsage(const TStorageResources& resources) const { double usage = 0; int countUsage = 0; - if (IsBalanceByIOPS() && MaximumIOPS > 0) { - usage = std::max(usage, static_cast(AcquiredIOPS) / GetMaximumIOPS()); + if (IsBalanceByIOPS() && MaximumResources.IOPS > 0) { + usage = std::max(usage, static_cast(resources.IOPS) / GetMaximumIOPS()); ++countUsage; } - if (IsBalanceByThroughput() && MaximumThroughput > 0) { - usage = std::max(usage, static_cast(AcquiredThroughput) / GetMaximumThroughput()); + if (IsBalanceByThroughput() && MaximumResources.Throughput > 0) { + usage = std::max(usage, static_cast(resources.Throughput) / GetMaximumThroughput()); ++countUsage; } - if (IsBalanceBySize() && MaximumSize > 0) { - usage = std::max(usage, static_cast(AcquiredSize) / GetMaximumSize()); + if (IsBalanceBySize() && MaximumResources.Size > 0) { + usage = std::max(usage, static_cast(resources.Size) / GetMaximumSize()); ++countUsage; } if (countUsage > 0) { @@ -92,16 +88,28 @@ double TStorageGroupInfo::GetUsage() const { } } +double TStorageGroupInfo::GetUsage() const { + return GetUsage(AcquiredResources); +} + +double TStorageGroupInfo::GetUsageForChannel(const TLeaderTabletInfo::TChannel& channel) const { + TStorageResources resources = AcquiredResources; + if (!Units.contains(channel)) { + resources.Add(channel); + } + return GetUsage(resources); +} + double TStorageGroupInfo::GetMaximumIOPS() const { - return MaximumIOPS * StoragePool.GetOvercommitIOPS(); + return MaximumResources.IOPS * StoragePool.GetOvercommitIOPS(); } ui64 TStorageGroupInfo::GetMaximumThroughput() const { - return MaximumThroughput * StoragePool.GetOvercommitThroughput(); + return MaximumResources.Throughput * StoragePool.GetOvercommitThroughput(); } ui64 TStorageGroupInfo::GetMaximumSize() const { - return MaximumSize * StoragePool.GetOvercommitSize(); + return MaximumResources.Size * StoragePool.GetOvercommitSize(); } bool TStorageGroupInfo::IsBalanceByIOPS() const { diff --git a/ydb/core/mind/hive/storage_group_info.h b/ydb/core/mind/hive/storage_group_info.h index a520c7e2b949..5a407d18295a 100644 --- a/ydb/core/mind/hive/storage_group_info.h +++ b/ydb/core/mind/hive/storage_group_info.h @@ -15,16 +15,36 @@ struct TGroupFilter { bool PhysicalGroupsOnly = false; }; +struct TStorageResources { + double IOPS = 0; + ui64 Throughput = 0; + ui64 Size = 0; + + void Add(const TLeaderTabletInfo::TChannel& channel) { + IOPS += channel.ChannelInfo->GetIOPS(); + Throughput += channel.ChannelInfo->GetIOPS(); + Size += channel.ChannelInfo->GetSize(); + } + + void Subtract(const TLeaderTabletInfo::TChannel& channel) { + IOPS -= channel.ChannelInfo->GetIOPS(); + Throughput -= channel.ChannelInfo->GetIOPS(); + Size -= channel.ChannelInfo->GetSize(); + } +}; + +struct TChannelHash { + size_t operator ()(const TLeaderTabletInfo::TChannel& channel) const { + return hash_combiner::hash_val(channel.TabletId, channel.ChannelId); + } +}; + struct TStorageGroupInfo { const TStoragePoolInfo& StoragePool; TStorageGroupId Id; - std::unordered_set> Units; // Tablet + Channel - double AcquiredIOPS = 0; - ui64 AcquiredThroughput = 0; - ui64 AcquiredSize = 0; - double MaximumIOPS = 0; - ui64 MaximumThroughput = 0; - ui64 MaximumSize = 0; + std::unordered_set Units; + TStorageResources AcquiredResources; + TStorageResources MaximumResources; NKikimrBlobStorage::TEvControllerSelectGroupsResult::TGroupParameters GroupParameters; TStorageGroupInfo(const TStoragePoolInfo& storagePool, TStorageGroupId id); @@ -32,11 +52,13 @@ struct TStorageGroupInfo { TStorageGroupInfo(TStorageGroupInfo&&) = delete; TStorageGroupInfo& operator =(const TStorageGroupInfo&) = delete; TStorageGroupInfo& operator =(TStorageGroupInfo&&) = delete; - bool AcquireAllocationUnit(const TLeaderTabletInfo* tablet, ui32 channel); - bool ReleaseAllocationUnit(const TLeaderTabletInfo* tablet, ui32 channel); + bool AcquireAllocationUnit(const TLeaderTabletInfo::TChannel& channel); + bool ReleaseAllocationUnit(const TLeaderTabletInfo::TChannel& channel); void UpdateStorageGroup(const TEvControllerSelectGroupsResult::TGroupParameters& groupParameters); bool IsMatchesParameters(const TGroupFilter& filter) const; + double GetUsage(const TStorageResources& resources) const; double GetUsage() const; + double GetUsageForChannel(const TLeaderTabletInfo::TChannel& channel) const; // usage if the channel is reassigned to this group double GetMaximumIOPS() const; ui64 GetMaximumThroughput() const; ui64 GetMaximumSize() const; diff --git a/ydb/core/mind/hive/storage_pool_info.cpp b/ydb/core/mind/hive/storage_pool_info.cpp index 0abc30908902..fe6731d01b75 100644 --- a/ydb/core/mind/hive/storage_pool_info.cpp +++ b/ydb/core/mind/hive/storage_pool_info.cpp @@ -22,11 +22,11 @@ TStorageGroupInfo& TStoragePoolInfo::GetStorageGroup(TStorageGroupId groupId) { } bool TStoragePoolInfo::AcquireAllocationUnit(const TLeaderTabletInfo* tablet, ui32 channel, TStorageGroupId groupId) { - return GetStorageGroup(groupId).AcquireAllocationUnit(tablet, channel); + return GetStorageGroup(groupId).AcquireAllocationUnit(tablet->GetChannel(channel)); } bool TStoragePoolInfo::ReleaseAllocationUnit(const TLeaderTabletInfo* tablet, ui32 channel, TStorageGroupId groupId) { - return GetStorageGroup(groupId).ReleaseAllocationUnit(tablet, channel); + return GetStorageGroup(groupId).ReleaseAllocationUnit(tablet->GetChannel(channel)); } void TStoragePoolInfo::UpdateStorageGroup(TStorageGroupId groupId, const TEvControllerSelectGroupsResult::TGroupParameters& groupParameters) { @@ -182,5 +182,27 @@ TVector TStoragePoolInfo::PullWaitingTablets() { return std::move(TabletsWaiting); } +TStoragePoolInfo::TStats TStoragePoolInfo::GetStats() const { + TStoragePoolInfo::TStats stats; + if (Groups.empty()) { + return stats; + } + using TValue = decltype(Groups)::value_type; + auto [minIt, maxIt] = std::minmax_element(Groups.begin(), Groups.end(), [](const TValue& lhs, const TValue& rhs) { + return lhs.second.GetUsage() < rhs.second.GetUsage(); + }); + stats.MinUsage = minIt->second.GetUsage(); + stats.MaxUsage = maxIt->second.GetUsage(); + stats.MinUsageGroupId = minIt->first; + stats.MaxUsageGroupId = maxIt->first; + if (stats.MaxUsage > 0) { + double minUsageToBalance = Settings->GetMinGroupUsageToBalance(); + double minUsage = std::max(stats.MinUsage, minUsageToBalance); + double maxUsage = std::max(stats.MaxUsage, minUsageToBalance); + stats.Scatter = (maxUsage - minUsage) / maxUsage; + } + return stats; +} + } // NHive } // NKikimr diff --git a/ydb/core/mind/hive/storage_pool_info.h b/ydb/core/mind/hive/storage_pool_info.h index 82aaf482671c..145c819fde6a 100644 --- a/ydb/core/mind/hive/storage_pool_info.h +++ b/ydb/core/mind/hive/storage_pool_info.h @@ -11,6 +11,14 @@ using namespace NKikimrBlobStorage; class THive; struct TStoragePoolInfo { + struct TStats { + double MinUsage; + TStorageGroupId MinUsageGroupId; + double MaxUsage; + TStorageGroupId MaxUsageGroupId; + double Scatter = 0; + }; + THiveSharedSettings* Settings; NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy GetBalanceStrategy() const { @@ -75,6 +83,7 @@ struct TStoragePoolInfo { TVector PullWaitingTablets(); template size_t SelectGroup(const TVector& groupCandidateUsages); + TStats GetStats() const; private: size_t RoundRobinPos = 0; diff --git a/ydb/core/mind/hive/storage_pool_info_ut.cpp b/ydb/core/mind/hive/storage_pool_info_ut.cpp index 7854c6078646..29ea819e33bf 100644 --- a/ydb/core/mind/hive/storage_pool_info_ut.cpp +++ b/ydb/core/mind/hive/storage_pool_info_ut.cpp @@ -94,9 +94,9 @@ Y_UNIT_TEST_SUITE(StoragePool) { ui32 groupId = found->GetGroupID(); groupUnits[groupId]++; groupChannels[channel][groupId]++; - pool.GetStorageGroup(groupId).AcquiredIOPS += unit1.GroupParameters.GetRequiredIOPS(); - pool.GetStorageGroup(groupId).AcquiredThroughput += unit1.GroupParameters.GetRequiredThroughput(); - pool.GetStorageGroup(groupId).AcquiredSize += unit1.GroupParameters.GetRequiredDataSize(); + pool.GetStorageGroup(groupId).AcquiredResources.IOPS += unit1.GroupParameters.GetRequiredIOPS(); + pool.GetStorageGroup(groupId).AcquiredResources.Throughput += unit1.GroupParameters.GetRequiredThroughput(); + pool.GetStorageGroup(groupId).AcquiredResources.Size += unit1.GroupParameters.GetRequiredDataSize(); } if (strategy != NKikimrConfig::THiveConfig::HIVE_STORAGE_SELECT_STRATEGY_ROUND_ROBIN) { @@ -114,9 +114,9 @@ Y_UNIT_TEST_SUITE(StoragePool) { ui32 groupId = found->GetGroupID(); groupUnits[groupId]++; groupChannels[channel][groupId]++; - pool.GetStorageGroup(groupId).AcquiredIOPS += unit1.GroupParameters.GetRequiredIOPS(); - pool.GetStorageGroup(groupId).AcquiredThroughput += unit1.GroupParameters.GetRequiredThroughput(); - pool.GetStorageGroup(groupId).AcquiredSize += unit1.GroupParameters.GetRequiredDataSize(); + pool.GetStorageGroup(groupId).AcquiredResources.IOPS += unit1.GroupParameters.GetRequiredIOPS(); + pool.GetStorageGroup(groupId).AcquiredResources.Throughput += unit1.GroupParameters.GetRequiredThroughput(); + pool.GetStorageGroup(groupId).AcquiredResources.Size += unit1.GroupParameters.GetRequiredDataSize(); } } @@ -183,9 +183,9 @@ Y_UNIT_TEST_SUITE(StoragePool) { UNIT_ASSERT(found); ui32 groupId = found->GetGroupID(); groupUnits[groupId]++; - pool.GetStorageGroup(groupId).AcquiredIOPS += unit1.GroupParameters.GetRequiredIOPS(); - pool.GetStorageGroup(groupId).AcquiredThroughput += unit1.GroupParameters.GetRequiredThroughput(); - pool.GetStorageGroup(groupId).AcquiredSize += unit1.GroupParameters.GetRequiredDataSize(); + pool.GetStorageGroup(groupId).AcquiredResources.IOPS += unit1.GroupParameters.GetRequiredIOPS(); + pool.GetStorageGroup(groupId).AcquiredResources.Throughput += unit1.GroupParameters.GetRequiredThroughput(); + pool.GetStorageGroup(groupId).AcquiredResources.Size += unit1.GroupParameters.GetRequiredDataSize(); } #ifndef _NDEBUG diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index 517a3d16068e..7aa763f01646 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1408,6 +1408,9 @@ message THiveConfig { optional double ObjectImbalanceToBalance = 67 [default = 0.02]; optional EHiveChannelBalanceStrategy ChannelBalanceStrategy = 68 [default = HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM]; optional uint64 MaxChannelHistorySize = 69 [default = 200]; + optional uint64 StorageInfoRefreshFrequency = 70 [default = 600000]; // send a query to BSC every x milliseconds + optional double MinStorageScatterToBalance = 71 [default = 0.5]; + optional double MinGroupUsageToBalance = 72 [default = 0.1]; } message TColumnShardConfig {