From 74a4de3f67734ca3e0e44588fc2352a00e6cfaf2 Mon Sep 17 00:00:00 2001 From: kungasc Date: Thu, 17 Apr 2025 12:47:19 +0300 Subject: [PATCH 1/5] Call FillLocalKMeans for prefixed index explicitly --- ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp | 92 +++++++++++++++++++ .../schemeshard_build_index__progress.cpp | 52 +++++++++-- .../tx/schemeshard/schemeshard_info_types.h | 7 +- 3 files changed, 139 insertions(+), 12 deletions(-) diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp index 0cf0badfe434..3f278cb061a1 100644 --- a/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp +++ b/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp @@ -3103,6 +3103,98 @@ Y_UNIT_TEST_SUITE(KqpIndexes) { DoPositiveQueriesPrefixedVectorIndexOrderByCosine(session); } + Y_UNIT_TEST(PrefixedVectorIndexOrderByCosineDistanceNotNullableLevel3) { + NKikimrConfig::TFeatureFlags featureFlags; + featureFlags.SetEnableVectorIndex(true); + auto setting = NKikimrKqp::TKqpSetting(); + auto serverSettings = TKikimrSettings() + .SetFeatureFlags(featureFlags) + .SetKqpSettings({setting}); + + TKikimrRunner kikimr(serverSettings); + kikimr.GetTestServer().GetRuntime()->SetLogPriority(NKikimrServices::BUILD_INDEX, NActors::NLog::PRI_TRACE); + kikimr.GetTestServer().GetRuntime()->SetLogPriority(NKikimrServices::FLAT_TX_SCHEMESHARD, NActors::NLog::PRI_TRACE); + + auto db = kikimr.GetTableClient(); + auto session = DoCreateTableForPrefixedVectorIndex(db, false); + { + const TString createIndex(Q_(R"( + ALTER TABLE `/Root/TestTable` + ADD INDEX index + GLOBAL USING vector_kmeans_tree + ON (user, emb) + WITH (distance=cosine, vector_type="uint8", vector_dimension=2, levels=3, clusters=2); + )")); + + auto result = session.ExecuteSchemeQuery(createIndex) + .ExtractValueSync(); + + UNIT_ASSERT_C(result.IsSuccess(), result.GetIssues().ToString()); + } + { + auto result = session.DescribeTable("/Root/TestTable").ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), NYdb::EStatus::SUCCESS); + const auto& indexes = result.GetTableDescription().GetIndexDescriptions(); + UNIT_ASSERT_EQUAL(indexes.size(), 1); + UNIT_ASSERT_EQUAL(indexes[0].GetIndexName(), "index"); + std::vector indexKeyColumns{"user", "emb"}; + UNIT_ASSERT_EQUAL(indexes[0].GetIndexColumns(), indexKeyColumns); + const auto& settings = std::get(indexes[0].GetIndexSettings()); + UNIT_ASSERT_EQUAL(settings.Settings.Metric, NYdb::NTable::TVectorIndexSettings::EMetric::CosineDistance); + UNIT_ASSERT_EQUAL(settings.Settings.VectorType, NYdb::NTable::TVectorIndexSettings::EVectorType::Uint8); + UNIT_ASSERT_EQUAL(settings.Settings.VectorDimension, 2); + UNIT_ASSERT_EQUAL(settings.Levels, 3); + UNIT_ASSERT_EQUAL(settings.Clusters, 2); + } + DoPositiveQueriesPrefixedVectorIndexOrderByCosine(session); + } + + Y_UNIT_TEST(PrefixedVectorIndexOrderByCosineDistanceNotNullableLevel4) { + NKikimrConfig::TFeatureFlags featureFlags; + featureFlags.SetEnableVectorIndex(true); + auto setting = NKikimrKqp::TKqpSetting(); + auto serverSettings = TKikimrSettings() + .SetFeatureFlags(featureFlags) + .SetKqpSettings({setting}); + + TKikimrRunner kikimr(serverSettings); + kikimr.GetTestServer().GetRuntime()->SetLogPriority(NKikimrServices::BUILD_INDEX, NActors::NLog::PRI_TRACE); + kikimr.GetTestServer().GetRuntime()->SetLogPriority(NKikimrServices::FLAT_TX_SCHEMESHARD, NActors::NLog::PRI_TRACE); + + auto db = kikimr.GetTableClient(); + auto session = DoCreateTableForPrefixedVectorIndex(db, false); + { + const TString createIndex(Q_(R"( + ALTER TABLE `/Root/TestTable` + ADD INDEX index + GLOBAL USING vector_kmeans_tree + ON (user, emb) + WITH (distance=cosine, vector_type="uint8", vector_dimension=2, levels=4, clusters=2); + )")); + + auto result = session.ExecuteSchemeQuery(createIndex) + .ExtractValueSync(); + + UNIT_ASSERT_C(result.IsSuccess(), result.GetIssues().ToString()); + } + { + auto result = session.DescribeTable("/Root/TestTable").ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), NYdb::EStatus::SUCCESS); + const auto& indexes = result.GetTableDescription().GetIndexDescriptions(); + UNIT_ASSERT_EQUAL(indexes.size(), 1); + UNIT_ASSERT_EQUAL(indexes[0].GetIndexName(), "index"); + std::vector indexKeyColumns{"user", "emb"}; + UNIT_ASSERT_EQUAL(indexes[0].GetIndexColumns(), indexKeyColumns); + const auto& settings = std::get(indexes[0].GetIndexSettings()); + UNIT_ASSERT_EQUAL(settings.Settings.Metric, NYdb::NTable::TVectorIndexSettings::EMetric::CosineDistance); + UNIT_ASSERT_EQUAL(settings.Settings.VectorType, NYdb::NTable::TVectorIndexSettings::EVectorType::Uint8); + UNIT_ASSERT_EQUAL(settings.Settings.VectorDimension, 2); + UNIT_ASSERT_EQUAL(settings.Levels, 4); + UNIT_ASSERT_EQUAL(settings.Clusters, 2); + } + DoPositiveQueriesPrefixedVectorIndexOrderByCosine(session); + } + Y_UNIT_TEST(PrefixedVectorIndexOrderByCosineSimilarityNotNullableLevel2) { NKikimrConfig::TFeatureFlags featureFlags; featureFlags.SetEnableVectorIndex(true); diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp index d3e13a8d4f19..388e843fa1ad 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp @@ -823,6 +823,14 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil buildInfo.DoneShards.size() == buildInfo.Shards.size(); } + bool FillLocalKMeans(TIndexBuildInfo& buildInfo) { + if (buildInfo.DoneShards.empty() && buildInfo.ToUploadShards.empty() && buildInfo.InProgressShards.empty()) { + AddAllShards(buildInfo); + } + return SendToShards(buildInfo, [&](TShardIdx shardIdx) { SendKMeansLocalRequest(shardIdx, buildInfo); }) && + buildInfo.DoneShards.size() == buildInfo.Shards.size(); + } + bool InitSingleKMeans(TIndexBuildInfo& buildInfo) { if (!buildInfo.DoneShards.empty() || !buildInfo.InProgressShards.empty() || !buildInfo.ToUploadShards.empty()) { return false; @@ -934,42 +942,69 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil ); } + // TODO: extract FillPrefixedVectorIndex bool FillVectorIndex(TTransactionContext& txc, TIndexBuildInfo& buildInfo) { if (buildInfo.IsBuildPrefixedVectorIndex() && buildInfo.KMeans.Level == 1) { - LOG_D("FillIndex::Prefixed::Level1::Start " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillIndex::Prefixed::Level::Start " << buildInfo.KMeansTreeToDebugStr()); if (!FillTable(buildInfo)) { return false; } - const ui64 doneShards = buildInfo.DoneShards.size(); + LOG_D("FillIndex::Prefixed::Level::Done " << buildInfo.KMeansTreeToDebugStr()); + const ui64 doneShards = buildInfo.DoneShards.size(); ClearDoneShards(txc, buildInfo); // it's approximate but upper bound, so it's ok buildInfo.KMeans.TableSize = std::max(1, buildInfo.Processed.GetUploadRows()); buildInfo.KMeans.PrefixIndexDone(doneShards); + LOG_D("FillIndex::Prefixed::NextLevel " << buildInfo.KMeansTreeToDebugStr()); PersistKMeansState(txc, buildInfo); NIceDb::TNiceDb db{txc.DB}; Self->PersistBuildIndexUploadReset(db, buildInfo); - LOG_D("FillIndex::Prefixed::Level1::Done " << buildInfo.KMeansTreeToDebugStr()); ChangeState(BuildId, TIndexBuildInfo::EState::CreateBuild); Progress(BuildId); return false; } if (buildInfo.IsBuildPrefixedVectorIndex() && buildInfo.KMeans.Level == 2) { - LOG_D("FillIndex::Prefixed::Level2::Start " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillIndex::Prefixed::Level::Start " << buildInfo.KMeansTreeToDebugStr()); if (!FillPrefixKMeans(buildInfo)) { return false; } + LOG_D("FillIndex::Prefixed::Level::Done " << buildInfo.KMeansTreeToDebugStr()); + + ClearDoneShards(txc, buildInfo); + Y_ASSERT(buildInfo.KMeans.State == TIndexBuildInfo::TKMeans::MultiLocal); + const bool needsAnotherLevel = buildInfo.KMeans.NextLevel(); + buildInfo.KMeans.State = TIndexBuildInfo::TKMeans::MultiLocal; + buildInfo.KMeans.Parent = buildInfo.KMeans.ParentEnd(); + LOG_D("FillIndex::Prefixed::NextLevel " << buildInfo.KMeansTreeToDebugStr()); + PersistKMeansState(txc, buildInfo); + NIceDb::TNiceDb db{txc.DB}; + Self->PersistBuildIndexUploadReset(db, buildInfo); + if (!needsAnotherLevel) { + return true; + } + ChangeState(BuildId, TIndexBuildInfo::EState::DropBuild); + Progress(BuildId); + return false; + } + + if (buildInfo.IsBuildPrefixedVectorIndex() && buildInfo.KMeans.Level >= 3) { + LOG_D("FillIndex::Prefixed::Level::Start " << buildInfo.KMeansTreeToDebugStr()); + if (!FillLocalKMeans(buildInfo)) { + return false; + } + LOG_D("FillIndex::Prefixed::Level::Done " << buildInfo.KMeansTreeToDebugStr()); ClearDoneShards(txc, buildInfo); Y_ASSERT(buildInfo.KMeans.State == TIndexBuildInfo::TKMeans::MultiLocal); const bool needsAnotherLevel = buildInfo.KMeans.NextLevel(); buildInfo.KMeans.State = TIndexBuildInfo::TKMeans::MultiLocal; buildInfo.KMeans.Parent = buildInfo.KMeans.ParentEnd(); + LOG_D("FillIndex::Prefixed::NextLevel " << buildInfo.KMeansTreeToDebugStr()); PersistKMeansState(txc, buildInfo); NIceDb::TNiceDb db{txc.DB}; Self->PersistBuildIndexUploadReset(db, buildInfo); - LOG_D("FillIndex::Prefixed::Level2::Done " << buildInfo.KMeansTreeToDebugStr()); if (!needsAnotherLevel) { return true; } @@ -1317,15 +1352,16 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil auto tableColumns = NTableIndex::ExtractInfo(table); // skip dropped columns TSerializedTableRange shardRange = InfiniteRange(tableColumns.Keys.size()); static constexpr std::string_view LogPrefix = ""; - LOG_D("infinite range " << buildInfo.KMeans.RangeToDebugStr(shardRange, buildInfo.IsBuildPrefixedVectorIndex() ? 2 : 1)); buildInfo.Cluster2Shards.clear(); for (const auto& x: table->GetPartitions()) { Y_ABORT_UNLESS(Self->ShardInfos.contains(x.ShardIdx)); TSerializedCellVec bound{x.EndOfRange}; shardRange.To = bound; - LOG_D("shard " << x.ShardIdx << " range " << buildInfo.KMeans.RangeToDebugStr(shardRange, buildInfo.IsBuildPrefixedVectorIndex() ? 2 : 1)); - buildInfo.AddParent(shardRange, x.ShardIdx); + if (buildInfo.BuildKind == TIndexBuildInfo::EBuildKind::BuildVectorIndex) { + LOG_D("shard " << x.ShardIdx << " range " << buildInfo.KMeans.RangeToDebugStr(shardRange)); + buildInfo.AddParent(shardRange, x.ShardIdx); + } auto [it, emplaced] = buildInfo.Shards.emplace(x.ShardIdx, TIndexBuildInfo::TShardStatus{std::move(shardRange), "", buildInfo.Shards.size()}); Y_ASSERT(emplaced); shardRange.From = std::move(bound); diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h index 08a66fd56f85..49cf00d1b35f 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.h +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h @@ -3280,7 +3280,7 @@ struct TIndexBuildInfo: public TSimpleRefCount { return {parentFrom, parentTo}; } - TString RangeToDebugStr(const TSerializedTableRange& range, ui32 rootLevel) const { + TString RangeToDebugStr(const TSerializedTableRange& range) const { auto toStr = [&](const TSerializedCellVec& v) -> TString { const auto cells = v.GetCells(); if (cells.empty()) { @@ -3290,7 +3290,7 @@ struct TIndexBuildInfo: public TSimpleRefCount { return "-inf"; } auto str = TStringBuilder{} << "{ count: " << cells.size(); - if (Level > rootLevel) { + if (Level > 1) { str << ", parent: " << cells[0].AsValue(); if (cells.size() != 1 && cells[1].IsNull()) { str << ", pk: null"; @@ -3679,8 +3679,7 @@ struct TIndexBuildInfo: public TSimpleRefCount { TSerializedTableRange bound{range}; LOG_DEBUG_S(TlsActivationContext->AsActorContext(), NKikimrServices::BUILD_INDEX, - "AddShardStatus id# " << Id << " shard " << shardIdx << - " range " << KMeans.RangeToDebugStr(bound, IsBuildPrefixedVectorIndex() ? 2 : 1)); + "AddShardStatus id# " << Id << " shard " << shardIdx); AddParent(bound, shardIdx); Shards.emplace( shardIdx, TIndexBuildInfo::TShardStatus(std::move(bound), std::move(lastKeyAck), Shards.size())); From 10cc24aa041889135f1893825f8e297c71f198ae Mon Sep 17 00:00:00 2001 From: kungasc Date: Thu, 17 Apr 2025 16:04:42 +0300 Subject: [PATCH 2/5] fix AddShardStatus --- ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp | 1 - ydb/core/tx/schemeshard/schemeshard_info_types.h | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp index 388e843fa1ad..51924f07ee91 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp @@ -1000,7 +1000,6 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil Y_ASSERT(buildInfo.KMeans.State == TIndexBuildInfo::TKMeans::MultiLocal); const bool needsAnotherLevel = buildInfo.KMeans.NextLevel(); buildInfo.KMeans.State = TIndexBuildInfo::TKMeans::MultiLocal; - buildInfo.KMeans.Parent = buildInfo.KMeans.ParentEnd(); LOG_D("FillIndex::Prefixed::NextLevel " << buildInfo.KMeansTreeToDebugStr()); PersistKMeansState(txc, buildInfo); NIceDb::TNiceDb db{txc.DB}; diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h index 49cf00d1b35f..5b45c63307ec 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.h +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h @@ -3680,7 +3680,9 @@ struct TIndexBuildInfo: public TSimpleRefCount { TSerializedTableRange bound{range}; LOG_DEBUG_S(TlsActivationContext->AsActorContext(), NKikimrServices::BUILD_INDEX, "AddShardStatus id# " << Id << " shard " << shardIdx); - AddParent(bound, shardIdx); + if (BuildKind == TIndexBuildInfo::EBuildKind::BuildVectorIndex) { + AddParent(bound, shardIdx); + } Shards.emplace( shardIdx, TIndexBuildInfo::TShardStatus(std::move(bound), std::move(lastKeyAck), Shards.size())); TIndexBuildInfo::TShardStatus &shardStatus = Shards.at(shardIdx); From 7994fa359bbfd63aa4c70c64716df88883b9c8f9 Mon Sep 17 00:00:00 2001 From: kungasc Date: Thu, 17 Apr 2025 16:26:35 +0300 Subject: [PATCH 3/5] extract FillPrefixedVectorIndex --- .../schemeshard_build_index__progress.cpp | 70 ++++++++----------- 1 file changed, 29 insertions(+), 41 deletions(-) diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp index 51924f07ee91..b0f8664b7ada 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp @@ -807,7 +807,7 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil } } - bool FillTable(TIndexBuildInfo& buildInfo) { + bool FillSecondaryIndex(TIndexBuildInfo& buildInfo) { if (buildInfo.DoneShards.empty() && buildInfo.ToUploadShards.empty() && buildInfo.InProgressShards.empty()) { AddAllShards(buildInfo); } @@ -942,14 +942,15 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil ); } - // TODO: extract FillPrefixedVectorIndex - bool FillVectorIndex(TTransactionContext& txc, TIndexBuildInfo& buildInfo) { - if (buildInfo.IsBuildPrefixedVectorIndex() && buildInfo.KMeans.Level == 1) { - LOG_D("FillIndex::Prefixed::Level::Start " << buildInfo.KMeansTreeToDebugStr()); - if (!FillTable(buildInfo)) { + bool FillPrefixedVectorIndex(TTransactionContext& txc, TIndexBuildInfo& buildInfo) { + Y_ASSERT(buildInfo.IsBuildPrefixedVectorIndex()); + LOG_D("FillPrefixedIndex::Level::Start " << buildInfo.KMeansTreeToDebugStr()); + + if (buildInfo.KMeans.Level == 1) { + if (!FillSecondaryIndex(buildInfo)) { return false; } - LOG_D("FillIndex::Prefixed::Level::Done " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillPrefixedIndex::Level::Done " << buildInfo.KMeansTreeToDebugStr()); const ui64 doneShards = buildInfo.DoneShards.size(); ClearDoneShards(txc, buildInfo); @@ -957,50 +958,29 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil buildInfo.KMeans.TableSize = std::max(1, buildInfo.Processed.GetUploadRows()); buildInfo.KMeans.PrefixIndexDone(doneShards); LOG_D("FillIndex::Prefixed::NextLevel " << buildInfo.KMeansTreeToDebugStr()); + PersistKMeansState(txc, buildInfo); NIceDb::TNiceDb db{txc.DB}; Self->PersistBuildIndexUploadReset(db, buildInfo); ChangeState(BuildId, TIndexBuildInfo::EState::CreateBuild); Progress(BuildId); return false; - } - - if (buildInfo.IsBuildPrefixedVectorIndex() && buildInfo.KMeans.Level == 2) { - LOG_D("FillIndex::Prefixed::Level::Start " << buildInfo.KMeansTreeToDebugStr()); - if (!FillPrefixKMeans(buildInfo)) { + } else { + bool filled = buildInfo.KMeans.Level == 2 + ? FillPrefixKMeans(buildInfo) + : FillLocalKMeans(buildInfo); + if (!filled) { return false; } - LOG_D("FillIndex::Prefixed::Level::Done " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillPrefixedIndex::Level::Done " << buildInfo.KMeansTreeToDebugStr()); ClearDoneShards(txc, buildInfo); Y_ASSERT(buildInfo.KMeans.State == TIndexBuildInfo::TKMeans::MultiLocal); const bool needsAnotherLevel = buildInfo.KMeans.NextLevel(); buildInfo.KMeans.State = TIndexBuildInfo::TKMeans::MultiLocal; buildInfo.KMeans.Parent = buildInfo.KMeans.ParentEnd(); - LOG_D("FillIndex::Prefixed::NextLevel " << buildInfo.KMeansTreeToDebugStr()); - PersistKMeansState(txc, buildInfo); - NIceDb::TNiceDb db{txc.DB}; - Self->PersistBuildIndexUploadReset(db, buildInfo); - if (!needsAnotherLevel) { - return true; - } - ChangeState(BuildId, TIndexBuildInfo::EState::DropBuild); - Progress(BuildId); - return false; - } - - if (buildInfo.IsBuildPrefixedVectorIndex() && buildInfo.KMeans.Level >= 3) { - LOG_D("FillIndex::Prefixed::Level::Start " << buildInfo.KMeansTreeToDebugStr()); - if (!FillLocalKMeans(buildInfo)) { - return false; - } - LOG_D("FillIndex::Prefixed::Level::Done " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillPrefixedIndex::NextLevel " << buildInfo.KMeansTreeToDebugStr()); - ClearDoneShards(txc, buildInfo); - Y_ASSERT(buildInfo.KMeans.State == TIndexBuildInfo::TKMeans::MultiLocal); - const bool needsAnotherLevel = buildInfo.KMeans.NextLevel(); - buildInfo.KMeans.State = TIndexBuildInfo::TKMeans::MultiLocal; - LOG_D("FillIndex::Prefixed::NextLevel " << buildInfo.KMeansTreeToDebugStr()); PersistKMeansState(txc, buildInfo); NIceDb::TNiceDb db{txc.DB}; Self->PersistBuildIndexUploadReset(db, buildInfo); @@ -1011,7 +991,9 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil Progress(BuildId); return false; } + } + bool FillVectorIndex(TTransactionContext& txc, TIndexBuildInfo& buildInfo) { if (buildInfo.Sample.State == TIndexBuildInfo::TSample::EState::Upload) { return false; } @@ -1095,11 +1077,17 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil NIceDb::TNiceDb db(txc.DB); InitiateShards(db, buildInfo); } - if (buildInfo.IsBuildVectorIndex()) { - return FillVectorIndex(txc, buildInfo); - } else { - Y_ASSERT(buildInfo.IsBuildSecondaryIndex() || buildInfo.IsBuildColumns()); - return FillTable(buildInfo); + switch (buildInfo.BuildKind) { + case TIndexBuildInfo::EBuildKind::BuildSecondaryIndex: + case TIndexBuildInfo::EBuildKind::BuildColumns: + return FillSecondaryIndex(buildInfo); + case TIndexBuildInfo::EBuildKind::BuildVectorIndex: + return FillVectorIndex(txc, buildInfo); + case TIndexBuildInfo::EBuildKind::BuildPrefixedVectorIndex: + return FillPrefixedVectorIndex(txc, buildInfo); + default: + Y_ASSERT(false); + return true; } } From a84842147257c2ae920858caa38141bbcf2eab14 Mon Sep 17 00:00:00 2001 From: kungasc Date: Thu, 17 Apr 2025 17:01:22 +0300 Subject: [PATCH 4/5] better logs --- .../schemeshard_build_index__progress.cpp | 51 +++++++++++-------- .../tx/schemeshard/schemeshard_info_types.h | 30 ++++++----- 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp index b0f8664b7ada..e2291e80b52f 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp @@ -370,7 +370,7 @@ THolder CreateBuildPropose( static constexpr std::string_view LogPrefix = "Create build table boundaries for "; LOG_D(buildInfo.Id << " table " << suffix << ", count: " << count << ", parts: " << parts << ", step: " << step - << ", kmeans: " << buildInfo.KMeansTreeToDebugStr()); + << ", " << buildInfo.DebugString()); if (parts > 1) { const auto from = buildInfo.KMeans.ChildBegin; for (auto i = from + step, e = from + count; i < e; i += step) { @@ -699,7 +699,7 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil ToTabletSend.emplace_back(shardId, ui64(BuildId), std::move(ev)); } - void SendBuildIndexRequest(TShardIdx shardIdx, TIndexBuildInfo& buildInfo) { + void SendBuildSecondaryIndexRequest(TShardIdx shardIdx, TIndexBuildInfo& buildInfo) { auto ev = MakeHolder(); ev->Record.SetBuildIndexId(ui64(BuildId)); @@ -808,11 +808,19 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil } bool FillSecondaryIndex(TIndexBuildInfo& buildInfo) { + LOG_D("FillSecondaryIndex Start"); + if (buildInfo.DoneShards.empty() && buildInfo.ToUploadShards.empty() && buildInfo.InProgressShards.empty()) { AddAllShards(buildInfo); } - return SendToShards(buildInfo, [&](TShardIdx shardIdx) { SendBuildIndexRequest(shardIdx, buildInfo); }) && + auto done = SendToShards(buildInfo, [&](TShardIdx shardIdx) { SendBuildSecondaryIndexRequest(shardIdx, buildInfo); }) && buildInfo.DoneShards.size() == buildInfo.Shards.size(); + + if (done) { + LOG_D("FillSecondaryIndex Done"); + } + + return done; } bool FillPrefixKMeans(TIndexBuildInfo& buildInfo) { @@ -943,21 +951,20 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil } bool FillPrefixedVectorIndex(TTransactionContext& txc, TIndexBuildInfo& buildInfo) { - Y_ASSERT(buildInfo.IsBuildPrefixedVectorIndex()); - LOG_D("FillPrefixedIndex::Level::Start " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillPrefixedVectorIndex Start " << buildInfo.DebugString()); if (buildInfo.KMeans.Level == 1) { if (!FillSecondaryIndex(buildInfo)) { return false; } - LOG_D("FillPrefixedIndex::Level::Done " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillPrefixedVectorIndex DoneLevel " << buildInfo.DebugString()); const ui64 doneShards = buildInfo.DoneShards.size(); ClearDoneShards(txc, buildInfo); // it's approximate but upper bound, so it's ok buildInfo.KMeans.TableSize = std::max(1, buildInfo.Processed.GetUploadRows()); buildInfo.KMeans.PrefixIndexDone(doneShards); - LOG_D("FillIndex::Prefixed::NextLevel " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillPrefixedVectorIndex PrefixIndexDone " << buildInfo.DebugString()); PersistKMeansState(txc, buildInfo); NIceDb::TNiceDb db{txc.DB}; @@ -972,19 +979,20 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil if (!filled) { return false; } - LOG_D("FillPrefixedIndex::Level::Done " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillPrefixedVectorIndex DoneLevel " << buildInfo.DebugString()); ClearDoneShards(txc, buildInfo); Y_ASSERT(buildInfo.KMeans.State == TIndexBuildInfo::TKMeans::MultiLocal); const bool needsAnotherLevel = buildInfo.KMeans.NextLevel(); buildInfo.KMeans.State = TIndexBuildInfo::TKMeans::MultiLocal; buildInfo.KMeans.Parent = buildInfo.KMeans.ParentEnd(); - LOG_D("FillPrefixedIndex::NextLevel " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillPrefixedVectorIndex NextLevel " << buildInfo.DebugString()); PersistKMeansState(txc, buildInfo); NIceDb::TNiceDb db{txc.DB}; Self->PersistBuildIndexUploadReset(db, buildInfo); if (!needsAnotherLevel) { + LOG_D("FillPrefixedVectorIndex Done " << buildInfo.DebugString()); return true; } ChangeState(BuildId, TIndexBuildInfo::EState::DropBuild); @@ -994,32 +1002,32 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil } bool FillVectorIndex(TTransactionContext& txc, TIndexBuildInfo& buildInfo) { + LOG_D("FillVectorIndex Start " << buildInfo.DebugString()); + if (buildInfo.Sample.State == TIndexBuildInfo::TSample::EState::Upload) { return false; } if (InitSingleKMeans(buildInfo)) { - LOG_D("FillIndex::SingleKMeans::Start " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillVectorIndex SingleKMeans " << buildInfo.DebugString()); } if (!SendVectorIndex(buildInfo)) { return false; } - LOG_D("FillIndex::SendVectorIndex::Done " << buildInfo.KMeansTreeToDebugStr()); if (!buildInfo.Sample.Rows.empty()) { if (buildInfo.Sample.State == TIndexBuildInfo::TSample::EState::Collect) { - LOG_D("FillIndex::SendSample::Start " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillVectorIndex SendUploadSampleKRequest " << buildInfo.DebugString()); SendUploadSampleKRequest(buildInfo); return false; } - LOG_D("FillIndex::SendSample::Done " << buildInfo.KMeansTreeToDebugStr()); } - LOG_D("FillIndex::ClearDoneShards " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillVectorIndex DoneLevel " << buildInfo.DebugString()); ClearDoneShards(txc, buildInfo); if (!buildInfo.Sample.Rows.empty()) { if (buildInfo.KMeans.NextState()) { - LOG_D("FillIndex::NextState::Start " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillVectorIndex NextState " << buildInfo.DebugString()); PersistKMeansState(txc, buildInfo); Progress(BuildId); return false; @@ -1027,25 +1035,25 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil buildInfo.Sample.Clear(); NIceDb::TNiceDb db{txc.DB}; Self->PersistBuildIndexSampleForget(db, buildInfo); - LOG_D("FillIndex::NextState::Done " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillVectorIndex DoneState " << buildInfo.DebugString()); } if (buildInfo.KMeans.NextParent()) { - LOG_D("FillIndex::NextParent::Start " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillVectorIndex NextParent " << buildInfo.DebugString()); PersistKMeansState(txc, buildInfo); Progress(BuildId); return false; } if (InitMultiKMeans(buildInfo)) { - LOG_D("FillIndex::MultiKMeans::Start " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillVectorIndex MultiKMeans " << buildInfo.DebugString()); PersistKMeansState(txc, buildInfo); Progress(BuildId); return false; } if (buildInfo.KMeans.NextLevel()) { - LOG_D("FillIndex::NextLevel::Start " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillVectorIndex NextLevel " << buildInfo.DebugString()); PersistKMeansState(txc, buildInfo); NIceDb::TNiceDb db{txc.DB}; Self->PersistBuildIndexUploadReset(db, buildInfo); @@ -1055,7 +1063,7 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil Progress(BuildId); return false; } - LOG_D("FillIndex::Done " << buildInfo.KMeansTreeToDebugStr()); + LOG_D("FillVectorIndex Done " << buildInfo.DebugString()); return true; } @@ -1073,7 +1081,6 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil Y_ABORT_UNLESS(buildInfo.SnapshotStep); } if (buildInfo.Shards.empty()) { - LOG_D("FillIndex::InitiateShards " << buildInfo.KMeansTreeToDebugStr()); NIceDb::TNiceDb db(txc.DB); InitiateShards(db, buildInfo); } @@ -1324,6 +1331,8 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil } void InitiateShards(NIceDb::TNiceDb& db, TIndexBuildInfo& buildInfo) { + LOG_D("InitiateShards " << buildInfo.DebugString()); + Y_ASSERT(buildInfo.Shards.empty()); Y_ASSERT(buildInfo.ToUploadShards.empty()); Y_ASSERT(buildInfo.InProgressShards.empty()); diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h index 5b45c63307ec..e0d653bf7ab7 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.h +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h @@ -3153,7 +3153,7 @@ struct TIndexBuildInfo: public TSimpleRefCount { return ParentCount() * K; } - TString ToStr() const { + TString DebugString() const { return TStringBuilder() << "{ K = " << K << ", Level = " << Level << " / " << Levels @@ -3274,9 +3274,9 @@ struct TIndexBuildInfo: public TSimpleRefCount { } return maxParent; }(); - Y_VERIFY_DEBUG_S(minParent <= parentFrom, "minParent(" << minParent << ") > parentFrom(" << parentFrom << ") " << ToStr()); - Y_VERIFY_DEBUG_S(parentFrom <= parentTo, "parentFrom(" << parentFrom << ") > parentTo(" << parentTo << ") " << ToStr()); - Y_VERIFY_DEBUG_S(parentTo <= maxParent, "parentTo(" << parentTo << ") > maxParent(" << maxParent << ") " << ToStr()); + Y_VERIFY_DEBUG_S(minParent <= parentFrom, "minParent(" << minParent << ") > parentFrom(" << parentFrom << ") " << DebugString()); + Y_VERIFY_DEBUG_S(parentFrom <= parentTo, "parentFrom(" << parentFrom << ") > parentTo(" << parentTo << ") " << DebugString()); + Y_VERIFY_DEBUG_S(parentTo <= maxParent, "parentTo(" << parentTo << ") > maxParent(" << maxParent << ") " << DebugString()); return {parentFrom, parentTo}; } @@ -3465,14 +3465,20 @@ struct TIndexBuildInfo: public TSimpleRefCount { }; TSample Sample; - TString KMeansTreeToDebugStr() const { - return TStringBuilder() - << KMeans.ToStr() << ", " - << "{ Rows = " << Sample.Rows.size() - << ", Sample = " << Sample.State << " }, " - << "{ Done = " << DoneShards.size() - << ", ToUpload = " << ToUploadShards.size() - << ", InProgress = " << InProgressShards.size() << " }"; + TString DebugString() const { + auto result = TStringBuilder() << BuildKind; + + if (IsBuildVectorIndex()) { + result << " " + << KMeans.DebugString() << ", " + << "{ Rows = " << Sample.Rows.size() + << ", Sample = " << Sample.State << " }, " + << "{ Done = " << DoneShards.size() + << ", ToUpload = " << ToUploadShards.size() + << ", InProgress = " << InProgressShards.size() << " }"; + } + + return result; } struct TClusterShards { From 5a1e70f86b8acd94a279b914ee664ee3508074d7 Mon Sep 17 00:00:00 2001 From: kungasc Date: Thu, 17 Apr 2025 17:06:46 +0300 Subject: [PATCH 5/5] fix --- ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp index e2291e80b52f..7e2df90002e8 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp @@ -985,7 +985,9 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil Y_ASSERT(buildInfo.KMeans.State == TIndexBuildInfo::TKMeans::MultiLocal); const bool needsAnotherLevel = buildInfo.KMeans.NextLevel(); buildInfo.KMeans.State = TIndexBuildInfo::TKMeans::MultiLocal; - buildInfo.KMeans.Parent = buildInfo.KMeans.ParentEnd(); + if (buildInfo.KMeans.Level == 2) { + buildInfo.KMeans.Parent = buildInfo.KMeans.ParentEnd(); + } LOG_D("FillPrefixedVectorIndex NextLevel " << buildInfo.DebugString()); PersistKMeansState(txc, buildInfo);