Skip to content

Commit aa1663f

Browse files
authored
Handle unhandled exceptions during build index SchemeShard init (#19312)
1 parent 1180c08 commit aa1663f

9 files changed

+254
-122
lines changed

ydb/core/tx/schemeshard/schemeshard__init.cpp

Lines changed: 70 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4561,6 +4561,36 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45614561

45624562
// Read index build
45634563
{
4564+
// Runs `fill(buildInfo)` for the init step named `stepName` and contains any
// std::exception it throws: the failure is logged and the build is flagged
// as broken instead of letting the exception propagate out of this lambda.
// Exceptions not derived from std::exception are not caught here.
auto fillBuildInfoSafe = [&](TIndexBuildInfo& buildInfo, const TString& stepName, const auto& fill) {
4565+
try {
4566+
fill(buildInfo);
4567+
} catch (const std::exception& exc) {
4568+
// Log the step name, build id, exception type and message, a backtrace
// of the current exception, and the build info as filled so far.
LOG_ERROR_S(ctx, NKikimrServices::BUILD_INDEX,
4569+
"Init " << stepName << " unhandled exception, id#" << buildInfo.Id
4570+
<< " " << TypeName(exc) << ": " << exc.what() << Endl
4571+
<< TBackTrace::FromCurrentException().PrintToString()
4572+
<< ", TIndexBuildInfo: " << buildInfo);
4573+
4574+
// in-memory volatile state:
// (the flag and the issue text are set on the object only; this lambda
// does not write them to the database)
4575+
buildInfo.IsBroken = true;
4576+
buildInfo.AddIssue(TStringBuilder() << "Init " << stepName << " unhandled exception " << exc.what());
4577+
}
4578+
};
4579+
4580+
// Looks up the build with the given `id` in Self->IndexBuilds and applies
// `fill` to it through fillBuildInfoSafe. A missing id is logged and skipped;
// a build already marked IsBroken is left untouched, so `fill` is not run
// for it.
auto fillBuildInfoByIdSafe = [&](TIndexBuildId id, const TString& stepName, const auto& fill) {
4581+
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4582+
// NOTE(review): Y_ASSERT is presumably a debug-only check, which is why the
// explicit null test below is still needed — confirm.
Y_ASSERT(buildInfoPtr);
4583+
if (!buildInfoPtr) {
4584+
LOG_ERROR_S(ctx, NKikimrServices::BUILD_INDEX,
4585+
"Init " << stepName << " BuildInfo not found: id#" << id);
4586+
return;
4587+
}
4588+
auto& buildInfo = *buildInfoPtr->Get();
4589+
// Skip builds that an earlier step already flagged as broken.
if (!buildInfo.IsBroken) {
4590+
fillBuildInfoSafe(buildInfo, stepName, fill);
4591+
}
4592+
};
4593+
45644594
// read main info
45654595
{
45664596
auto rowset = db.Table<Schema::IndexBuild>().Range().Select();
@@ -4569,17 +4599,21 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45694599
}
45704600

45714601
while (!rowset.EndOfSet()) {
4572-
TIndexBuildInfo::TPtr indexInfo = TIndexBuildInfo::FromRow(rowset);
4573-
4574-
auto [it, emplaced] = Self->IndexBuilds.emplace(indexInfo->Id, indexInfo);
4575-
Y_ABORT_UNLESS(emplaced);
4576-
if (indexInfo->Uid) {
4577-
// TODO(mbkkt) It also should be unique, but we're not sure.
4578-
Y_ASSERT(!Self->IndexBuildsByUid.contains(indexInfo->Uid));
4579-
Self->IndexBuildsByUid[indexInfo->Uid] = indexInfo;
4602+
TIndexBuildInfo::TPtr buildInfo = new TIndexBuildInfo();
4603+
fillBuildInfoSafe(*buildInfo, "IndexBuild", [&](TIndexBuildInfo& buildInfo) {
4604+
TIndexBuildInfo::FillFromRow(rowset, &buildInfo);
4605+
});
4606+
4607+
// Note: broken builds are also added to IndexBuilds
4608+
Y_ASSERT(!Self->IndexBuilds.contains(buildInfo->Id));
4609+
Self->IndexBuilds[buildInfo->Id] = buildInfo;
4610+
4611+
if (buildInfo->Uid) {
4612+
Y_ASSERT(!Self->IndexBuildsByUid.contains(buildInfo->Uid));
4613+
Self->IndexBuildsByUid[buildInfo->Uid] = buildInfo;
45804614
}
45814615

4582-
OnComplete.ToProgress(indexInfo->Id);
4616+
OnComplete.ToProgress(buildInfo->Id);
45834617

45844618
if (!rowset.Next()) {
45854619
return false;
@@ -4601,19 +4635,18 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
46014635

46024636
while (!rowset.EndOfSet()) {
46034637
TIndexBuildId id = rowset.GetValue<Schema::KMeansTreeProgress::Id>();
4604-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4605-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found: id# " << id);
4606-
auto& buildInfo = *buildInfoPtr->Get();
4607-
buildInfo.KMeans.Set(
4608-
rowset.GetValue<Schema::KMeansTreeProgress::Level>(),
4609-
rowset.GetValue<Schema::KMeansTreeProgress::ParentBegin>(),
4610-
rowset.GetValue<Schema::KMeansTreeProgress::Parent>(),
4611-
rowset.GetValue<Schema::KMeansTreeProgress::ChildBegin>(),
4612-
rowset.GetValue<Schema::KMeansTreeProgress::Child>(),
4613-
rowset.GetValue<Schema::KMeansTreeProgress::State>(),
4614-
rowset.GetValue<Schema::KMeansTreeProgress::TableSize>()
4615-
);
4616-
buildInfo.Sample.Rows.reserve(buildInfo.KMeans.K * 2);
4638+
fillBuildInfoByIdSafe(id, "KMeansTreeProgress", [&](TIndexBuildInfo& buildInfo) {
4639+
buildInfo.KMeans.Set(
4640+
rowset.GetValue<Schema::KMeansTreeProgress::Level>(),
4641+
rowset.GetValue<Schema::KMeansTreeProgress::ParentBegin>(),
4642+
rowset.GetValue<Schema::KMeansTreeProgress::Parent>(),
4643+
rowset.GetValue<Schema::KMeansTreeProgress::ChildBegin>(),
4644+
rowset.GetValue<Schema::KMeansTreeProgress::Child>(),
4645+
rowset.GetValue<Schema::KMeansTreeProgress::State>(),
4646+
rowset.GetValue<Schema::KMeansTreeProgress::TableSize>()
4647+
);
4648+
buildInfo.Sample.Rows.reserve(buildInfo.KMeans.K * 2);
4649+
});
46174650

46184651
if (!rowset.Next()) {
46194652
return false;
@@ -4632,13 +4665,12 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
46324665
size_t sampleCount = 0;
46334666
while (!rowset.EndOfSet()) {
46344667
TIndexBuildId id = rowset.GetValue<Schema::KMeansTreeSample::Id>();
4635-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4636-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found: id# " << id);
4637-
auto& buildInfo = *buildInfoPtr->Get();
4638-
buildInfo.Sample.Add(
4639-
rowset.GetValue<Schema::KMeansTreeSample::Probability>(),
4640-
rowset.GetValue<Schema::KMeansTreeSample::Data>()
4641-
);
4668+
fillBuildInfoByIdSafe(id, "KMeansTreeSample", [&](TIndexBuildInfo& buildInfo) {
4669+
buildInfo.Sample.Add(
4670+
rowset.GetValue<Schema::KMeansTreeSample::Probability>(),
4671+
rowset.GetValue<Schema::KMeansTreeSample::Data>()
4672+
);
4673+
});
46424674
sampleCount++;
46434675

46444676
if (!rowset.Next()) {
@@ -4660,11 +4692,9 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
46604692

46614693
while (!rowset.EndOfSet()) {
46624694
TIndexBuildId id = rowset.GetValue<Schema::IndexBuildColumns::Id>();
4663-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4664-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found"
4665-
<< ": id# " << id);
4666-
auto& buildInfo = *buildInfoPtr->Get();
4667-
buildInfo.AddIndexColumnInfo(rowset);
4695+
fillBuildInfoByIdSafe(id, "IndexBuildColumns", [&](TIndexBuildInfo& buildInfo) {
4696+
buildInfo.AddIndexColumnInfo(rowset);
4697+
});
46684698

46694699
if (!rowset.Next()) {
46704700
return false;
@@ -4680,11 +4710,9 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
46804710

46814711
while (!rowset.EndOfSet()) {
46824712
TIndexBuildId id = rowset.GetValue<Schema::BuildColumnOperationSettings::Id>();
4683-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4684-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found"
4685-
<< ": id# " << id);
4686-
auto& buildInfo = *buildInfoPtr->Get();
4687-
buildInfo.AddBuildColumnInfo(rowset);
4713+
fillBuildInfoByIdSafe(id, "BuildColumnOperationSettings", [&](TIndexBuildInfo& buildInfo) {
4714+
buildInfo.AddBuildColumnInfo(rowset);
4715+
});
46884716

46894717
if (!rowset.Next()) {
46904718
return false;
@@ -4701,11 +4729,9 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
47014729

47024730
while (!rowset.EndOfSet()) {
47034731
TIndexBuildId id = rowset.GetValue<Schema::IndexBuildShardStatus::Id>();
4704-
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
4705-
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found"
4706-
<< ": id# " << id);
4707-
auto& buildInfo = *buildInfoPtr->Get();
4708-
buildInfo.AddShardStatus(rowset);
4732+
fillBuildInfoByIdSafe(id, "IndexBuildShardStatus", [&](TIndexBuildInfo& buildInfo) {
4733+
buildInfo.AddShardStatus(rowset);
4734+
});
47094735

47104736
if (!rowset.Next()) {
47114737
return false;

ydb/core/tx/schemeshard/schemeshard__monitoring.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,8 @@ struct TSchemeShard::TTxMonitoring : public NTabletFlatExecutor::TTransactionBas
818818
<< "CancelRequested: " << (info.CancelRequested ? "YES" : "NO") << Endl
819819

820820
<< "State: " << info.State << Endl
821-
<< "Issue: " << info.Issue << Endl
821+
<< "IsBroken: " << info.IsBroken << Endl
822+
<< "Issue: " << info.GetIssue() << Endl
822823

823824
<< "Shards.size: " << info.Shards.size() << Endl
824825
<< "ToUploadShards.size: " << info.ToUploadShards.size() << Endl

ydb/core/tx/schemeshard/schemeshard_build_index.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ void TSchemeShard::Handle(TEvPrivate::TEvIndexBuildingMakeABill::TPtr& ev, const
5353
}
5454

5555
void TSchemeShard::PersistCreateBuildIndex(NIceDb::TNiceDb& db, const TIndexBuildInfo& info) {
56-
Y_ABORT_UNLESS(info.BuildKind != TIndexBuildInfo::EBuildKind::BuildKindUnspecified);
56+
Y_ENSURE(info.BuildKind != TIndexBuildInfo::EBuildKind::BuildKindUnspecified);
5757
auto persistedBuildIndex = db.Table<Schema::IndexBuild>().Key(info.Id);
5858
persistedBuildIndex.Update(
5959
NIceDb::TUpdate<Schema::IndexBuild::Uid>(info.Uid),
@@ -126,7 +126,7 @@ void TSchemeShard::PersistCreateBuildIndex(NIceDb::TNiceDb& db, const TIndexBuil
126126
void TSchemeShard::PersistBuildIndexState(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo) {
127127
db.Table<Schema::IndexBuild>().Key(indexInfo.Id).Update(
128128
NIceDb::TUpdate<Schema::IndexBuild::State>(ui32(indexInfo.State)),
129-
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.Issue),
129+
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.GetIssue()),
130130
NIceDb::TUpdate<Schema::IndexBuild::StartTime>(indexInfo.StartTime.Seconds()),
131131
NIceDb::TUpdate<Schema::IndexBuild::EndTime>(indexInfo.EndTime.Seconds())
132132
);
@@ -139,7 +139,7 @@ void TSchemeShard::PersistBuildIndexCancelRequest(NIceDb::TNiceDb& db, const TIn
139139

140140
void TSchemeShard::PersistBuildIndexIssue(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo) {
141141
db.Table<Schema::IndexBuild>().Key(indexInfo.Id).Update(
142-
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.Issue));
142+
NIceDb::TUpdate<Schema::IndexBuild::Issue>(indexInfo.GetIssue()));
143143
}
144144

145145
void TSchemeShard::PersistBuildIndexAlterMainTableTxId(NIceDb::TNiceDb& db, const TIndexBuildInfo& indexInfo) {
@@ -314,9 +314,12 @@ void TSchemeShard::PersistBuildIndexForget(NIceDb::TNiceDb& db, const TIndexBuil
314314

315315
void TSchemeShard::Resume(const TDeque<TIndexBuildId>& indexIds, const TActorContext& ctx) {
316316
for (const auto& id : indexIds) {
317-
if (IndexBuilds.contains(id)) {
318-
Execute(CreateTxProgress(id), ctx);
317+
const auto* buildInfoPtr = IndexBuilds.FindPtr(id);
318+
if (!buildInfoPtr || buildInfoPtr->Get()->IsBroken) {
319+
continue;
319320
}
321+
322+
Execute(CreateTxProgress(id), ctx);
320323
}
321324
}
322325

@@ -331,7 +334,7 @@ void TSchemeShard::SetupRouting(const TDeque<TIndexBuildId>& indexIds, const TAc
331334
auto handle = [&] (auto txId) {
332335
if (txId) {
333336
auto [it, emplaced] = TxIdToIndexBuilds.try_emplace(txId, buildInfo.Id);
334-
Y_ABORT_UNLESS(it->second == buildInfo.Id);
337+
Y_ENSURE(it->second == buildInfo.Id);
335338
}
336339
};
337340

ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,9 @@ class TSchemeShard::TIndexBuilder::TTxCreate: public TSchemeShard::TIndexBuilder
8282
}
8383
}
8484

85-
TIndexBuildInfo::TPtr buildInfo = new TIndexBuildInfo(BuildId, uid);
85+
TIndexBuildInfo::TPtr buildInfo = new TIndexBuildInfo();
86+
buildInfo->Id = BuildId;
87+
buildInfo->Uid = uid;
8688
buildInfo->DomainPathId = domainPath.Base()->PathId;
8789
buildInfo->TablePathId = tablePath.Base()->PathId;
8890

0 commit comments

Comments
 (0)