Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix] fix bug that replica can not be repaired duo to DECOMMISSION state #9424

Merged
merged 3 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1163,6 +1163,8 @@ public String toString() {
sb.append(". to backend: ").append(destBackendId);
sb.append(", dest path hash: ").append(destPathHash);
}
sb.append(", visible version: ").append(visibleVersion);
sb.append(", committed version: ").append(committedVersion);
if (errMsg != null) {
sb.append(". err: ").append(errMsg);
}
Expand All @@ -1184,4 +1186,26 @@ public int compare(Replica r1, Replica r2) {
}
}
}

/**
* call this when releaseTabletCtx()
*/
public void resetReplicaState() {
if (tablet != null) {
for (Replica replica : tablet.getReplicas()) {
// To address issue: https://github.com/apache/incubator-doris/issues/9422
// the DECOMMISSION state is set in TabletScheduler and not persist to meta.
// So it is reasonable to reset this state if we failed to scheduler this tablet.
// That is, if the TabletScheduler cannot process the tablet, then it should reset
// any intermediate state it set during the scheduling process.
if (replica.getState() == ReplicaState.DECOMMISSION) {
replica.setState(ReplicaState.NORMAL);
replica.setWatermarkTxnId(-1);
LOG.debug("reset replica {} on backend {} of tablet {} state from DECOMMISSION to NORMAL",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LOG.warn() is better,resetReplicaState will not be called frequently, so this log will not be too many, but we have to known which tablet is reset, to find out why it is be like this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may print a lot log. And actually, this is not an error. It is a common case.

replica.getId(), replica.getBackendId(), tabletId);
}
}
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -402,12 +402,12 @@ private void schedulePendingTablets() {
if (tabletCtx.getType() == Type.BALANCE) {
// if balance is disabled, remove this tablet
if (Config.disable_balance) {
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED,
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, e.getStatus(),
"disable balance and " + e.getMessage());
} else {
// remove the balance task if it fails to be scheduled many times
if (tabletCtx.getFailedSchedCounter() > 10) {
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED,
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, e.getStatus(),
"schedule failed too many times and " + e.getMessage());
} else {
// we must release resource it current hold, and be scheduled again
Expand All @@ -427,19 +427,19 @@ private void schedulePendingTablets() {
} else if (e.getStatus() == Status.FINISHED) {
// schedule redundant tablet or scheduler disabled will throw this exception
stat.counterTabletScheduledSucceeded.incrementAndGet();
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.FINISHED, e.getMessage());
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.FINISHED, e.getStatus(), e.getMessage());
} else {
Preconditions.checkState(e.getStatus() == Status.UNRECOVERABLE, e.getStatus());
// discard
stat.counterTabletScheduledDiscard.incrementAndGet();
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, e.getMessage());
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, e.getStatus(), e.getMessage());
}
continue;
} catch (Exception e) {
LOG.warn("got unexpected exception, discard this schedule. tablet: {}",
tabletCtx.getTabletId(), e);
stat.counterTabletScheduledFailed.incrementAndGet();
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, e.getMessage());
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, Status.UNRECOVERABLE, e.getMessage());
continue;
}

Expand Down Expand Up @@ -548,7 +548,8 @@ private void scheduleTablet(TabletSchedCtx tabletCtx, AgentBatchTask batchTask)
for (TransactionState transactionState : dbTransactionMgr.getPreCommittedTxnList()) {
if(transactionState.getTableIdList().contains(tbl.getId())) {
// If table releate to transaction with precommitted status, do not allow to do balance.
throw new SchedException(Status.UNRECOVERABLE, "There exists PRECOMMITTED transaction releated to table");
throw new SchedException(Status.UNRECOVERABLE,
"There exists PRECOMMITTED transaction related to table");
}
}
} catch (AnalysisException e) {
Expand Down Expand Up @@ -577,7 +578,7 @@ private void scheduleTablet(TabletSchedCtx tabletCtx, AgentBatchTask batchTask)
throw new SchedException(Status.UNRECOVERABLE, "tablet is unhealthy when doing balance");
}

// for disk balance more accutely, we only schedule tablet when has lastly stat info about disk
// for disk balance more accurately, we only schedule tablet when has lastly stat info about disk
if (tabletCtx.getType() == TabletSchedCtx.Type.BALANCE &&
tabletCtx.getBalanceType() == TabletSchedCtx.BalanceType.DISK_BALANCE) {
checkDiskBalanceLastSuccTime(tabletCtx.getTempSrcBackendId(), tabletCtx.getTempSrcPathHash());
Expand Down Expand Up @@ -1093,7 +1094,6 @@ private void handleReplicaTooSlow(TabletSchedCtx tabletCtx) throws SchedExceptio
}

private void deleteReplicaInternal(TabletSchedCtx tabletCtx, Replica replica, String reason, boolean force) throws SchedException {

/*
* Before deleting a replica, we should make sure that there is no running txn on it and no more txns will be on it.
* So we do followings:
Expand All @@ -1109,6 +1109,8 @@ private void deleteReplicaInternal(TabletSchedCtx tabletCtx, Replica replica, St
replica.setState(ReplicaState.DECOMMISSION);
// set priority to normal because it may wait for a long time. Remain it as VERY_HIGH may block other task.
tabletCtx.setOrigPriority(Priority.NORMAL);
LOG.debug("set replica {} on backend {} of tablet {} state to DECOMMISSION",
replica.getId(), replica.getBackendId(), tabletCtx.getTabletId());
throw new SchedException(Status.SCHEDULE_FAILED, "set watermark txn " + nextTxnId);
} else if (replica.getState() == ReplicaState.DECOMMISSION && replica.getWatermarkTxnId() != -1) {
long watermarkTxnId = replica.getWatermarkTxnId();
Expand Down Expand Up @@ -1389,17 +1391,20 @@ private void dynamicAdjustPrioAndAddBackToPendingTablets(TabletSchedCtx tabletCt
addTablet(tabletCtx, true /* force */);
}

private void finalizeTabletCtx(TabletSchedCtx tabletCtx, TabletSchedCtx.State state, String reason) {
private void finalizeTabletCtx(TabletSchedCtx tabletCtx, TabletSchedCtx.State state, Status status, String reason) {
// use 2 steps to avoid nested database lock and synchronized.(releaseTabletCtx() may hold db lock)
// remove the tablet ctx, so that no other process can see it
removeTabletCtx(tabletCtx, reason);
// release resources taken by tablet ctx
releaseTabletCtx(tabletCtx, state);
releaseTabletCtx(tabletCtx, state, status == Status.UNRECOVERABLE);
}

private void releaseTabletCtx(TabletSchedCtx tabletCtx, TabletSchedCtx.State state) {
private void releaseTabletCtx(TabletSchedCtx tabletCtx, TabletSchedCtx.State state, boolean resetReplicaState) {
tabletCtx.setState(state);
tabletCtx.releaseResource(this);
if (resetReplicaState) {
tabletCtx.resetReplicaState();
}
tabletCtx.setFinishedTime(System.currentTimeMillis());
}

Expand Down Expand Up @@ -1454,7 +1459,7 @@ public boolean finishStorageMediaMigrationTask(StorageMediaMigrationTask migrati
updateDiskBalanceLastSuccTime(tabletCtx.getDestBackendId(), tabletCtx.getDestPathHash());
}
// we need this function to free slot for this migration task
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.FINISHED, "finished");
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.FINISHED, Status.FINISHED, "finished");
return true;
}
/**
Expand Down Expand Up @@ -1487,25 +1492,25 @@ public boolean finishCloneTask(CloneTask cloneTask, TFinishTaskRequest request)
} else if (e.getStatus() == Status.UNRECOVERABLE) {
// unrecoverable
stat.counterTabletScheduledDiscard.incrementAndGet();
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, e.getMessage());
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, e.getStatus(), e.getMessage());
return true;
} else if (e.getStatus() == Status.FINISHED) {
// tablet is already healthy, just remove
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, e.getMessage());
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, e.getStatus(), e.getMessage());
return true;
}
} catch (Exception e) {
LOG.warn("got unexpected exception when finish clone task. tablet: {}",
tabletCtx.getTabletId(), e);
stat.counterTabletScheduledDiscard.incrementAndGet();
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, e.getMessage());
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, Status.UNRECOVERABLE, e.getMessage());
return true;
}

Preconditions.checkState(tabletCtx.getState() == TabletSchedCtx.State.FINISHED);
stat.counterCloneTaskSucceeded.incrementAndGet();
gatherStatistics(tabletCtx);
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.FINISHED, "finished");
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.FINISHED, Status.FINISHED, "finished");
return true;
}

Expand Down Expand Up @@ -1569,7 +1574,10 @@ public void handleRunningTablets() {

// 2. release ctx
timeoutTablets.stream().forEach(t -> {
releaseTabletCtx(t, TabletSchedCtx.State.CANCELLED);
// Set "resetReplicaState" to true because
// the timeout task should also be considered as UNRECOVERABLE,
// so need to reset replica state.
releaseTabletCtx(t, TabletSchedCtx.State.CANCELLED, true);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is better to add a log here, it is useful for us to find why resetReplicaState is called. This log will be not too many.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need, this is an origin logic, and we don't expect this log in the past.

stat.counterCloneTaskTimeout.incrementAndGet();
});
}
Expand Down