Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: hook pod gone away may block new deployment #1482

Merged
Merged
5 changes: 2 additions & 3 deletions operator/pkg/controllers/bkapp/deploy_action.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,8 @@ func (r *DeployActionReconciler) Reconcile(ctx context.Context, bkapp *paasv1alp
// If this is not the initial deploy action, check if there is any preceding running hooks,
// wait for these hooks by return an error to delay for another reconcile cycle.
//
// TODO: Should we remove this logic and allow every new deploy action to start even the
// hook triggered by older deploy is not finished yet?
if bkapp.Status.DeployId != "" {
// If hook already turned off, validateNoRunningHooks will not execute (disregarding previous hooks).
narasux marked this conversation as resolved.
Show resolved Hide resolved
if bkapp.Spec.Hooks != nil && bkapp.Status.DeployId != "" {
if err = r.validateNoRunningHooks(ctx, bkapp); err != nil {
return r.Result.WithError(err)
}
Expand Down
42 changes: 29 additions & 13 deletions operator/pkg/controllers/bkapp/hooks/hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func (r *HookReconciler) Reconcile(ctx context.Context, bkapp *paasv1alpha2.BkAp

log.V(1).Info("handling pre-release-hook reconciliation")
if current.Pod != nil {
if err := r.UpdateStatus(ctx, bkapp, current, hookres.HookExecuteTimeoutThreshold); err != nil {
if err := r.UpdateStatus(bkapp, current, hookres.HookExecuteTimeoutThreshold); err != nil {
return r.Result.WithError(err)
}

Expand All @@ -87,7 +87,6 @@ func (r *HookReconciler) Reconcile(ctx context.Context, bkapp *paasv1alpha2.BkAp
return r.Result.End()
case current.Progressing():
// 当 Hook 执行成功或失败时会由 owned pod 触发新的调和循环, 因此只需要通过 Requeue 处理超时事件即可

return r.Result.Requeue(hookres.HookExecuteTimeoutThreshold)
case current.Succeeded():
return r.Result
Expand Down Expand Up @@ -139,11 +138,18 @@ func (r *HookReconciler) Reconcile(ctx context.Context, bkapp *paasv1alpha2.BkAp
// 获取应用当前在集群中的状态
func (r *HookReconciler) getCurrentState(ctx context.Context, bkapp *paasv1alpha2.BkApp) hookres.HookInstance {
pod := corev1.Pod{}
err := r.Client.Get(ctx, types.NamespacedName{Name: names.PreReleaseHook(bkapp), Namespace: bkapp.Namespace}, &pod)
if err != nil {
key := types.NamespacedName{Namespace: bkapp.Namespace, Name: names.PreReleaseHook(bkapp)}
if err := r.Client.Get(ctx, key, &pod); err != nil {
return hookres.HookInstance{
Pod: nil,
Status: nil,
Pod: nil,
Status: &paasv1alpha2.HookStatus{
Type: paasv1alpha2.HookPreRelease,
Started: lo.ToPtr(false),
StartTime: nil,
Phase: paasv1alpha2.HealthUnknown,
Reason: "Failed",
Message: "PreReleaseHook not found",
narasux marked this conversation as resolved.
Show resolved Hide resolved
},
}
}

Expand Down Expand Up @@ -213,7 +219,6 @@ func (r *HookReconciler) ExecuteHook(

// UpdateStatus will update bkapp hook status from the given instance status
func (r *HookReconciler) UpdateStatus(
ctx context.Context,
bkapp *paasv1alpha2.BkApp,
instance hookres.HookInstance,
timeoutThreshold time.Duration,
Expand All @@ -228,6 +233,20 @@ func (r *HookReconciler) UpdateStatus(
observedGeneration = bkapp.Generation
}

// 若 Hook Pod 不存在,则应该判定 Hook 执行失败
if instance.Pod == nil {
bkapp.Status.Phase = paasv1alpha2.AppFailed
apimeta.SetStatusCondition(&bkapp.Status.Conditions, metav1.Condition{
Type: paasv1alpha2.HooksFinished,
Status: metav1.ConditionFalse,
Reason: instance.Status.Reason,
Message: instance.Status.Message,
ObservedGeneration: observedGeneration,
})
r.updateAppProgressingStatus(bkapp, metav1.ConditionFalse)
return nil
}

switch {
case instance.TimeoutExceededProgressing(timeoutThreshold):
bkapp.Status.Phase = paasv1alpha2.AppFailed
Expand Down Expand Up @@ -325,21 +344,18 @@ func CheckAndUpdatePreReleaseHookStatus(
r := NewHookReconciler(cli)
instance := r.getCurrentState(ctx, bkapp)

if instance.Pod == nil {
return false, errors.New("pre-release-hook not found")
}

if err = r.UpdateStatus(ctx, bkapp, instance, timeout); err != nil {
if err = r.UpdateStatus(bkapp, instance, timeout); err != nil {
return false, err
}

narasux marked this conversation as resolved.
Show resolved Hide resolved
switch {
// 删除超时的 pod
// 删除超时的 Pod
case instance.TimeoutExceededProgressing(timeout):
if err = cli.Delete(ctx, instance.Pod); err != nil {
return false, err
}
return false, errors.WithStack(hookres.ErrExecuteTimeout)
// 若 instance.Pod 为 nil,会判定为失败
case instance.Failed():
return false, errors.Wrapf(
hookres.ErrPodEndsUnsuccessfully,
Expand Down
4 changes: 3 additions & 1 deletion operator/pkg/controllers/bkapp/hooks/hooks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,9 @@ var _ = Describe("Test HookReconciler", func() {
)

Expect(finished).To(BeFalse())
Expect(err.Error()).To(Equal("pre-release-hook not found"))
narasux marked this conversation as resolved.
Show resolved Hide resolved
Expect(
err.Error(),
).To(Equal("hook failed with: PreReleaseHook not found: pod ends unsuccessfully"))
})

It("Pod Execute Failed", func() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func BuildProcDeployment(app *paasv1alpha2.BkApp, procName string) (*appsv1.Depl
}
annotations := map[string]string{
paasv1alpha2.DeployIDAnnoKey: deployID,
paasv1alpha2.LastSyncedSerializedBkAppAnnoKey: string(bkAppJson),
paasv1alpha2.LastSyncedSerializedBkAppAnnoKey: bkAppJson,
}

deployment := &appsv1.Deployment{
Expand Down