-
Notifications
You must be signed in to change notification settings - Fork 55
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Introduce timeout for hanging workspaces #605
Changes from all commits
14dbb31
bc0d35a
cda34d6
9479380
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,6 +59,10 @@ import ( | |
dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2" | ||
) | ||
|
||
const ( | ||
startingWorkspaceRequeueInterval = 5 * time.Second | ||
) | ||
|
||
// DevWorkspaceReconciler reconciles a DevWorkspace object | ||
type DevWorkspaceReconciler struct { | ||
client.Client | ||
|
@@ -134,7 +138,11 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
// If debug annotation is present, leave the deployment in place to let users | ||
// view logs. | ||
if workspace.Annotations[constants.DevWorkspaceDebugStartAnnotation] == "true" { | ||
return reconcile.Result{}, nil | ||
if isTimeout, err := checkForFailingTimeout(workspace); err != nil { | ||
return reconcile.Result{}, err | ||
} else if !isTimeout { | ||
return reconcile.Result{}, nil | ||
} | ||
} | ||
|
||
patch := []byte(`{"spec":{"started": false}}`) | ||
|
@@ -181,8 +189,16 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
clusterWorkspace := workspace.DeepCopy() | ||
timingInfo := map[string]string{} | ||
timing.SetTime(timingInfo, timing.DevWorkspaceStarted) | ||
|
||
defer func() (reconcile.Result, error) { | ||
r.syncTimingToCluster(ctx, clusterWorkspace, timingInfo, reqLogger) | ||
// Don't accidentally suppress errors by overwriting here; only check for timeout when no error | ||
// encountered in main reconcile loop. | ||
if err == nil { | ||
if timeoutErr := checkForStartTimeout(clusterWorkspace); timeoutErr != nil { | ||
reconcileResult, err = r.failWorkspace(workspace, timeoutErr.Error(), reqLogger, &reconcileStatus) | ||
} | ||
} | ||
return r.updateWorkspaceStatus(clusterWorkspace, reqLogger, &reconcileStatus, reconcileResult, err) | ||
}() | ||
|
||
|
@@ -286,6 +302,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
message = routingStatus.Message | ||
} | ||
reconcileStatus.setConditionFalse(dw.DevWorkspaceRoutingReady, message) | ||
if !routingStatus.Requeue && routingStatus.Err == nil { | ||
return reconcile.Result{RequeueAfter: startingWorkspaceRequeueInterval}, nil | ||
} | ||
return reconcile.Result{Requeue: routingStatus.Requeue}, routingStatus.Err | ||
} | ||
reconcileStatus.setConditionTrue(dw.DevWorkspaceRoutingReady, "Networking ready") | ||
|
@@ -333,6 +352,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
// FailStartup is not possible for generating the serviceaccount | ||
reqLogger.Info("Waiting for workspace ServiceAccount") | ||
reconcileStatus.setConditionFalse(dw.DevWorkspaceServiceAccountReady, "Waiting for DevWorkspace ServiceAccount") | ||
if !serviceAcctStatus.Requeue && serviceAcctStatus.Err == nil { | ||
return reconcile.Result{RequeueAfter: startingWorkspaceRequeueInterval}, nil | ||
} | ||
return reconcile.Result{Requeue: serviceAcctStatus.Requeue}, serviceAcctStatus.Err | ||
} | ||
serviceAcctName := serviceAcctStatus.ServiceAccountName | ||
|
@@ -341,6 +363,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
pullSecretStatus := wsprovision.PullSecrets(clusterAPI, serviceAcctName, workspace.GetNamespace()) | ||
if !pullSecretStatus.Continue { | ||
reconcileStatus.setConditionFalse(conditions.PullSecretsReady, "Waiting for DevWorkspace pull secrets") | ||
if !pullSecretStatus.Requeue && pullSecretStatus.Err == nil { | ||
return reconcile.Result{RequeueAfter: startingWorkspaceRequeueInterval}, nil | ||
} | ||
return reconcile.Result{Requeue: pullSecretStatus.Requeue}, pullSecretStatus.Err | ||
} | ||
allPodAdditions = append(allPodAdditions, pullSecretStatus.PodAdditions) | ||
|
@@ -355,6 +380,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request | |
} | ||
reqLogger.Info("Waiting on deployment to be ready") | ||
reconcileStatus.setConditionFalse(conditions.DeploymentReady, "Waiting for workspace deployment") | ||
if !deploymentStatus.Requeue && deploymentStatus.Err == nil { | ||
return reconcile.Result{RequeueAfter: startingWorkspaceRequeueInterval}, nil | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does the last or the first interval win here?
The question is, how many times reconcile loops our There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My understanding is that reconciling due to an event cancels out a requeueAfter, but it's hard to check for sure. In my testing, I don't see bursts of "waiting on deployment" in the logs, which is what would likely happen if we were stacking requeues. |
||
} | ||
return reconcile.Result{Requeue: deploymentStatus.Requeue}, deploymentStatus.Err | ||
} | ||
reconcileStatus.setConditionTrue(conditions.DeploymentReady, "DevWorkspace deployment ready") | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if 5 seconds is always the best choice, or whether we could make it dynamic, e.g. 1, 2, or 3 minutes in the case of a 5-minute timeout?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mostly picked 5 seconds because it feels long enough that it's not burdening the controller (we can reconcile many times a second) but also short enough to avoid strange issues in setting a timeout (e.g. if we set a 1 minute requeue, what happens if the config specifies a timeout of 1 minute 15 seconds?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think there is a use case for such a precise timeout. Maybe we can declare our precision to be something like 1 minute or 30 seconds.
Actually, I thought about requeuing after (timeout + last transition time - now + 5 sec), which would trigger a reconcile 5 seconds after the point at which we potentially need to kill the workspace.
But I'm OK with either approach as long as it does not generate redundant load.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tested this informally by starting a workspace that will time out (ignore `FailedScheduling` events and ask for 100Gi in Theia). While the workspace is looping on checking the deployment every 5 seconds, I ran a command to trigger 10 reconciles to the object, with the assumption that each of those would also start a `RequeueAfter`. However, after the 10 quick reconciles are completed, the controller goes back to queuing one reconcile every five seconds, rather than 10 reconciles every 5 seconds, so it seems like `RequeueAfter` is cancelled if an event triggers a reconcile.
I agree that 5 seconds may be the wrong value here; we should tweak this later.