Introduce timeout for hanging workspaces #605

Merged: 4 commits, Sep 30, 2021

Changes from all commits
@@ -61,6 +61,11 @@ type WorkspaceConfig struct
// requires support in the workspace being started. If not specified, the default
// value of "15m" is used.
IdleTimeout string `json:"idleTimeout,omitempty"`
// ProgressTimeout determines the maximum duration a DevWorkspace can be in
// a "Starting" or "Failing" phase without progressing before it is automatically failed.
// Duration should be specified in a format parseable by Go's time package, e.g.
// "15m", "20s", "1h30m", etc. If not specified, the default value of "5m" is used.
ProgressTimeout string `json:"progressTimeout,omitempty"`
// IgnoredUnrecoverableEvents defines a list of Kubernetes event names that should
// be ignored when deciding to fail a DevWorkspace startup. This option should be used
// if a transient cluster issue is triggering false-positives (for example, if
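For reference, these values are parsed with Go's standard time.ParseDuration; a minimal sketch (illustrative values only, not operator code) of what is and is not accepted:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Values like the ones documented above parse cleanly.
	for _, v := range []string{"15m", "20s", "1h30m"} {
		d, err := time.ParseDuration(v)
		fmt.Println(v, d, err) // e.g. 1h30m 1h30m0s <nil>
	}
	// A bare number is rejected; this is the kind of error that
	// "invalid duration specified for timeout" wraps below.
	_, err := time.ParseDuration("15")
	fmt.Println(err) // time: missing unit in duration "15"
}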
30 changes: 29 additions & 1 deletion controllers/workspace/devworkspace_controller.go
@@ -59,6 +59,10 @@ import (
dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
)

const (
startingWorkspaceRequeueInterval = 5 * time.Second
Member:

I wonder if 5 seconds is always the best choice, or whether we could make it dynamic, e.g. 1 to 3 minutes in the case of a 5-minute timeout?

Collaborator Author:

I mostly picked 5 seconds because it feels long enough that it's not burdening the controller (we can reconcile many times a second) but also short enough to avoid strange issues in setting a timeout (e.g. if we set a 1-minute requeue, what happens if the config specifies a timeout of 1 minute 15 seconds?).

Member:

> 1 minute 15 seconds

I don't think there is a case for such a precise timeout. Maybe we can declare our precision to be something like 1 minute or 30 seconds.

And actually, I thought about reconciling after (timeout + last transition time - now + 5 sec), which would kick off the reconcile loop 5 seconds after the point where we potentially need to kill the workspace.

But I'm OK with any approach as long as it does not generate redundant load.

Collaborator Author (@amisevsk, Sep 30, 2021):
I tested this informally by starting a workspace that will time out (ignoring FailedScheduling events and asking for 100Gi in Theia). While the workspace was looping on checking the deployment every 5 seconds, I ran

for i in {1..5}; do
  kubectl patch dw theia-next --type merge -p '{"metadata": {"labels": {"touch": "false"}}}'
  sleep 0.5
  kubectl patch dw theia-next --type merge -p '{"metadata": {"labels": {"touch": "true"}}}'
  sleep 0.5
done

to trigger 10 reconciles of the object, on the assumption that each of those would also schedule a RequeueAfter. However, after the 10 quick reconciles complete, the controller goes back to queuing one reconcile every five seconds, rather than 10 reconciles every 5 seconds, so it seems like RequeueAfter is cancelled if an event triggers a reconcile.

I agree that 5 seconds may be the wrong value here; we should tweak this later.
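To make the precision concern above concrete, a small illustrative sketch (made-up numbers, not operator code): a timeout can only be noticed on a reconcile, so detection can lag by up to one requeue interval.

package main

import (
	"fmt"
	"time"
)

// detectedAt returns when the first reconcile at or after the timeout would
// run, assuming a reconcile fires exactly every interval and nothing else
// wakes the controller.
func detectedAt(timeout, interval time.Duration) time.Duration {
	n := (timeout + interval - 1) / interval // ceiling division
	return n * interval
}

func main() {
	// With a 1m requeue and a 1m15s timeout, the failure is only noticed
	// at the 2m mark, 45s late; a 5s requeue notices it on time.
	fmt.Println(detectedAt(75*time.Second, time.Minute))   // 2m0s
	fmt.Println(detectedAt(75*time.Second, 5*time.Second)) // 1m15s
}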

)

// DevWorkspaceReconciler reconciles a DevWorkspace object
type DevWorkspaceReconciler struct {
client.Client
@@ -134,7 +138,11 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
// If debug annotation is present, leave the deployment in place to let users
// view logs.
if workspace.Annotations[constants.DevWorkspaceDebugStartAnnotation] == "true" {
return reconcile.Result{}, nil
if isTimeout, err := checkForFailingTimeout(workspace); err != nil {
return reconcile.Result{}, err
} else if !isTimeout {
return reconcile.Result{}, nil
}
}

patch := []byte(`{"spec":{"started": false}}`)
@@ -181,8 +189,16 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
clusterWorkspace := workspace.DeepCopy()
timingInfo := map[string]string{}
timing.SetTime(timingInfo, timing.DevWorkspaceStarted)

defer func() (reconcile.Result, error) {
r.syncTimingToCluster(ctx, clusterWorkspace, timingInfo, reqLogger)
// Don't accidentally suppress errors by overwriting here; only check for timeout when no error
// encountered in main reconcile loop.
if err == nil {
if timeoutErr := checkForStartTimeout(clusterWorkspace); timeoutErr != nil {
reconcileResult, err = r.failWorkspace(workspace, timeoutErr.Error(), reqLogger, &reconcileStatus)
}
}
return r.updateWorkspaceStatus(clusterWorkspace, reqLogger, &reconcileStatus, reconcileResult, err)
}()
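
The deferred closure works because Reconcile uses named return values, which a defer may read and overwrite after the body returns. A standalone sketch of the pattern (checkTimeout is a hypothetical stand-in for checkForStartTimeout):

package main

import (
	"errors"
	"fmt"
)

// checkTimeout is a hypothetical stand-in for checkForStartTimeout.
func checkTimeout() error { return errors.New("timed out") }

// run mirrors the deferred-closure pattern above: the defer inspects and
// overwrites the named return values after the body has returned.
func run() (result string, err error) {
	defer func() {
		// Only overwrite when the body succeeded, so a real error from
		// the main path is never masked by the timeout check.
		if err == nil {
			if timeoutErr := checkTimeout(); timeoutErr != nil {
				result, err = "failed", timeoutErr
			}
		}
	}()
	return "ok", nil
}

func main() {
	result, err := run()
	fmt.Println(result, err) // failed timed out
}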

@@ -286,6 +302,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
message = routingStatus.Message
}
reconcileStatus.setConditionFalse(dw.DevWorkspaceRoutingReady, message)
if !routingStatus.Requeue && routingStatus.Err == nil {
return reconcile.Result{RequeueAfter: startingWorkspaceRequeueInterval}, nil
}
return reconcile.Result{Requeue: routingStatus.Requeue}, routingStatus.Err
}
reconcileStatus.setConditionTrue(dw.DevWorkspaceRoutingReady, "Networking ready")
@@ -333,6 +352,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
// FailStartup is not possible for generating the serviceaccount
reqLogger.Info("Waiting for workspace ServiceAccount")
reconcileStatus.setConditionFalse(dw.DevWorkspaceServiceAccountReady, "Waiting for DevWorkspace ServiceAccount")
if !serviceAcctStatus.Requeue && serviceAcctStatus.Err == nil {
return reconcile.Result{RequeueAfter: startingWorkspaceRequeueInterval}, nil
}
return reconcile.Result{Requeue: serviceAcctStatus.Requeue}, serviceAcctStatus.Err
}
serviceAcctName := serviceAcctStatus.ServiceAccountName
@@ -341,6 +363,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
pullSecretStatus := wsprovision.PullSecrets(clusterAPI, serviceAcctName, workspace.GetNamespace())
if !pullSecretStatus.Continue {
reconcileStatus.setConditionFalse(conditions.PullSecretsReady, "Waiting for DevWorkspace pull secrets")
if !pullSecretStatus.Requeue && pullSecretStatus.Err == nil {
return reconcile.Result{RequeueAfter: startingWorkspaceRequeueInterval}, nil
}
return reconcile.Result{Requeue: pullSecretStatus.Requeue}, pullSecretStatus.Err
}
allPodAdditions = append(allPodAdditions, pullSecretStatus.PodAdditions)
@@ -355,6 +380,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
}
reqLogger.Info("Waiting on deployment to be ready")
reconcileStatus.setConditionFalse(conditions.DeploymentReady, "Waiting for workspace deployment")
if !deploymentStatus.Requeue && deploymentStatus.Err == nil {
return reconcile.Result{RequeueAfter: startingWorkspaceRequeueInterval}, nil
Member:

Does the last or the first interval win here?

  • I mean, we wait for the deployment initially and schedule a reconcile in 5 seconds (1);
  • the Deployment creates pods, which invokes the reconcile loop, and we schedule again in 5 seconds (2);
  • the Pod is updated several times as container statuses propagate, so we get scheduled N more times in 5 seconds (N).

The question is: how many reconcile loops will our RequeueAfter calls initiate?

Collaborator Author:

My understanding is that reconciling due to an event cancels out a requeueAfter, but it's hard to check for sure. In my testing, I don't see bursts of "waiting on deployment" in the logs, which is what would likely happen if we were stacking requeues.

}
return reconcile.Result{Requeue: deploymentStatus.Requeue}, deploymentStatus.Err
}
reconcileStatus.setConditionTrue(conditions.DeploymentReady, "DevWorkspace deployment ready")
53 changes: 53 additions & 0 deletions controllers/workspace/status.go
@@ -19,6 +19,7 @@ import (
"net/http"
"net/url"
"sort"
"time"

dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"

@@ -31,6 +32,7 @@ import (
"github.com/devfile/devworkspace-operator/apis/controller/v1alpha1"
"github.com/devfile/devworkspace-operator/controllers/workspace/metrics"
"github.com/devfile/devworkspace-operator/pkg/conditions"
"github.com/devfile/devworkspace-operator/pkg/config"
wsprovision "github.com/devfile/devworkspace-operator/pkg/provision/workspace"
)

@@ -238,3 +240,54 @@ func updateMetricsForPhase(workspace *dw.DevWorkspace, oldPhase, newPhase dw.Dev
metrics.WorkspaceStarted(workspace, logger)
}
}

// checkForStartTimeout checks if the provided workspace has not progressed for longer than the configured
// startup timeout. This is determined by checking to see if the last condition transition time is more
// than [timeout] duration ago. Workspaces that are not in the "Starting" phase cannot time out. Returns
// an error with message when timeout is reached.
func checkForStartTimeout(workspace *dw.DevWorkspace) error {
if workspace.Status.Phase != dw.DevWorkspaceStatusStarting {
return nil
}
timeout, err := time.ParseDuration(config.Workspace.ProgressTimeout)
if err != nil {
return fmt.Errorf("invalid duration specified for timeout: %w", err)
}
currTime := clock.Now()
lastUpdateTime := time.Time{}
for _, condition := range workspace.Status.Conditions {
if condition.LastTransitionTime.Time.After(lastUpdateTime) {
lastUpdateTime = condition.LastTransitionTime.Time
}
}
if !lastUpdateTime.IsZero() && lastUpdateTime.Add(timeout).Before(currTime) {
return fmt.Errorf("devworkspace failed to progress past phase '%s' for longer than timeout (%s)",
workspace.Status.Phase, config.Workspace.ProgressTimeout)
}
return nil
}

// checkForFailingTimeout checks that the current workspace has not been in the "Failing" state for longer than the
// configured progress timeout. If the workspace is not in the Failing state or does not have a DevWorkspaceFailed
// condition set, returns false. Otherwise, returns true if the workspace has timed out. Returns an error if
// timeout is configured with an unparsable duration.
func checkForFailingTimeout(workspace *dw.DevWorkspace) (isTimedOut bool, err error) {
if workspace.Status.Phase != devworkspacePhaseFailing {
return false, nil
}
timeout, err := time.ParseDuration(config.Workspace.ProgressTimeout)
if err != nil {
return false, fmt.Errorf("invalid duration specified for timeout: %w", err)
}
currTime := clock.Now()
failedTime := time.Time{}
for _, condition := range workspace.Status.Conditions {
if condition.Type == dw.DevWorkspaceFailedStart {
failedTime = condition.LastTransitionTime.Time
}
}
if !failedTime.IsZero() && failedTime.Add(timeout).Before(currTime) {
return true, nil
}
return false, nil
}
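
Both helpers reduce to the same comparison: the most recent relevant condition transition time, plus the configured timeout, must still be in the future. A self-contained sketch of that arithmetic (made-up timestamps, not operator code):

package main

import (
	"fmt"
	"time"
)

func main() {
	timeout, _ := time.ParseDuration("5m") // the default ProgressTimeout
	now := time.Now()

	// A workspace whose newest condition transitioned 10 minutes ago has
	// exceeded the 5-minute budget.
	lastUpdate := now.Add(-10 * time.Minute)
	fmt.Println(lastUpdate.Add(timeout).Before(now)) // true: timed out

	// One that progressed 2 minutes ago has not.
	lastUpdate = now.Add(-2 * time.Minute)
	fmt.Println(lastUpdate.Add(timeout).Before(now)) // false: keep waiting
}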


8 changes: 8 additions & 0 deletions deploy/deployment/kubernetes/combined.yaml

8 changes: 8 additions & 0 deletions deploy/deployment/openshift/combined.yaml

1 change: 1 addition & 0 deletions pkg/config/defaults.go
@@ -24,5 +24,6 @@ var DefaultConfig = &v1alpha1.OperatorConfiguration{
ImagePullPolicy: "Always",
PVCName: "claim-devworkspace",
IdleTimeout: "15m",
ProgressTimeout: "5m",
},
}
3 changes: 3 additions & 0 deletions pkg/config/sync.go
@@ -205,6 +205,9 @@ func mergeConfig(from, to *controller.OperatorConfiguration) {
if from.Workspace.IdleTimeout != "" {
to.Workspace.IdleTimeout = from.Workspace.IdleTimeout
}
if from.Workspace.ProgressTimeout != "" {
to.Workspace.ProgressTimeout = from.Workspace.ProgressTimeout
}
if from.Workspace.IgnoredUnrecoverableEvents != nil {
to.Workspace.IgnoredUnrecoverableEvents = from.Workspace.IgnoredUnrecoverableEvents
}
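
As with IdleTimeout, the empty string is treated as "unset", so a partial custom config only overrides the fields it actually specifies. A stripped-down sketch of the merge semantics (simplified types, not the real API):

package main

import "fmt"

// workspaceConfig is a simplified stand-in for the operator's WorkspaceConfig;
// the real type has more fields and lives in the controller API package.
type workspaceConfig struct {
	IdleTimeout     string
	ProgressTimeout string
}

// merge mirrors the semantics above: only non-empty fields in the custom
// config override the corresponding defaults.
func merge(from, to *workspaceConfig) {
	if from.IdleTimeout != "" {
		to.IdleTimeout = from.IdleTimeout
	}
	if from.ProgressTimeout != "" {
		to.ProgressTimeout = from.ProgressTimeout
	}
}

func main() {
	config := &workspaceConfig{IdleTimeout: "15m", ProgressTimeout: "5m"} // defaults
	custom := &workspaceConfig{ProgressTimeout: "10m"}                    // IdleTimeout unset

	merge(custom, config)
	fmt.Println(*config) // {15m 10m}: only ProgressTimeout was overridden
}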