[ws-manager-mk2] Workspace heartbeats and timeouts #16122


Closed
wants to merge 1 commit into from
3 changes: 3 additions & 0 deletions components/ws-manager-api/go/config/config.go
@@ -125,6 +125,9 @@ type Configuration struct {
WorkspaceClasses map[string]*WorkspaceClass `json:"workspaceClass"`
// DebugWorkspacePod adds extra finalizer to workspace to prevent it from shutting down. Helps to debug.
DebugWorkspacePod bool `json:"debugWorkspacePod,omitempty"`
// TimeoutMaxConcurrentReconciles configures the maximum number of concurrent workspace
// reconciliations performed by the timeout controller.
TimeoutMaxConcurrentReconciles int `json:"timeoutMaxConcurrentReconciles,omitempty"`
}

type WorkspaceClass struct {
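For illustration, here is a minimal, self-contained sketch (not part of this diff) of how the new field is picked up from the ws-manager JSON configuration via its struct tag; the trimmed-down struct and the value 4 are assumptions for the example:

package main

import (
	"encoding/json"
	"fmt"
)

// configuration is a trimmed-down stand-in for the Configuration struct above,
// kept only to demonstrate the JSON tag.
type configuration struct {
	TimeoutMaxConcurrentReconciles int `json:"timeoutMaxConcurrentReconciles,omitempty"`
}

func main() {
	var cfg configuration
	// Illustrative config snippet; a value <= 0 falls back to 1 in SetupWithManager (see below).
	_ = json.Unmarshal([]byte(`{"timeoutMaxConcurrentReconciles": 4}`), &cfg)
	fmt.Println(cfg.TimeoutMaxConcurrentReconciles) // prints 4
}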
5 changes: 4 additions & 1 deletion components/ws-manager-api/go/crd/v1/workspace_types.go
@@ -145,7 +145,10 @@ const (
WorkspaceConditionTimeout WorkspaceCondition = "Timeout"

// FirstUserActivity is the time when MarkActive was first called on the workspace
-	WorkspaceConditionUserActivity WorkspaceCondition = "UserActivity"
+	WorkspaceConditionFirstUserActivity WorkspaceCondition = "FirstUserActivity"

// Closed indicates that a workspace is marked as closed. This will shorten its timeout.
WorkspaceConditionClosed WorkspaceCondition = "Closed"

// HeadlessTaskFailed indicates that a headless workspace task failed
WorkspaceConditionsHeadlessTaskFailed WorkspaceCondition = "HeadlessTaskFailed"
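The Closed condition is consumed by the timeout controller below, where it switches a running workspace to the shorter AfterClose timeout. The component that sets the condition is not part of this diff; a minimal sketch of setting it, assuming the standard apimachinery condition helpers (the markClosed helper and the Reason string are hypothetical):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// markClosed is a hypothetical helper showing how the Closed condition could be
// set on a workspace's status conditions.
func markClosed(conds *[]metav1.Condition) {
	meta.SetStatusCondition(conds, metav1.Condition{
		Type:   "Closed", // i.e. string(WorkspaceConditionClosed)
		Status: metav1.ConditionTrue,
		Reason: "WorkspaceClosed", // hypothetical reason
	})
}

func main() {
	var conds []metav1.Condition
	markClosed(&conds)
	fmt.Println(meta.IsStatusConditionTrue(conds, "Closed")) // prints true
}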
5 changes: 5 additions & 0 deletions components/ws-manager-mk2/controllers/status.go
@@ -287,3 +287,8 @@ func isPodBeingDeleted(pod *corev1.Pod) bool {
// if the pod is being deleted the only marker we have is that the deletionTimestamp is set
return pod.ObjectMeta.DeletionTimestamp != nil
}

// isWorkspaceBeingDeleted returns true if the workspace resource is currently being deleted.
func isWorkspaceBeingDeleted(ws *workspacev1.Workspace) bool {
return ws.ObjectMeta.DeletionTimestamp != nil
}
22 changes: 22 additions & 0 deletions components/ws-manager-mk2/controllers/suite_test.go
@@ -8,6 +8,7 @@ import (
"context"
"path/filepath"
"testing"
"time"

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
@@ -19,6 +20,8 @@ import (
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"

"github.com/gitpod-io/gitpod/common-go/util"
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
"github.com/gitpod-io/gitpod/ws-manager/api/config"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
//+kubebuilder:scaffold:imports
@@ -103,6 +106,25 @@ var _ = BeforeSuite(func() {
}).SetupWithManager(k8sManager)
Expect(err).ToNot(HaveOccurred())

activity := activity.WorkspaceActivity{}
timeoutReconciler, err := NewTimeoutReconciler(k8sManager.GetClient(), config.Configuration{
Timeouts: config.WorkspaceTimeoutConfiguration{
TotalStartup: util.Duration(1 * time.Minute),
Initialization: util.Duration(1 * time.Minute),
RegularWorkspace: util.Duration(1 * time.Minute),
MaxLifetime: util.Duration(1 * time.Minute),
HeadlessWorkspace: util.Duration(1 * time.Minute),
AfterClose: util.Duration(1 * time.Minute),
ContentFinalization: util.Duration(1 * time.Minute),
Stopping: util.Duration(1 * time.Minute),
Interrupted: util.Duration(1 * time.Minute),
},
TimeoutMaxConcurrentReconciles: 2,
}, &activity)
Expect(err).ToNot(HaveOccurred())
err = timeoutReconciler.SetupWithManager(k8sManager)
Expect(err).ToNot(HaveOccurred())

ctx, cancel = context.WithCancel(context.TODO())

go func() {
240 changes: 240 additions & 0 deletions components/ws-manager-mk2/controllers/timeout_controller.go
@@ -0,0 +1,240 @@
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package controllers

import (
"context"
"fmt"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/gitpod-io/gitpod/common-go/util"
wsactivity "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
config "github.com/gitpod-io/gitpod/ws-manager/api/config"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

func NewTimeoutReconciler(c client.Client, cfg config.Configuration, activity *wsactivity.WorkspaceActivity) (*TimeoutReconciler, error) {
reconcileInterval := time.Duration(cfg.HeartbeatInterval)
// The reconcile interval is half the heartbeat interval to catch timed-out workspaces in time.
// See https://en.wikipedia.org/wiki/Nyquist%E2%80%93Shannon_sampling_theorem for why this is needed.
reconcileInterval /= 2

return &TimeoutReconciler{
Client: c,
Config: cfg,
activity: activity,
reconcileInterval: reconcileInterval,
ctrlStartTime: time.Now().UTC(),
}, nil
}
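For example, with a heartbeat interval of 30 seconds (illustrative; the actual value comes from the ws-manager configuration), every workspace gets requeued at least every 15 seconds, so a timeout is detected at most half a heartbeat interval after it occurs.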

// TimeoutReconciler reconciles workspace timeouts. This is a separate reconciler because it
// always requeues events for existing workspaces, so that timeouts are checked at least once
// per reconcile interval. The reconcile loop should therefore stay lightweight, as it
// repeatedly reconciles every workspace in the cluster.
type TimeoutReconciler struct {
client.Client

Config config.Configuration
activity *wsactivity.WorkspaceActivity
reconcileInterval time.Duration
ctrlStartTime time.Time
}

//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch

// Reconcile will check the given workspace for timing out. When done, a new event gets
// requeued automatically to ensure the workspace gets reconciled at least every reconcileInterval.
func (r *TimeoutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
log := log.FromContext(ctx).WithValues("ws", req.NamespacedName)

var workspace workspacev1.Workspace
if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
if !apierrors.IsNotFound(err) {
log.Error(err, "unable to fetch workspace")
}
// We'll ignore not-found errors, since they can't be fixed by an immediate
// requeue (we'll need to wait for a new notification), and we can get them
// on deleted requests.
// On any other error, let the controller requeue an event with exponential
// backoff.
return ctrl.Result{}, client.IgnoreNotFound(err)
}

if conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) {
// Workspace has already been marked as timed out.
// Return and don't requeue another reconciliation.
return ctrl.Result{}, nil
}

// The workspace hasn't timed out yet. After this point, we always
// want to requeue a reconciliation after the configured interval.
defer func() {
result.RequeueAfter = r.reconcileInterval
}()

timedout, err := r.isWorkspaceTimedOut(&workspace)
if err != nil {
log.Error(err, "failed to check for workspace timeout")
return ctrl.Result{}, err
}

if timedout == "" {
// Hasn't timed out.
return ctrl.Result{}, nil
}

// Workspace timed out, set Timeout condition.
log.Info("Workspace timed out", "reason", timedout)
workspace.Status.Conditions = AddUniqueCondition(workspace.Status.Conditions, metav1.Condition{
Type: string(workspacev1.WorkspaceConditionTimeout),
Status: metav1.ConditionTrue,
LastTransitionTime: metav1.Now(),
Reason: "TimedOut",
Message: timedout,
})

if err = r.Client.Status().Update(ctx, &workspace); err != nil {
log.Error(err, "Failed to update workspace status with Timeout condition")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
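conditionPresentAndTrue and AddUniqueCondition are package helpers that are not shown in this diff; a minimal sketch of the behavior the reconciler relies on, expressed with the upstream apimachinery helpers (the actual implementations may differ in detail):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// conditionPresentAndTrue: true iff a condition of the given type exists and is True.
func conditionPresentAndTrue(conds []metav1.Condition, tpe string) bool {
	return meta.IsStatusConditionTrue(conds, tpe)
}

// AddUniqueCondition: add the condition, replacing any existing one of the same Type.
func AddUniqueCondition(conds []metav1.Condition, c metav1.Condition) []metav1.Condition {
	meta.SetStatusCondition(&conds, c)
	return conds
}

func main() {
	conds := AddUniqueCondition(nil, metav1.Condition{Type: "Timeout", Status: metav1.ConditionTrue, Reason: "TimedOut"})
	fmt.Println(conditionPresentAndTrue(conds, "Timeout")) // prints true
}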

type timeoutActivity string

const (
activityInit timeoutActivity = "initialization"
activityStartup timeoutActivity = "startup"
activityCreatingContainers timeoutActivity = "creating containers"
activityPullingImages timeoutActivity = "pulling images"
activityRunningHeadless timeoutActivity = "running the headless workspace"
activityNone timeoutActivity = "period of inactivity"
activityMaxLifetime timeoutActivity = "maximum lifetime"
activityClosed timeoutActivity = "after being closed"
activityInterrupted timeoutActivity = "workspace interruption"
activityStopping timeoutActivity = "stopping"
activityBackup timeoutActivity = "backup"
)

// isWorkspaceTimedOut determines if a workspace is timed out based on the manager configuration and state the pod is in.
// This function does NOT use the Timeout condition, but rather is used to set that condition in the first place.
func (r *TimeoutReconciler) isWorkspaceTimedOut(ws *workspacev1.Workspace) (reason string, err error) {
timeouts := r.Config.Timeouts
phase := ws.Status.Phase

decide := func(start time.Time, timeout util.Duration, activity timeoutActivity) (string, error) {
td := time.Duration(timeout)
inactivity := time.Since(start)
if inactivity < td {
return "", nil
}

return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td)), nil
}

start := ws.ObjectMeta.CreationTimestamp.Time
lastActivity := r.activity.GetLastActivity(ws.Name)
isClosed := conditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionClosed))

switch phase {
case workspacev1.WorkspacePhasePending:
return decide(start, timeouts.Initialization, activityInit)

case workspacev1.WorkspacePhaseInitializing:
return decide(start, timeouts.TotalStartup, activityStartup)

case workspacev1.WorkspacePhaseCreating:
activity := activityCreatingContainers
// TODO:
// if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
// activity = activityPullingImages
// }
return decide(start, timeouts.TotalStartup, activity)

case workspacev1.WorkspacePhaseRunning:
// First check is always for the max lifetime
if msg, err := decide(start, timeouts.MaxLifetime, activityMaxLifetime); msg != "" {
return msg, err
}

timeout := timeouts.RegularWorkspace
if ctv := ws.Spec.Timeout.Time; ctv != nil {
timeout = util.Duration(ctv.Duration)
}
activity := activityNone
if ws.Status.Headless {
timeout = timeouts.HeadlessWorkspace
lastActivity = &start
activity = activityRunningHeadless
} else if lastActivity == nil {
// The workspace is up and running, but the user has never produced any activity, OR the controller
// has restarted and not yet received a heartbeat for this workspace (since heartbeats are stored
// in-memory and reset on restart).
// First check whether the controller has restarted during this workspace's lifetime.
// If it has, use the FirstUserActivity condition to determine whether there had already been any user activity
// before the controller restart.
// If the controller started before the workspace, then the user hasn't produced any activity yet.
if r.ctrlStartTime.After(start) && conditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFirstUserActivity)) {
// The controller restarted during this workspace's lifetime, and the workspace has had activity before the restart,
// so the last activity has been lost on restart. Therefore, "reset" the timeout and measure only since the controller startup time.
start = r.ctrlStartTime
} else {
// This workspace hasn't had any user activity yet (also not before a potential controller restart).
// So check for a startup timeout, and measure since workspace creation time.
timeout = timeouts.TotalStartup
}
return decide(start, timeout, activityNone)
} else if isClosed {
return decide(*lastActivity, timeouts.AfterClose, activityClosed)
}
return decide(*lastActivity, timeout, activity)

case workspacev1.WorkspacePhaseStopping:
if isWorkspaceBeingDeleted(ws) && conditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
// We basically don't expect a workspace to be in content finalization before it's been deleted.
return decide(ws.DeletionTimestamp.Time, timeouts.ContentFinalization, activityBackup)
} else if !isWorkspaceBeingDeleted(ws) {
// workspaces that are not being deleted never time out in the Stopping phase
return "", nil
} else {
return decide(ws.DeletionTimestamp.Time, timeouts.Stopping, activityStopping)
}

default:
// The only other phase we can be in is Stopped, which is pointless to time out.
return "", nil
}
}
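As a worked example: for a running, non-headless workspace with RegularWorkspace set to 30 minutes and the last recorded activity 45 minutes ago, decide computes an inactivity of 45m, which exceeds the 30m timeout, and returns "workspace timed out after period of inactivity (00h45m) took longer than 00h30m", which Reconcile then records as the Timeout condition's message.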

func formatDuration(d time.Duration) string {
d = d.Round(time.Minute)
h := d / time.Hour
d -= h * time.Hour
m := d / time.Minute
return fmt.Sprintf("%02dh%02dm", h, m)
}

// SetupWithManager sets up the controller with the Manager.
func (r *TimeoutReconciler) SetupWithManager(mgr ctrl.Manager) error {
maxConcurrentReconciles := r.Config.TimeoutMaxConcurrentReconciles
if maxConcurrentReconciles <= 0 {
maxConcurrentReconciles = 1
}

return ctrl.NewControllerManagedBy(mgr).
WithOptions(controller.Options{MaxConcurrentReconciles: maxConcurrentReconciles}).
For(&workspacev1.Workspace{}).
Complete(r)
}
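For completeness, a sketch of how this reconciler could be registered alongside the other controllers in the manager's entry point; the registerTimeoutController helper and the surrounding wiring are assumptions, mirroring the test setup in suite_test.go above:

import (
	ctrl "sigs.k8s.io/controller-runtime"

	"github.com/gitpod-io/gitpod/ws-manager-mk2/controllers"
	wsactivity "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
	"github.com/gitpod-io/gitpod/ws-manager/api/config"
)

// registerTimeoutController is a hypothetical helper; the real entry-point wiring
// is not part of this diff.
func registerTimeoutController(mgr ctrl.Manager, cfg config.Configuration) error {
	// WorkspaceActivity stores last-activity timestamps in memory (see pkg/activity).
	act := &wsactivity.WorkspaceActivity{}
	r, err := controllers.NewTimeoutReconciler(mgr.GetClient(), cfg, act)
	if err != nil {
		return err
	}
	return r.SetupWithManager(mgr)
}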