[ws-manager-mk2] Workspace heartbeats and timeouts #16122


Closed
wants to merge 1 commit into from
3 changes: 3 additions & 0 deletions components/ws-manager-api/go/config/config.go
@@ -125,6 +125,9 @@ type Configuration struct {
WorkspaceClasses map[string]*WorkspaceClass `json:"workspaceClass"`
// DebugWorkspacePod adds extra finalizer to workspace to prevent it from shutting down. Helps to debug.
DebugWorkspacePod bool `json:"debugWorkspacePod,omitempty"`
// TimeoutMaxConcurrentReconciles configures the maximum number of concurrent workspace
// reconciliations performed by the timeout controller.
TimeoutMaxConcurrentReconciles int `json:"timeoutMaxConcurrentReconciles,omitempty"`
}

type WorkspaceClass struct {
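For illustration, here is a minimal, self-contained sketch (not part of this diff) of how the new field is picked up from the ws-manager JSON configuration via its struct tag; the trimmed-down struct and the value 4 are assumptions for the example:

package main

import (
	"encoding/json"
	"fmt"
)

// configuration is a trimmed-down stand-in for the Configuration struct above,
// kept only to demonstrate the JSON tag.
type configuration struct {
	TimeoutMaxConcurrentReconciles int `json:"timeoutMaxConcurrentReconciles,omitempty"`
}

func main() {
	var cfg configuration
	// Illustrative config snippet; a value <= 0 falls back to 1 in SetupWithManager (see below).
	_ = json.Unmarshal([]byte(`{"timeoutMaxConcurrentReconciles": 4}`), &cfg)
	fmt.Println(cfg.TimeoutMaxConcurrentReconciles) // prints 4
}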
5 changes: 4 additions & 1 deletion components/ws-manager-api/go/crd/v1/workspace_types.go
@@ -145,7 +145,10 @@ const (
WorkspaceConditionTimeout WorkspaceCondition = "Timeout"

// FirstUserActivity is the time when MarkActive was first called on the workspace
-	WorkspaceConditionUserActivity WorkspaceCondition = "UserActivity"
+	WorkspaceConditionFirstUserActivity WorkspaceCondition = "FirstUserActivity"

// Closed indicates that a workspace is marked as closed. This will shorten its timeout.
WorkspaceConditionClosed WorkspaceCondition = "Closed"

// HeadlessTaskFailed indicates that a headless workspace task failed
WorkspaceConditionsHeadlessTaskFailed WorkspaceCondition = "HeadlessTaskFailed"
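The Closed condition is consumed by the timeout controller below, where it switches a running workspace to the shorter AfterClose timeout. The component that sets the condition is not part of this diff; a minimal sketch of setting it, assuming the standard apimachinery condition helpers (the markClosed helper and the Reason string are hypothetical):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// markClosed is a hypothetical helper showing how the Closed condition could be
// set on a workspace's status conditions.
func markClosed(conds *[]metav1.Condition) {
	meta.SetStatusCondition(conds, metav1.Condition{
		Type:   "Closed", // i.e. string(WorkspaceConditionClosed)
		Status: metav1.ConditionTrue,
		Reason: "WorkspaceClosed", // hypothetical reason
	})
}

func main() {
	var conds []metav1.Condition
	markClosed(&conds)
	fmt.Println(meta.IsStatusConditionTrue(conds, "Closed")) // prints true
}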
5 changes: 5 additions & 0 deletions components/ws-manager-mk2/controllers/status.go
@@ -287,3 +287,8 @@ func isPodBeingDeleted(pod *corev1.Pod) bool {
// if the pod is being deleted the only marker we have is that the deletionTimestamp is set
return pod.ObjectMeta.DeletionTimestamp != nil
}

// isWorkspaceBeingDeleted returns true if the workspace resource is currently being deleted.
func isWorkspaceBeingDeleted(ws *workspacev1.Workspace) bool {
return ws.ObjectMeta.DeletionTimestamp != nil
}
22 changes: 22 additions & 0 deletions components/ws-manager-mk2/controllers/suite_test.go
@@ -8,6 +8,7 @@ import (
"context"
"path/filepath"
"testing"
"time"

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
@@ -19,6 +20,8 @@ import (
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"

"github.com/gitpod-io/gitpod/common-go/util"
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
"github.com/gitpod-io/gitpod/ws-manager/api/config"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
//+kubebuilder:scaffold:imports
@@ -103,6 +106,25 @@ var _ = BeforeSuite(func() {
}).SetupWithManager(k8sManager)
Expect(err).ToNot(HaveOccurred())

activity := activity.WorkspaceActivity{}
timeoutReconciler, err := NewTimeoutReconciler(k8sManager.GetClient(), config.Configuration{
Timeouts: config.WorkspaceTimeoutConfiguration{
TotalStartup: util.Duration(1 * time.Minute),
Initialization: util.Duration(1 * time.Minute),
RegularWorkspace: util.Duration(1 * time.Minute),
MaxLifetime: util.Duration(1 * time.Minute),
HeadlessWorkspace: util.Duration(1 * time.Minute),
AfterClose: util.Duration(1 * time.Minute),
ContentFinalization: util.Duration(1 * time.Minute),
Stopping: util.Duration(1 * time.Minute),
Interrupted: util.Duration(1 * time.Minute),
},
TimeoutMaxConcurrentReconciles: 2,
}, &activity)
Expect(err).ToNot(HaveOccurred())
err = timeoutReconciler.SetupWithManager(k8sManager)
Expect(err).ToNot(HaveOccurred())

ctx, cancel = context.WithCancel(context.TODO())

go func() {
240 changes: 240 additions & 0 deletions components/ws-manager-mk2/controllers/timeout_controller.go
@@ -0,0 +1,240 @@
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package controllers

import (
"context"
"fmt"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/gitpod-io/gitpod/common-go/util"
wsactivity "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
config "github.com/gitpod-io/gitpod/ws-manager/api/config"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

func NewTimeoutReconciler(c client.Client, cfg config.Configuration, activity *wsactivity.WorkspaceActivity) (*TimeoutReconciler, error) {
reconcileInterval := time.Duration(cfg.HeartbeatInterval)
// The reconcile interval is half the heartbeat interval to catch timed-out workspaces in time.
// See https://en.wikipedia.org/wiki/Nyquist%E2%80%93Shannon_sampling_theorem for why this is needed.
reconcileInterval /= 2

return &TimeoutReconciler{
Client: c,
Config: cfg,
activity: activity,
reconcileInterval: reconcileInterval,
ctrlStartTime: time.Now().UTC(),
}, nil
}
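For example, with a heartbeat interval of 30 seconds (illustrative; the actual value comes from the ws-manager configuration), every workspace gets requeued at least every 15 seconds, so a timeout is detected at most half a heartbeat interval after it occurs.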

// TimeoutReconciler reconciles workspace timeouts. This is a separate reconciler because it
// always requeues events for existing workspaces, so that timeouts are checked at least once
// per reconcile interval. The reconcile loop should therefore stay lightweight, as it
// repeatedly reconciles every workspace in the cluster.
type TimeoutReconciler struct {
client.Client

Config config.Configuration
activity *wsactivity.WorkspaceActivity
reconcileInterval time.Duration
ctrlStartTime time.Time
}

//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch

// Reconcile will check the given workspace for timing out. When done, a new event gets
// requeued automatically to ensure the workspace gets reconciled at least every reconcileInterval.
func (r *TimeoutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
log := log.FromContext(ctx).WithValues("ws", req.NamespacedName)

var workspace workspacev1.Workspace
if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
if !apierrors.IsNotFound(err) {
log.Error(err, "unable to fetch workspace")
}
// We'll ignore not-found errors, since they can't be fixed by an immediate
// requeue (we'll need to wait for a new notification), and we can get them
// on deleted requests.
// On any other error, let the controller requeue an event with exponential
// backoff.
return ctrl.Result{}, client.IgnoreNotFound(err)
}

if conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) {
// Workspace has already been marked as timed out.
// Return and don't requeue another reconciliation.
return ctrl.Result{}, nil
}

// The workspace hasn't timed out yet. After this point, we always
// want to requeue a reconciliation after the configured interval.
defer func() {
result.RequeueAfter = r.reconcileInterval
}()

timedout, err := r.isWorkspaceTimedOut(&workspace)
if err != nil {
log.Error(err, "failed to check for workspace timeout")
return ctrl.Result{}, err
}

if timedout == "" {
// Hasn't timed out.
return ctrl.Result{}, nil
}

// Workspace timed out, set Timeout condition.
log.Info("Workspace timed out", "reason", timedout)
workspace.Status.Conditions = AddUniqueCondition(workspace.Status.Conditions, metav1.Condition{
Type: string(workspacev1.WorkspaceConditionTimeout),
Status: metav1.ConditionTrue,
LastTransitionTime: metav1.Now(),
Reason: "TimedOut",
Message: timedout,
})

if err = r.Client.Status().Update(ctx, &workspace); err != nil {
log.Error(err, "Failed to update workspace status with Timeout condition")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
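conditionPresentAndTrue and AddUniqueCondition are package helpers that are not shown in this diff; a minimal sketch of the behavior the reconciler relies on, expressed with the upstream apimachinery helpers (the actual implementations may differ in detail):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// conditionPresentAndTrue: true iff a condition of the given type exists and is True.
func conditionPresentAndTrue(conds []metav1.Condition, tpe string) bool {
	return meta.IsStatusConditionTrue(conds, tpe)
}

// AddUniqueCondition: add the condition, replacing any existing one of the same Type.
func AddUniqueCondition(conds []metav1.Condition, c metav1.Condition) []metav1.Condition {
	meta.SetStatusCondition(&conds, c)
	return conds
}

func main() {
	conds := AddUniqueCondition(nil, metav1.Condition{Type: "Timeout", Status: metav1.ConditionTrue, Reason: "TimedOut"})
	fmt.Println(conditionPresentAndTrue(conds, "Timeout")) // prints true
}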

type timeoutActivity string

const (
activityInit timeoutActivity = "initialization"
activityStartup timeoutActivity = "startup"
activityCreatingContainers timeoutActivity = "creating containers"
activityPullingImages timeoutActivity = "pulling images"
activityRunningHeadless timeoutActivity = "running the headless workspace"
activityNone timeoutActivity = "period of inactivity"
activityMaxLifetime timeoutActivity = "maximum lifetime"
activityClosed timeoutActivity = "after being closed"
activityInterrupted timeoutActivity = "workspace interruption"
activityStopping timeoutActivity = "stopping"
activityBackup timeoutActivity = "backup"
)

// isWorkspaceTimedOut determines if a workspace is timed out based on the manager configuration and state the pod is in.
// This function does NOT use the Timeout condition, but rather is used to set that condition in the first place.
func (r *TimeoutReconciler) isWorkspaceTimedOut(ws *workspacev1.Workspace) (reason string, err error) {
timeouts := r.Config.Timeouts
phase := ws.Status.Phase

decide := func(start time.Time, timeout util.Duration, activity timeoutActivity) (string, error) {
td := time.Duration(timeout)
inactivity := time.Since(start)
if inactivity < td {
return "", nil
}

return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td)), nil
}

start := ws.ObjectMeta.CreationTimestamp.Time
lastActivity := r.activity.GetLastActivity(ws.Name)
isClosed := conditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionClosed))

switch phase {
case workspacev1.WorkspacePhasePending:
return decide(start, timeouts.Initialization, activityInit)

case workspacev1.WorkspacePhaseInitializing:
return decide(start, timeouts.TotalStartup, activityStartup)

case workspacev1.WorkspacePhaseCreating:
activity := activityCreatingContainers
// TODO:
// if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
// activity = activityPullingImages
// }
return decide(start, timeouts.TotalStartup, activity)

case workspacev1.WorkspacePhaseRunning:
// First check is always for the max lifetime
if msg, err := decide(start, timeouts.MaxLifetime, activityMaxLifetime); msg != "" {
return msg, err
}

timeout := timeouts.RegularWorkspace
if ctv := ws.Spec.Timeout.Time; ctv != nil {
timeout = util.Duration(ctv.Duration)
}
activity := activityNone
if ws.Status.Headless {
timeout = timeouts.HeadlessWorkspace
lastActivity = &start
activity = activityRunningHeadless
} else if lastActivity == nil {
// The workspace is up and running, but the user has never produced any activity, OR the controller
// has restarted and not yet received a heartbeat for this workspace (since heartbeats are stored
// in-memory and reset on restart).
// First check whether the controller has restarted during this workspace's lifetime.
// If it has, use the FirstUserActivity condition to determine whether there had already been any user activity
// before the controller restart.
// If the controller started before the workspace, then the user hasn't produced any activity yet.
if r.ctrlStartTime.After(start) && conditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFirstUserActivity)) {
// The controller restarted during this workspace's lifetime, and the workspace has had activity before the restart,
// so the last activity has been lost on restart. Therefore, "reset" the timeout and measure only since the controller startup time.
start = r.ctrlStartTime
} else {
// This workspace hasn't had any user activity yet (also not before a potential controller restart).
// So check for a startup timeout, and measure since workspace creation time.
timeout = timeouts.TotalStartup
}
return decide(start, timeout, activityNone)
} else if isClosed {
return decide(*lastActivity, timeouts.AfterClose, activityClosed)
}
return decide(*lastActivity, timeout, activity)

case workspacev1.WorkspacePhaseStopping:
if isWorkspaceBeingDeleted(ws) && conditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
// We basically don't expect a workspace to be in content finalization before it's been deleted.
return decide(ws.DeletionTimestamp.Time, timeouts.ContentFinalization, activityBackup)
} else if !isWorkspaceBeingDeleted(ws) {
// workspaces that are not being deleted never time out in the Stopping phase
return "", nil
} else {
return decide(ws.DeletionTimestamp.Time, timeouts.Stopping, activityStopping)
}

default:
// The only other phase we can be in is Stopped, which is pointless to time out.
return "", nil
}
}
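As a worked example: for a running, non-headless workspace with RegularWorkspace set to 30 minutes and the last recorded activity 45 minutes ago, decide computes an inactivity of 45m, which exceeds the 30m timeout, and returns "workspace timed out after period of inactivity (00h45m) took longer than 00h30m", which Reconcile then records as the Timeout condition's message.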

func formatDuration(d time.Duration) string {
d = d.Round(time.Minute)
h := d / time.Hour
d -= h * time.Hour
m := d / time.Minute
return fmt.Sprintf("%02dh%02dm", h, m)
}

// SetupWithManager sets up the controller with the Manager.
func (r *TimeoutReconciler) SetupWithManager(mgr ctrl.Manager) error {
maxConcurrentReconciles := r.Config.TimeoutMaxConcurrentReconciles
if maxConcurrentReconciles <= 0 {
maxConcurrentReconciles = 1
}

return ctrl.NewControllerManagedBy(mgr).
WithOptions(controller.Options{MaxConcurrentReconciles: maxConcurrentReconciles}).
For(&workspacev1.Workspace{}).
Complete(r)
}
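For completeness, a sketch of how this reconciler could be registered alongside the other controllers in the manager's entry point; the registerTimeoutController helper and the surrounding wiring are assumptions, mirroring the test setup in suite_test.go above:

import (
	ctrl "sigs.k8s.io/controller-runtime"

	"github.com/gitpod-io/gitpod/ws-manager-mk2/controllers"
	wsactivity "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
	"github.com/gitpod-io/gitpod/ws-manager/api/config"
)

// registerTimeoutController is a hypothetical helper; the real entry-point wiring
// is not part of this diff.
func registerTimeoutController(mgr ctrl.Manager, cfg config.Configuration) error {
	// WorkspaceActivity stores last-activity timestamps in memory (see pkg/activity).
	act := &wsactivity.WorkspaceActivity{}
	r, err := controllers.NewTimeoutReconciler(mgr.GetClient(), cfg, act)
	if err != nil {
		return err
	}
	return r.SetupWithManager(mgr)
}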