[ws-manager-mk2] Workspace timeouts #16209
Merged
components/ws-manager-mk2/controllers/timeout_controller.go
236 changes: 236 additions & 0 deletions
@@ -0,0 +1,236 @@
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package controllers

import (
	"context"
	"fmt"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/log"

	wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	"github.com/gitpod-io/gitpod/common-go/util"
	wsactivity "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
	config "github.com/gitpod-io/gitpod/ws-manager/api/config"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

func NewTimeoutReconciler(c client.Client, cfg config.Configuration, activity *wsactivity.WorkspaceActivity) (*TimeoutReconciler, error) {
	reconcileInterval := time.Duration(cfg.HeartbeatInterval)
	// The reconcile interval is half the heartbeat interval to catch timed-out workspaces in time.
	// See https://en.wikipedia.org/wiki/Nyquist%E2%80%93Shannon_sampling_theorem for why we need this.
	reconcileInterval /= 2

	return &TimeoutReconciler{
		Client:            c,
		Config:            cfg,
		activity:          activity,
		reconcileInterval: reconcileInterval,
		ctrlStartTime:     time.Now().UTC(),
	}, nil
}

// TimeoutReconciler reconciles workspace timeouts. This is a separate reconciler, as it
// always requeues events for existing workspaces such that timeouts are checked on (at least)
// a specified interval. The reconcile loop should therefore be lightweight, as it repeatedly
// reconciles all workspaces in the cluster.
type TimeoutReconciler struct {
	client.Client

	Config            config.Configuration
	activity          *wsactivity.WorkspaceActivity
	reconcileInterval time.Duration
	ctrlStartTime     time.Time
}

//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch

// Reconcile checks the given workspace for timing out. When done, a new event gets
// requeued automatically to ensure the workspace gets reconciled at least every reconcileInterval.
func (r *TimeoutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
	log := log.FromContext(ctx).WithValues("ws", req.NamespacedName)

	var workspace workspacev1.Workspace
	if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
		if !apierrors.IsNotFound(err) {
			log.Error(err, "unable to fetch workspace")
		}
		// We'll ignore not-found errors, since they can't be fixed by an immediate
		// requeue (we'll need to wait for a new notification), and we can get them
		// on deleted requests.
		// On any other error, let the controller requeue an event with exponential
		// backoff.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) {
		// The workspace has already been marked as timed out.
		// Return and don't requeue another reconciliation.
		return ctrl.Result{}, nil
	}

	// The workspace hasn't timed out yet. After this point, we always
	// want to requeue a reconciliation after the configured interval.
	defer func() {
		result.RequeueAfter = r.reconcileInterval
	}()

	timedout := r.isWorkspaceTimedOut(&workspace)
	if timedout == "" {
		// Hasn't timed out.
		return ctrl.Result{}, nil
	}

	// The workspace timed out, so set the Timeout condition.
	log.Info("Workspace timed out", "reason", timedout)
	workspace.Status.Conditions = wsk8s.AddUniqueCondition(workspace.Status.Conditions, metav1.Condition{
		Type:               string(workspacev1.WorkspaceConditionTimeout),
		Status:             metav1.ConditionTrue,
		LastTransitionTime: metav1.Now(),
		Reason:             "TimedOut",
		Message:            timedout,
	})

	if err = r.Client.Status().Update(ctx, &workspace); err != nil {
		log.Error(err, "Failed to update workspace status with Timeout condition")
		return ctrl.Result{}, err
	}
	return ctrl.Result{}, nil
}

type timeoutActivity string

const (
	activityInit               timeoutActivity = "initialization"
	activityStartup            timeoutActivity = "startup"
	activityCreatingContainers timeoutActivity = "creating containers"
	activityPullingImages      timeoutActivity = "pulling images"
	activityRunningHeadless    timeoutActivity = "running the headless workspace"
	activityNone               timeoutActivity = "period of inactivity"
	activityMaxLifetime        timeoutActivity = "maximum lifetime"
	activityClosed             timeoutActivity = "after being closed"
	activityInterrupted        timeoutActivity = "workspace interruption"
	activityStopping           timeoutActivity = "stopping"
	activityBackup             timeoutActivity = "backup"
)

// isWorkspaceTimedOut determines whether a workspace has timed out, based on the manager configuration and the state the pod is in.
// This function does NOT use the Timeout condition; rather, it is used to set that condition in the first place.
func (r *TimeoutReconciler) isWorkspaceTimedOut(ws *workspacev1.Workspace) (reason string) {
	timeouts := r.Config.Timeouts
	phase := ws.Status.Phase

	decide := func(start time.Time, timeout util.Duration, activity timeoutActivity) string {
		td := time.Duration(timeout)
		inactivity := time.Since(start)
		if inactivity < td {
			return ""
		}

		return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td))
	}

	start := ws.ObjectMeta.CreationTimestamp.Time
	lastActivity := r.activity.GetLastActivity(ws.Name)
	isClosed := wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionClosed))

	switch phase {
	case workspacev1.WorkspacePhasePending:
		return decide(start, timeouts.Initialization, activityInit)

	case workspacev1.WorkspacePhaseInitializing:
		return decide(start, timeouts.TotalStartup, activityStartup)

	case workspacev1.WorkspacePhaseCreating:
		activity := activityCreatingContainers
		// TODO:
		// if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
		// 	activity = activityPullingImages
		// }
		return decide(start, timeouts.TotalStartup, activity)

	case workspacev1.WorkspacePhaseRunning:
		// The first check is always for the max lifetime.
		if msg := decide(start, timeouts.MaxLifetime, activityMaxLifetime); msg != "" {
			return msg
		}

		timeout := timeouts.RegularWorkspace
		if customTimeout := ws.Spec.Timeout.Time; customTimeout != nil {
			timeout = util.Duration(customTimeout.Duration)
		}
		activity := activityNone
		if ws.Status.Headless {
			timeout = timeouts.HeadlessWorkspace
			lastActivity = &start
			activity = activityRunningHeadless
		} else if lastActivity == nil {
			// The workspace is up and running, but the user has never produced any activity, OR the controller
			// has restarted and not yet received a heartbeat for this workspace (since heartbeats are stored
			// in-memory and reset on restart).
			// First check whether the controller has restarted during this workspace's lifetime.
			// If it has, use the FirstUserActivity condition to determine whether there had already been any user activity
			// before the controller restart.
			// If the controller started before the workspace, then the user hasn't produced any activity yet.
			if r.ctrlStartTime.After(start) && wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFirstUserActivity)) {
				// The controller restarted during this workspace's lifetime, and the workspace had activity before the restart,
				// so the last activity has been lost on restart. Therefore, "reset" the timeout and measure only since the controller start time.
				start = r.ctrlStartTime
			} else {
				// This workspace hasn't had any user activity yet (also not before a potential controller restart).
				// So check for a startup timeout, measured since the workspace creation time.
				timeout = timeouts.TotalStartup
			}
			return decide(start, timeout, activityNone)
		} else if isClosed {
			return decide(*lastActivity, timeouts.AfterClose, activityClosed)
		}
		return decide(*lastActivity, timeout, activity)

	case workspacev1.WorkspacePhaseStopping:
		if isWorkspaceBeingDeleted(ws) && !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
			// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
			// We basically don't expect a workspace to be in content finalization before it's been deleted.
			return decide(ws.DeletionTimestamp.Time, timeouts.ContentFinalization, activityBackup)
		} else if !isWorkspaceBeingDeleted(ws) {
			// Workspaces that have not been deleted never time out while stopping.
			return ""
		} else {
			return decide(ws.DeletionTimestamp.Time, timeouts.Stopping, activityStopping)
		}

	default:
		// The only other phase we can be in is Stopped, which is pointless to time out.
		return ""
	}
}

func formatDuration(d time.Duration) string {
	d = d.Round(time.Minute)
	h := d / time.Hour
	d -= h * time.Hour
	m := d / time.Minute
	return fmt.Sprintf("%02dh%02dm", h, m)
}

// SetupWithManager sets up the controller with the Manager.
func (r *TimeoutReconciler) SetupWithManager(mgr ctrl.Manager) error {
	maxConcurrentReconciles := r.Config.TimeoutMaxConcurrentReconciles
	if maxConcurrentReconciles <= 0 {
		maxConcurrentReconciles = 1
	}

	return ctrl.NewControllerManagedBy(mgr).
		WithOptions(controller.Options{MaxConcurrentReconciles: maxConcurrentReconciles}).
		For(&workspacev1.Workspace{}).
		Complete(r)
}