Skip to content

Commit 0c60ae6

Browse files
committed
[ws-manager-mk2] Workspace timeouts
1 parent 2c8c58a commit 0c60ae6

File tree

6 files changed

+268
-4
lines changed

6 files changed

+268
-4
lines changed

components/ws-manager-api/go/config/config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,9 @@ type Configuration struct {
125125
WorkspaceClasses map[string]*WorkspaceClass `json:"workspaceClass"`
126126
// DebugWorkspacePod adds extra finalizer to workspace to prevent it from shutting down. Helps to debug.
127127
DebugWorkspacePod bool `json:"debugWorkspacePod,omitempty"`
128+
// TimeoutMaxConcurrentReconciles configures the max amount of concurrent workspace reconciliations on
129+
// the timeout controller.
130+
TimeoutMaxConcurrentReconciles int `json:"timeoutMaxConcurrentReconciles,omitempty"`
128131
}
129132

130133
type WorkspaceClass struct {

components/ws-manager-mk2/controllers/status.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,3 +310,8 @@ func isPodBeingDeleted(pod *corev1.Pod) bool {
310310
// if the pod is being deleted the only marker we have is that the deletionTimestamp is set
311311
return pod.ObjectMeta.DeletionTimestamp != nil
312312
}
313+
314+
// isWorkspaceBeingDeleted returns true if the workspace resource is currently being deleted.
315+
func isWorkspaceBeingDeleted(ws *workspacev1.Workspace) bool {
316+
return ws.ObjectMeta.DeletionTimestamp != nil
317+
}
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License-AGPL.txt in the project root for license information.
4+
5+
package controllers
6+
7+
import (
8+
"context"
9+
"fmt"
10+
"time"
11+
12+
apierrors "k8s.io/apimachinery/pkg/api/errors"
13+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
ctrl "sigs.k8s.io/controller-runtime"
15+
"sigs.k8s.io/controller-runtime/pkg/client"
16+
"sigs.k8s.io/controller-runtime/pkg/controller"
17+
"sigs.k8s.io/controller-runtime/pkg/log"
18+
19+
wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
20+
"github.com/gitpod-io/gitpod/common-go/util"
21+
wsactivity "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
22+
config "github.com/gitpod-io/gitpod/ws-manager/api/config"
23+
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
24+
)
25+
26+
func NewTimeoutReconciler(c client.Client, cfg config.Configuration, activity *wsactivity.WorkspaceActivity) (*TimeoutReconciler, error) {
27+
reconcileInterval := time.Duration(cfg.HeartbeatInterval)
28+
// Reconcile interval is half the heartbeat interval to catch timed out workspaces in time.
29+
// See https://en.wikipedia.org/wiki/Nyquist%E2%80%93Shannon_sampling_theorem why we need this.
30+
reconcileInterval /= 2
31+
32+
return &TimeoutReconciler{
33+
Client: c,
34+
Config: cfg,
35+
activity: activity,
36+
reconcileInterval: reconcileInterval,
37+
ctrlStartTime: time.Now().UTC(),
38+
}, nil
39+
}
40+
41+
// TimeoutReconciler reconciles workspace timeouts. This is a separate reconciler, as it
42+
// always requeues events for existing workspaces such that timeouts are checked on (at least)
43+
// a specified interval. The reconcile loop should therefore be light-weight as it's repeatedly
44+
// reconciling all workspaces in the cluster.
45+
type TimeoutReconciler struct {
46+
client.Client
47+
48+
Config config.Configuration
49+
activity *wsactivity.WorkspaceActivity
50+
reconcileInterval time.Duration
51+
ctrlStartTime time.Time
52+
}
53+
54+
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
55+
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch
56+
57+
// Reconcile will check the given workspace for timing out. When done, a new event gets
58+
// requeued automatically to ensure the workspace gets reconciled at least every reconcileInterval.
59+
func (r *TimeoutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
60+
log := log.FromContext(ctx).WithValues("ws", req.NamespacedName)
61+
62+
var workspace workspacev1.Workspace
63+
if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
64+
if !apierrors.IsNotFound(err) {
65+
log.Error(err, "unable to fetch workspace")
66+
}
67+
// We'll ignore not-found errors, since they can't be fixed by an immediate
68+
// requeue (we'll need to wait for a new notification), and we can get them
69+
// on deleted requests.
70+
// On any other error, let the controller requeue an event with exponential
71+
// backoff.
72+
return ctrl.Result{}, client.IgnoreNotFound(err)
73+
}
74+
75+
if wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) {
76+
// Workspace has already been marked as timed out.
77+
// Return and don't requeue another reconciliation.
78+
return ctrl.Result{}, nil
79+
}
80+
81+
// The workspace hasn't timed out yet. After this point, we always
82+
// want to requeue a reconciliation after the configured interval.
83+
defer func() {
84+
result.RequeueAfter = r.reconcileInterval
85+
}()
86+
87+
timedout := r.isWorkspaceTimedOut(&workspace)
88+
if timedout == "" {
89+
// Hasn't timed out.
90+
return ctrl.Result{}, nil
91+
}
92+
93+
// Workspace timed out, set Timeout condition.
94+
log.Info("Workspace timed out", "reason", timedout)
95+
workspace.Status.Conditions = wsk8s.AddUniqueCondition(workspace.Status.Conditions, metav1.Condition{
96+
Type: string(workspacev1.WorkspaceConditionTimeout),
97+
Status: metav1.ConditionTrue,
98+
LastTransitionTime: metav1.Now(),
99+
Reason: "TimedOut",
100+
Message: timedout,
101+
})
102+
103+
if err = r.Client.Status().Update(ctx, &workspace); err != nil {
104+
log.Error(err, "Failed to update workspace status with Timeout condition")
105+
return ctrl.Result{}, err
106+
}
107+
return ctrl.Result{}, nil
108+
}
109+
110+
type timeoutActivity string
111+
112+
const (
113+
activityInit timeoutActivity = "initialization"
114+
activityStartup timeoutActivity = "startup"
115+
activityCreatingContainers timeoutActivity = "creating containers"
116+
activityPullingImages timeoutActivity = "pulling images"
117+
activityRunningHeadless timeoutActivity = "running the headless workspace"
118+
activityNone timeoutActivity = "period of inactivity"
119+
activityMaxLifetime timeoutActivity = "maximum lifetime"
120+
activityClosed timeoutActivity = "after being closed"
121+
activityInterrupted timeoutActivity = "workspace interruption"
122+
activityStopping timeoutActivity = "stopping"
123+
activityBackup timeoutActivity = "backup"
124+
)
125+
126+
// isWorkspaceTimedOut determines if a workspace is timed out based on the manager configuration and state the pod is in.
127+
// This function does NOT use the Timeout condition, but rather is used to set that condition in the first place.
128+
func (r *TimeoutReconciler) isWorkspaceTimedOut(ws *workspacev1.Workspace) (reason string) {
129+
timeouts := r.Config.Timeouts
130+
phase := ws.Status.Phase
131+
132+
decide := func(start time.Time, timeout util.Duration, activity timeoutActivity) string {
133+
td := time.Duration(timeout)
134+
inactivity := time.Since(start)
135+
if inactivity < td {
136+
return ""
137+
}
138+
139+
return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td))
140+
}
141+
142+
start := ws.ObjectMeta.CreationTimestamp.Time
143+
lastActivity := r.activity.GetLastActivity(ws.Name)
144+
isClosed := wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionClosed))
145+
146+
switch phase {
147+
case workspacev1.WorkspacePhasePending:
148+
return decide(start, timeouts.Initialization, activityInit)
149+
150+
case workspacev1.WorkspacePhaseInitializing:
151+
return decide(start, timeouts.TotalStartup, activityStartup)
152+
153+
case workspacev1.WorkspacePhaseCreating:
154+
activity := activityCreatingContainers
155+
// TODO:
156+
// if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
157+
// activity = activityPullingImages
158+
// }
159+
return decide(start, timeouts.TotalStartup, activity)
160+
161+
case workspacev1.WorkspacePhaseRunning:
162+
// First check is always for the max lifetime
163+
if msg := decide(start, timeouts.MaxLifetime, activityMaxLifetime); msg != "" {
164+
return msg
165+
}
166+
167+
timeout := timeouts.RegularWorkspace
168+
if customTimeout := ws.Spec.Timeout.Time; customTimeout != nil {
169+
timeout = util.Duration(customTimeout.Duration)
170+
}
171+
activity := activityNone
172+
if ws.Status.Headless {
173+
timeout = timeouts.HeadlessWorkspace
174+
lastActivity = &start
175+
activity = activityRunningHeadless
176+
} else if lastActivity == nil {
177+
// The workspace is up and running, but the user has never produced any activity, OR the controller
178+
// has restarted and not yet received a heartbeat for this workspace (since heartbeats are stored
179+
// in-memory and reset on restart).
180+
// First check whether the controller has restarted during this workspace's lifetime.
181+
// If it has, use the FirstUserActivity condition to determine whether there had already been any user activity
182+
// before the controller restart.
183+
// If the controller started before the workspace, then the user hasn't produced any activity yet.
184+
if r.ctrlStartTime.After(start) && wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFirstUserActivity)) {
185+
// The controller restarted during this workspace's lifetime, and the workspace has had activity before the restart,
186+
// so the last activity has been lost on restart. Therefore, "reset" the timeout and measure only since the controller startup time.
187+
start = r.ctrlStartTime
188+
} else {
189+
// This workspace hasn't had any user activity yet (also not before a potential controller restart).
190+
// So check for a startup timeout, and measure since workspace creation time.
191+
timeout = timeouts.TotalStartup
192+
}
193+
return decide(start, timeout, activityNone)
194+
} else if isClosed {
195+
return decide(*lastActivity, timeouts.AfterClose, activityClosed)
196+
}
197+
return decide(*lastActivity, timeout, activity)
198+
199+
case workspacev1.WorkspacePhaseStopping:
200+
if isWorkspaceBeingDeleted(ws) && !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
201+
// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
202+
// We basically don't expect a workspace to be in content finalization before it's been deleted.
203+
return decide(ws.DeletionTimestamp.Time, timeouts.ContentFinalization, activityBackup)
204+
} else if !isWorkspaceBeingDeleted(ws) {
205+
// workspaces that have not been deleted have never timed out
206+
return ""
207+
} else {
208+
return decide(ws.DeletionTimestamp.Time, timeouts.Stopping, activityStopping)
209+
}
210+
211+
default:
212+
// The only other phases we can be in is stopped which is pointless to time out
213+
return ""
214+
}
215+
}
216+
217+
func formatDuration(d time.Duration) string {
218+
d = d.Round(time.Minute)
219+
h := d / time.Hour
220+
d -= h * time.Hour
221+
m := d / time.Minute
222+
return fmt.Sprintf("%02dh%02dm", h, m)
223+
}
224+
225+
// SetupWithManager sets up the controller with the Manager.
226+
func (r *TimeoutReconciler) SetupWithManager(mgr ctrl.Manager) error {
227+
maxConcurrentReconciles := r.Config.TimeoutMaxConcurrentReconciles
228+
if maxConcurrentReconciles <= 0 {
229+
maxConcurrentReconciles = 1
230+
}
231+
232+
return ctrl.NewControllerManagedBy(mgr).
233+
WithOptions(controller.Options{MaxConcurrentReconciles: maxConcurrentReconciles}).
234+
For(&workspacev1.Workspace{}).
235+
Complete(r)
236+
}

components/ws-manager-mk2/controllers/workspace_controller.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,15 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
194194
return ctrl.Result{Requeue: true}, err
195195
}
196196

197+
// if the workspace timed out, delete it
198+
case wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) && !isPodBeingDeleted(pod):
199+
err := r.Client.Delete(ctx, pod)
200+
if errors.IsNotFound(err) {
201+
// pod is gone - nothing to do here
202+
} else {
203+
return ctrl.Result{Requeue: true}, err
204+
}
205+
197206
// if the content initialization failed, delete the pod
198207
case wsk8s.ConditionWithStatusAndReason(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") && !isPodBeingDeleted(pod):
199208
err := r.Client.Delete(ctx, pod)

components/ws-manager-mk2/main.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ func main() {
102102
}
103103

104104
activity := &activity.WorkspaceActivity{}
105+
timeoutReconciler, err := controllers.NewTimeoutReconciler(mgr.GetClient(), cfg.Manager, activity)
106+
if err != nil {
107+
setupLog.Error(err, "unable to create timeout controller", "controller", "Timeout")
108+
os.Exit(1)
109+
}
110+
105111
wsmanService, err := setupGRPCService(cfg, mgr.GetClient(), activity)
106112
if err != nil {
107113
setupLog.Error(err, "unable to start manager service")
@@ -110,7 +116,11 @@ func main() {
110116

111117
reconciler.OnReconcile = wsmanService.OnWorkspaceReconcile
112118
if err = reconciler.SetupWithManager(mgr); err != nil {
113-
setupLog.Error(err, "unable to create controller", "controller", "Workspace")
119+
setupLog.Error(err, "unable to setup workspace controller with manager", "controller", "Workspace")
120+
os.Exit(1)
121+
}
122+
if err = timeoutReconciler.SetupWithManager(mgr); err != nil {
123+
setupLog.Error(err, "unable to setup timeout controller with manager", "controller", "Timeout")
114124
os.Exit(1)
115125
}
116126

install/installer/pkg/components/ws-manager-mk2/configmap.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,10 @@ func configmap(ctx *common.RenderContext) ([]runtime.Object, error) {
221221
Interrupted: util.Duration(5 * time.Minute),
222222
},
223223
//EventTraceLog: "", // todo(sje): make conditional based on config
224-
ReconnectionInterval: util.Duration(30 * time.Second),
225-
RegistryFacadeHost: fmt.Sprintf("reg.%s:%d", ctx.Config.Domain, common.RegistryFacadeServicePort),
226-
WorkspaceCACertSecret: customCASecret,
224+
ReconnectionInterval: util.Duration(30 * time.Second),
225+
RegistryFacadeHost: fmt.Sprintf("reg.%s:%d", ctx.Config.Domain, common.RegistryFacadeServicePort),
226+
WorkspaceCACertSecret: customCASecret,
227+
TimeoutMaxConcurrentReconciles: 5,
227228
},
228229
Content: struct {
229230
Storage storageconfig.StorageConfig `json:"storage"`

0 commit comments

Comments
 (0)