Skip to content

Commit bc5bb99

Browse files
committed
[ws-manager-mk2] Workspace timeouts
1 parent 3863ce7 commit bc5bb99

File tree

6 files changed

+272
-4
lines changed

6 files changed

+272
-4
lines changed

components/ws-manager-api/go/config/config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,9 @@ type Configuration struct {
125125
WorkspaceClasses map[string]*WorkspaceClass `json:"workspaceClass"`
126126
// DebugWorkspacePod adds extra finalizer to workspace to prevent it from shutting down. Helps to debug.
127127
DebugWorkspacePod bool `json:"debugWorkspacePod,omitempty"`
128+
// TimeoutMaxConcurrentReconciles configures the max amount of concurrent workspace reconciliations on
129+
// the timeout controller.
130+
TimeoutMaxConcurrentReconciles int `json:"timeoutMaxConcurrentReconciles,omitempty"`
128131
}
129132

130133
type WorkspaceClass struct {

components/ws-manager-mk2/controllers/status.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,3 +287,8 @@ func isPodBeingDeleted(pod *corev1.Pod) bool {
287287
// if the pod is being deleted the only marker we have is that the deletionTimestamp is set
288288
return pod.ObjectMeta.DeletionTimestamp != nil
289289
}
290+
291+
// isWorkspaceBeingDeleted returns true if the workspace resource is currently being deleted.
292+
func isWorkspaceBeingDeleted(ws *workspacev1.Workspace) bool {
293+
return ws.ObjectMeta.DeletionTimestamp != nil
294+
}
Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License-AGPL.txt in the project root for license information.
4+
5+
package controllers
6+
7+
import (
8+
"context"
9+
"fmt"
10+
"time"
11+
12+
apierrors "k8s.io/apimachinery/pkg/api/errors"
13+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
ctrl "sigs.k8s.io/controller-runtime"
15+
"sigs.k8s.io/controller-runtime/pkg/client"
16+
"sigs.k8s.io/controller-runtime/pkg/controller"
17+
"sigs.k8s.io/controller-runtime/pkg/log"
18+
19+
"github.com/gitpod-io/gitpod/common-go/util"
20+
wsactivity "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
21+
config "github.com/gitpod-io/gitpod/ws-manager/api/config"
22+
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
23+
)
24+
25+
func NewTimeoutReconciler(c client.Client, cfg config.Configuration, activity *wsactivity.WorkspaceActivity) (*TimeoutReconciler, error) {
26+
reconcileInterval := time.Duration(cfg.HeartbeatInterval)
27+
// Reconcile interval is half the heartbeat interval to catch timed out workspaces in time.
28+
// See https://en.wikipedia.org/wiki/Nyquist%E2%80%93Shannon_sampling_theorem why we need this.
29+
reconcileInterval /= 2
30+
31+
return &TimeoutReconciler{
32+
Client: c,
33+
Config: cfg,
34+
activity: activity,
35+
reconcileInterval: reconcileInterval,
36+
ctrlStartTime: time.Now().UTC(),
37+
}, nil
38+
}
39+
40+
// TimeoutReconciler reconciles workspace timeouts. This is a separate reconciler, as it
41+
// always requeues events for existing workspaces such that timeouts are checked on (at least)
42+
// a specified interval. The reconcile loop should therefore be light-weight as it's repeatedly
43+
// reconciling all workspaces in the cluster.
44+
type TimeoutReconciler struct {
45+
client.Client
46+
47+
Config config.Configuration
48+
activity *wsactivity.WorkspaceActivity
49+
reconcileInterval time.Duration
50+
ctrlStartTime time.Time
51+
}
52+
53+
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
54+
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch
55+
56+
// Reconcile will check the given workspace for timing out. When done, a new event gets
57+
// requeued automatically to ensure the workspace gets reconciled at least every reconcileInterval.
58+
func (r *TimeoutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
59+
log := log.FromContext(ctx).WithValues("ws", req.NamespacedName)
60+
61+
var workspace workspacev1.Workspace
62+
if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
63+
if !apierrors.IsNotFound(err) {
64+
log.Error(err, "unable to fetch workspace")
65+
}
66+
// We'll ignore not-found errors, since they can't be fixed by an immediate
67+
// requeue (we'll need to wait for a new notification), and we can get them
68+
// on deleted requests.
69+
// On any other error, let the controller requeue an event with exponential
70+
// backoff.
71+
return ctrl.Result{}, client.IgnoreNotFound(err)
72+
}
73+
74+
if conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) {
75+
// Workspace has already been marked as timed out.
76+
// Return and don't requeue another reconciliation.
77+
return ctrl.Result{}, nil
78+
}
79+
80+
// The workspace hasn't timed out yet. After this point, we always
81+
// want to requeue a reconciliation after the configured interval.
82+
defer func() {
83+
result.RequeueAfter = r.reconcileInterval
84+
}()
85+
86+
timedout, err := r.isWorkspaceTimedOut(&workspace)
87+
if err != nil {
88+
log.Error(err, "failed to check for workspace timeout")
89+
return ctrl.Result{}, err
90+
}
91+
92+
if timedout == "" {
93+
// Hasn't timed out.
94+
return ctrl.Result{}, nil
95+
}
96+
97+
// Workspace timed out, set Timeout condition.
98+
log.Info("Workspace timed out", "reason", timedout)
99+
workspace.Status.Conditions = AddUniqueCondition(workspace.Status.Conditions, metav1.Condition{
100+
Type: string(workspacev1.WorkspaceConditionTimeout),
101+
Status: metav1.ConditionTrue,
102+
LastTransitionTime: metav1.Now(),
103+
Reason: "TimedOut",
104+
Message: timedout,
105+
})
106+
107+
if err = r.Client.Status().Update(ctx, &workspace); err != nil {
108+
log.Error(err, "Failed to update workspace status with Timeout condition")
109+
return ctrl.Result{}, err
110+
}
111+
return ctrl.Result{}, nil
112+
}
113+
114+
// timeoutActivity names what a workspace was doing (or waiting for) when it timed out.
// The value is embedded verbatim in the human-readable timeout reason.
type timeoutActivity string

// Known timeout activities, roughly in workspace lifecycle order.
const (
	// Start-up.
	activityInit               timeoutActivity = "initialization"
	activityStartup            timeoutActivity = "startup"
	activityCreatingContainers timeoutActivity = "creating containers"
	activityPullingImages      timeoutActivity = "pulling images"

	// Running.
	activityRunningHeadless timeoutActivity = "running the headless workspace"
	activityNone            timeoutActivity = "period of inactivity"
	activityMaxLifetime     timeoutActivity = "maximum lifetime"
	activityClosed          timeoutActivity = "after being closed"
	activityInterrupted     timeoutActivity = "workspace interruption"

	// Shutdown.
	activityStopping timeoutActivity = "stopping"
	activityBackup   timeoutActivity = "backup"
)
129+
130+
// isWorkspaceTimedOut determines if a workspace is timed out based on the manager configuration and state the pod is in.
131+
// This function does NOT use the Timeout condition, but rather is used to set that condition in the first place.
132+
func (r *TimeoutReconciler) isWorkspaceTimedOut(ws *workspacev1.Workspace) (reason string, err error) {
133+
timeouts := r.Config.Timeouts
134+
phase := ws.Status.Phase
135+
136+
decide := func(start time.Time, timeout util.Duration, activity timeoutActivity) (string, error) {
137+
td := time.Duration(timeout)
138+
inactivity := time.Since(start)
139+
if inactivity < td {
140+
return "", nil
141+
}
142+
143+
return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td)), nil
144+
}
145+
146+
start := ws.ObjectMeta.CreationTimestamp.Time
147+
lastActivity := r.activity.GetLastActivity(ws.Name)
148+
isClosed := conditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionClosed))
149+
150+
switch phase {
151+
case workspacev1.WorkspacePhasePending:
152+
return decide(start, timeouts.Initialization, activityInit)
153+
154+
case workspacev1.WorkspacePhaseInitializing:
155+
return decide(start, timeouts.TotalStartup, activityStartup)
156+
157+
case workspacev1.WorkspacePhaseCreating:
158+
activity := activityCreatingContainers
159+
// TODO:
160+
// if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
161+
// activity = activityPullingImages
162+
// }
163+
return decide(start, timeouts.TotalStartup, activity)
164+
165+
case workspacev1.WorkspacePhaseRunning:
166+
// First check is always for the max lifetime
167+
if msg, err := decide(start, timeouts.MaxLifetime, activityMaxLifetime); msg != "" {
168+
return msg, err
169+
}
170+
171+
timeout := timeouts.RegularWorkspace
172+
if ctv := ws.Spec.Timeout.Time; ctv != nil {
173+
timeout = util.Duration(ctv.Duration)
174+
}
175+
activity := activityNone
176+
if ws.Status.Headless {
177+
timeout = timeouts.HeadlessWorkspace
178+
lastActivity = &start
179+
activity = activityRunningHeadless
180+
} else if lastActivity == nil {
181+
// The workspace is up and running, but the user has never produced any activity, OR the controller
182+
// has restarted and not yet received a heartbeat for this workspace (since heartbeats are stored
183+
// in-memory and reset on restart).
184+
// First check whether the controller has restarted during this workspace's lifetime.
185+
// If it has, use the FirstUserActivity condition to determine whether there had already been any user activity
186+
// before the controller restart.
187+
// If the controller started before the workspace, then the user hasn't produced any activity yet.
188+
if r.ctrlStartTime.After(start) && conditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFirstUserActivity)) {
189+
// The controller restarted during this workspace's lifetime, and the workspace has had activity before the restart,
190+
// so the last activity has been lost on restart. Therefore, "reset" the timeout and measure only since the controller startup time.
191+
start = r.ctrlStartTime
192+
} else {
193+
// This workspace hasn't had any user activity yet (also not before a potential controller restart).
194+
// So check for a startup timeout, and measure since workspace creation time.
195+
timeout = timeouts.TotalStartup
196+
}
197+
return decide(start, timeout, activityNone)
198+
} else if isClosed {
199+
return decide(*lastActivity, timeouts.AfterClose, activityClosed)
200+
}
201+
return decide(*lastActivity, timeout, activity)
202+
203+
case workspacev1.WorkspacePhaseStopping:
204+
if isWorkspaceBeingDeleted(ws) && conditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)) {
205+
// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
206+
// We basically don't expect a workspace to be in content finalization before it's been deleted.
207+
return decide(ws.DeletionTimestamp.Time, timeouts.ContentFinalization, activityBackup)
208+
} else if !isWorkspaceBeingDeleted(ws) {
209+
// workspaces that have not been deleted have never timed out
210+
return "", nil
211+
} else {
212+
return decide(ws.DeletionTimestamp.Time, timeouts.Stopping, activityStopping)
213+
}
214+
215+
default:
216+
// The only other phases we can be in is stopped which is pointless to time out
217+
return "", nil
218+
}
219+
}
220+
221+
// formatDuration renders d, rounded to the nearest minute, as zero-padded hours and minutes,
// e.g. "01h30m".
func formatDuration(d time.Duration) string {
	rounded := d.Round(time.Minute)
	hours := rounded / time.Hour
	minutes := (rounded % time.Hour) / time.Minute
	return fmt.Sprintf("%02dh%02dm", hours, minutes)
}
228+
229+
// SetupWithManager sets up the controller with the Manager.
230+
func (r *TimeoutReconciler) SetupWithManager(mgr ctrl.Manager) error {
231+
maxConcurrentReconciles := r.Config.TimeoutMaxConcurrentReconciles
232+
if maxConcurrentReconciles <= 0 {
233+
maxConcurrentReconciles = 1
234+
}
235+
236+
return ctrl.NewControllerManagedBy(mgr).
237+
WithOptions(controller.Options{MaxConcurrentReconciles: maxConcurrentReconciles}).
238+
For(&workspacev1.Workspace{}).
239+
Complete(r)
240+
}

components/ws-manager-mk2/controllers/workspace_controller.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,15 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
193193
return ctrl.Result{Requeue: true}, err
194194
}
195195

196+
// if the workspace timed out, delete it
197+
case conditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) && !isPodBeingDeleted(pod):
198+
err := r.Client.Delete(ctx, pod)
199+
if errors.IsNotFound(err) {
200+
// pod is gone - nothing to do here
201+
} else {
202+
return ctrl.Result{Requeue: true}, err
203+
}
204+
196205
// if the content initialization failed, delete the pod
197206
case conditionWithStatusAndReson(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, "InitializationFailure") && !isPodBeingDeleted(pod):
198207
err := r.Client.Delete(ctx, pod)

components/ws-manager-mk2/main.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,12 @@ func main() {
105105
}
106106

107107
activity := &activity.WorkspaceActivity{}
108+
timeoutReconciler, err := controllers.NewTimeoutReconciler(mgr.GetClient(), cfg.Manager, activity)
109+
if err != nil {
110+
setupLog.Error(err, "unable to create timeout controller", "controller", "Timeout")
111+
os.Exit(1)
112+
}
113+
108114
wsmanService, err := setupGRPCService(cfg, mgr.GetClient(), activity)
109115
if err != nil {
110116
setupLog.Error(err, "unable to start manager service")
@@ -113,7 +119,11 @@ func main() {
113119

114120
reconciler.OnReconcile = wsmanService.OnWorkspaceReconcile
115121
if err = reconciler.SetupWithManager(mgr); err != nil {
116-
setupLog.Error(err, "unable to create controller", "controller", "Workspace")
122+
setupLog.Error(err, "unable to setup workspace controller with manager", "controller", "Workspace")
123+
os.Exit(1)
124+
}
125+
if err = timeoutReconciler.SetupWithManager(mgr); err != nil {
126+
setupLog.Error(err, "unable to setup timeout controller with manager", "controller", "Timeout")
117127
os.Exit(1)
118128
}
119129

install/installer/pkg/components/ws-manager-mk2/configmap.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,10 @@ func configmap(ctx *common.RenderContext) ([]runtime.Object, error) {
221221
Interrupted: util.Duration(5 * time.Minute),
222222
},
223223
//EventTraceLog: "", // todo(sje): make conditional based on config
224-
ReconnectionInterval: util.Duration(30 * time.Second),
225-
RegistryFacadeHost: fmt.Sprintf("reg.%s:%d", ctx.Config.Domain, common.RegistryFacadeServicePort),
226-
WorkspaceCACertSecret: customCASecret,
224+
ReconnectionInterval: util.Duration(30 * time.Second),
225+
RegistryFacadeHost: fmt.Sprintf("reg.%s:%d", ctx.Config.Domain, common.RegistryFacadeServicePort),
226+
WorkspaceCACertSecret: customCASecret,
227+
TimeoutMaxConcurrentReconciles: 5,
227228
},
228229
Content: struct {
229230
Storage storageconfig.StorageConfig `json:"storage"`

0 commit comments

Comments
 (0)