Skip to content

Commit 25220bb

Browse files
authored
[ws-manager-mk2] Maintenance mode (#16702)
* [ws-manager-mk2] Maintenance mode reconciler
* [ws-manager-mk2] Check for maintenance mode
* [ws-manager-mk2] Default to maintenance mode on startup
* [ws-manager-mk2] Disable maintenance on unmarshal failure
1 parent 5beb606 commit 25220bb

File tree

9 files changed

+218
-18
lines changed

9 files changed

+218
-18
lines changed

components/ws-manager-api/go/config/config.go

+4
Original file line numberDiff line numberDiff line change
@@ -569,3 +569,7 @@ type CpuResourceLimit struct {
569569
MinLimit string `json:"min"`
570570
BurstLimit string `json:"burst"`
571571
}
572+
573+
// MaintenanceConfig is the JSON payload describing the maintenance mode state.
// The maintenance reconciler decodes it from the "config.json" entry of the
// maintenance mode ConfigMap.
type MaintenanceConfig struct {
574+
// Enabled is true when maintenance mode should be turned on.
Enabled bool `json:"enabled"`
575+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License.AGPL.txt in the project root for license information.
4+
5+
package controllers
6+
7+
import (
	"context"
	"encoding/json"
	"sync"

	"github.com/gitpod-io/gitpod/ws-manager/api/config"
	"github.com/go-logr/logr"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"
)
19+
20+
const (
21+
// LabelMaintenance is the label key used to select the maintenance mode
// ConfigMap; the manager's cache only watches ConfigMaps carrying this
// label with the value "true".
LabelMaintenance = "gitpod.io/maintenanceConfig"
22+
// configMapName is the only ConfigMap name the maintenance reconciler
// acts on; requests for any other name are ignored.
configMapName = "ws-manager-mk2-maintenance-mode"
23+
)
24+
25+
func NewMaintenanceReconciler(c client.Client) (*MaintenanceReconciler, error) {
26+
return &MaintenanceReconciler{
27+
Client: c,
28+
// Enable by default, until we observe the ConfigMap with the actual value.
29+
// Prevents a race on startup where the workspace reconciler might run before
30+
// we observe the maintenance mode ConfigMap. Better be safe and prevent
31+
// reconciliation of that workspace until it's certain maintenance mode is
32+
// not enabled.
33+
enabled: true,
34+
}, nil
35+
}
36+
37+
type MaintenanceReconciler struct {
38+
client.Client
39+
40+
enabled bool
41+
}
42+
43+
func (r *MaintenanceReconciler) IsEnabled() bool {
44+
return r.enabled
45+
}
46+
47+
//+kubebuilder:rbac:groups=core,resources=configmap,verbs=get;list;watch
48+
49+
func (r *MaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
50+
log := log.FromContext(ctx).WithValues("configMap", req.NamespacedName)
51+
52+
if req.Name != configMapName {
53+
log.Info("ignoring unexpected ConfigMap")
54+
return ctrl.Result{}, nil
55+
}
56+
57+
var cm corev1.ConfigMap
58+
if err := r.Get(ctx, req.NamespacedName, &cm); err != nil {
59+
if errors.IsNotFound(err) {
60+
// ConfigMap does not exist, disable maintenance mode.
61+
r.setEnabled(log, false)
62+
return ctrl.Result{}, nil
63+
}
64+
65+
log.Error(err, "unable to fetch configmap")
66+
return ctrl.Result{}, err
67+
}
68+
69+
configJson, ok := cm.Data["config.json"]
70+
if !ok {
71+
log.Info("missing config.json, setting maintenance mode as disabled")
72+
r.setEnabled(log, false)
73+
return ctrl.Result{}, nil
74+
}
75+
76+
var cfg config.MaintenanceConfig
77+
if err := json.Unmarshal([]byte(configJson), &cfg); err != nil {
78+
log.Error(err, "failed to unmarshal maintenance config, setting maintenance mode as disabled")
79+
r.setEnabled(log, false)
80+
return ctrl.Result{}, nil
81+
}
82+
83+
r.setEnabled(log, cfg.Enabled)
84+
return ctrl.Result{}, nil
85+
}
86+
87+
func (r *MaintenanceReconciler) setEnabled(log logr.Logger, enabled bool) {
88+
if enabled == r.enabled {
89+
// Nothing to do.
90+
return
91+
}
92+
93+
r.enabled = enabled
94+
log.Info("maintenance mode state change", "enabled", enabled)
95+
}
96+
97+
func (r *MaintenanceReconciler) SetupWithManager(mgr ctrl.Manager) error {
98+
return ctrl.NewControllerManagedBy(mgr).
99+
Named("maintenance").
100+
// The controller manager filters watch events only to ConfigMaps with the LabelMaintenance label set to "true".
101+
// See components/ws-manager-mk2/main.go's NewCache function in the manager options.
102+
For(&corev1.ConfigMap{}).
103+
Complete(r)
104+
}

components/ws-manager-mk2/controllers/suite_test.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ var _ = BeforeSuite(func() {
102102
Expect(err).ToNot(HaveOccurred())
103103

104104
conf := newTestConfig()
105-
wsReconciler, err := NewWorkspaceReconciler(k8sManager.GetClient(), k8sManager.GetScheme(), &conf, metrics.Registry)
105+
wsReconciler, err := NewWorkspaceReconciler(k8sManager.GetClient(), k8sManager.GetScheme(), &conf, metrics.Registry, &fakeMaintenance{enabled: false})
106106
wsMetrics = wsReconciler.metrics
107107
Expect(err).ToNot(HaveOccurred())
108108
Expect(wsReconciler.SetupWithManager(k8sManager)).To(Succeed())
@@ -148,6 +148,14 @@ func newTestConfig() config.Configuration {
148148
}
149149
}
150150

151+
// fakeMaintenance is a test double that reports a fixed maintenance mode
// state.
type fakeMaintenance struct {
	enabled bool
}

// IsEnabled returns the state the fake was configured with.
func (m *fakeMaintenance) IsEnabled() bool {
	return m.enabled
}
158+
151159
var _ = AfterSuite(func() {
152160
cancel()
153161
By("tearing down the test environment")

components/ws-manager-mk2/controllers/workspace_controller.go

+14-4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"sigs.k8s.io/controller-runtime/pkg/log"
2222

2323
wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
24+
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
2425
config "github.com/gitpod-io/gitpod/ws-manager/api/config"
2526
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
2627
"github.com/prometheus/client_golang/prometheus"
@@ -31,13 +32,15 @@ const (
3132
metricsWorkspaceSubsystem = "ws_manager_mk2"
3233
// kubernetesOperationTimeout is the time we give Kubernetes operations in general.
3334
kubernetesOperationTimeout = 5 * time.Second
35+
maintenanceRequeue = 1 * time.Minute
3436
)
3537

36-
func NewWorkspaceReconciler(c client.Client, scheme *runtime.Scheme, cfg *config.Configuration, reg prometheus.Registerer) (*WorkspaceReconciler, error) {
38+
func NewWorkspaceReconciler(c client.Client, scheme *runtime.Scheme, cfg *config.Configuration, reg prometheus.Registerer, maintenance maintenance.Maintenance) (*WorkspaceReconciler, error) {
3739
reconciler := &WorkspaceReconciler{
38-
Client: c,
39-
Scheme: scheme,
40-
Config: cfg,
40+
Client: c,
41+
Scheme: scheme,
42+
Config: cfg,
43+
maintenance: maintenance,
4144
}
4245

4346
metrics, err := newControllerMetrics(reconciler)
@@ -57,6 +60,7 @@ type WorkspaceReconciler struct {
5760

5861
Config *config.Configuration
5962
metrics *controllerMetrics
63+
maintenance maintenance.Maintenance
6064
OnReconcile func(ctx context.Context, ws *workspacev1.Workspace)
6165
}
6266

@@ -94,6 +98,12 @@ func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
9498
}
9599

96100
log.Info("reconciling workspace", "ws", req.NamespacedName)
101+
if r.maintenance.IsEnabled() {
102+
// Don't reconcile workspaces in maintenance mode, to prevent Pod creation and deletion.
103+
// Requeue after some time to ensure we do still reconcile this workspace when
104+
// maintenance mode ends.
105+
return ctrl.Result{RequeueAfter: maintenanceRequeue}, nil
106+
}
97107

98108
var workspacePods corev1.PodList
99109
err := r.List(ctx, &workspacePods, client.InNamespace(req.Namespace), client.MatchingFields{wsOwnerKey: req.Name})

components/ws-manager-mk2/main.go

+28-4
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,17 @@ import (
1818
"google.golang.org/grpc/credentials"
1919
"google.golang.org/grpc/credentials/insecure"
2020
_ "k8s.io/client-go/plugin/pkg/client/auth"
21+
"k8s.io/client-go/rest"
2122

2223
grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
2324
"github.com/prometheus/client_golang/prometheus"
25+
corev1 "k8s.io/api/core/v1"
26+
"k8s.io/apimachinery/pkg/labels"
2427
"k8s.io/apimachinery/pkg/runtime"
2528
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2629
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
2730
ctrl "sigs.k8s.io/controller-runtime"
31+
"sigs.k8s.io/controller-runtime/pkg/cache"
2832
"sigs.k8s.io/controller-runtime/pkg/client"
2933
"sigs.k8s.io/controller-runtime/pkg/healthz"
3034
"sigs.k8s.io/controller-runtime/pkg/log/zap"
@@ -42,6 +46,7 @@ import (
4246

4347
"github.com/gitpod-io/gitpod/ws-manager-mk2/controllers"
4448
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
49+
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
4550
imgproxy "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/proxy"
4651
"github.com/gitpod-io/gitpod/ws-manager-mk2/service"
4752
//+kubebuilder:scaffold:imports
@@ -105,13 +110,28 @@ func main() {
105110
LeaderElection: enableLeaderElection,
106111
LeaderElectionID: "ws-manager-mk2-leader.gitpod.io",
107112
Namespace: cfg.Manager.Namespace,
113+
NewCache: func(conf *rest.Config, opts cache.Options) (cache.Cache, error) {
114+
// Only watch the maintenance mode ConfigMap.
115+
opts.SelectorsByObject = cache.SelectorsByObject{
116+
&corev1.ConfigMap{}: cache.ObjectSelector{
117+
Label: labels.SelectorFromSet(labels.Set{controllers.LabelMaintenance: "true"}),
118+
},
119+
}
120+
return cache.New(conf, opts)
121+
},
108122
})
109123
if err != nil {
110124
setupLog.Error(err, "unable to start manager")
111125
os.Exit(1)
112126
}
113127

114-
reconciler, err := controllers.NewWorkspaceReconciler(mgr.GetClient(), mgr.GetScheme(), &cfg.Manager, metrics.Registry)
128+
maintenance, err := controllers.NewMaintenanceReconciler(mgr.GetClient())
129+
if err != nil {
130+
setupLog.Error(err, "unable to create maintenance controller", "controller", "Maintenance")
131+
os.Exit(1)
132+
}
133+
134+
reconciler, err := controllers.NewWorkspaceReconciler(mgr.GetClient(), mgr.GetScheme(), &cfg.Manager, metrics.Registry, maintenance)
115135
if err != nil {
116136
setupLog.Error(err, "unable to create controller", "controller", "Workspace")
117137
os.Exit(1)
@@ -124,7 +144,7 @@ func main() {
124144
os.Exit(1)
125145
}
126146

127-
wsmanService, err := setupGRPCService(cfg, mgr.GetClient(), activity)
147+
wsmanService, err := setupGRPCService(cfg, mgr.GetClient(), activity, maintenance)
128148
if err != nil {
129149
setupLog.Error(err, "unable to start manager service")
130150
os.Exit(1)
@@ -139,6 +159,10 @@ func main() {
139159
setupLog.Error(err, "unable to setup timeout controller with manager", "controller", "Timeout")
140160
os.Exit(1)
141161
}
162+
if err = maintenance.SetupWithManager(mgr); err != nil {
163+
setupLog.Error(err, "unable to setup maintenance controller with manager", "controller", "Maintenance")
164+
os.Exit(1)
165+
}
142166

143167
// if err = (&workspacev1.Workspace{}).SetupWebhookWithManager(mgr); err != nil {
144168
// setupLog.Error(err, "unable to create webhook", "webhook", "Workspace")
@@ -163,7 +187,7 @@ func main() {
163187
}
164188
}
165189

166-
func setupGRPCService(cfg *config.ServiceConfiguration, k8s client.Client, activity *activity.WorkspaceActivity) (*service.WorkspaceManagerServer, error) {
190+
func setupGRPCService(cfg *config.ServiceConfiguration, k8s client.Client, activity *activity.WorkspaceActivity, maintenance maintenance.Maintenance) (*service.WorkspaceManagerServer, error) {
167191
// TODO(cw): remove use of common-go/log
168192

169193
if len(cfg.RPCServer.RateLimits) > 0 {
@@ -219,7 +243,7 @@ func setupGRPCService(cfg *config.ServiceConfiguration, k8s client.Client, activ
219243
imgbldr.RegisterImageBuilderServer(grpcServer, imgproxy.ImageBuilder{D: imgbldr.NewImageBuilderClient(conn)})
220244
}
221245

222-
srv := service.NewWorkspaceManagerServer(k8s, &cfg.Manager, metrics.Registry, activity)
246+
srv := service.NewWorkspaceManagerServer(k8s, &cfg.Manager, metrics.Registry, activity, maintenance)
223247

224248
grpc_prometheus.Register(grpcServer)
225249
wsmanapi.RegisterWorkspaceManagerServer(grpcServer, srv)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License.AGPL.txt in the project root for license information.
4+
5+
package maintenance
6+
7+
// Maintenance is used to check whether ws-manager-mk2 is in maintenance mode,
8+
// which prevents pod creation/deletion and snapshots being taken, such that
9+
// the cluster can be updated in-place.
10+
type Maintenance interface {
11+
// IsEnabled reports whether maintenance mode is currently enabled.
IsEnabled() bool
12+
}

components/ws-manager-mk2/service/manager.go

+24-9
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"github.com/gitpod-io/gitpod/common-go/tracing"
2929
"github.com/gitpod-io/gitpod/common-go/util"
3030
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
31+
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
3132
wsmanapi "github.com/gitpod-io/gitpod/ws-manager/api"
3233
"github.com/gitpod-io/gitpod/ws-manager/api/config"
3334
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
@@ -53,26 +54,28 @@ const (
5354
stopWorkspaceImmediatelyGracePeriod = 1 * time.Second
5455
)
5556

56-
func NewWorkspaceManagerServer(clnt client.Client, cfg *config.Configuration, reg prometheus.Registerer, activity *activity.WorkspaceActivity) *WorkspaceManagerServer {
57+
func NewWorkspaceManagerServer(clnt client.Client, cfg *config.Configuration, reg prometheus.Registerer, activity *activity.WorkspaceActivity, maintenance maintenance.Maintenance) *WorkspaceManagerServer {
5758
metrics := newWorkspaceMetrics()
5859
reg.MustRegister(metrics)
5960

6061
return &WorkspaceManagerServer{
61-
Client: clnt,
62-
Config: cfg,
63-
metrics: metrics,
64-
activity: activity,
62+
Client: clnt,
63+
Config: cfg,
64+
metrics: metrics,
65+
activity: activity,
66+
maintenance: maintenance,
6567
subs: subscriptions{
6668
subscribers: make(map[string]chan *wsmanapi.SubscribeResponse),
6769
},
6870
}
6971
}
7072

7173
type WorkspaceManagerServer struct {
72-
Client client.Client
73-
Config *config.Configuration
74-
metrics *workspaceMetrics
75-
activity *activity.WorkspaceActivity
74+
Client client.Client
75+
Config *config.Configuration
76+
metrics *workspaceMetrics
77+
activity *activity.WorkspaceActivity
78+
maintenance maintenance.Maintenance
7679

7780
subs subscriptions
7881
wsmanapi.UnimplementedWorkspaceManagerServer
@@ -93,6 +96,10 @@ func (wsm *WorkspaceManagerServer) StartWorkspace(ctx context.Context, req *wsma
9396
tracing.ApplyOWI(span, owi)
9497
defer tracing.FinishSpan(span, &err)
9598

99+
if wsm.maintenance.IsEnabled() {
100+
return &wsmanapi.StartWorkspaceResponse{}, status.Error(codes.FailedPrecondition, "under maintenance")
101+
}
102+
96103
if err := validateStartWorkspaceRequest(req); err != nil {
97104
return nil, err
98105
}
@@ -329,6 +336,10 @@ func (wsm *WorkspaceManagerServer) StopWorkspace(ctx context.Context, req *wsman
329336
tracing.ApplyOWI(span, owi)
330337
defer tracing.FinishSpan(span, &err)
331338

339+
if wsm.maintenance.IsEnabled() {
340+
return &wsmanapi.StopWorkspaceResponse{}, status.Error(codes.FailedPrecondition, "under maintenance")
341+
}
342+
332343
gracePeriod := stopWorkspaceNormallyGracePeriod
333344
if req.Policy == wsmanapi.StopWorkspacePolicy_IMMEDIATELY {
334345
span.LogKV("policy", "immediately")
@@ -563,6 +574,10 @@ func (wsm *WorkspaceManagerServer) TakeSnapshot(ctx context.Context, req *wsmana
563574
tracing.ApplyOWI(span, log.OWI("", "", req.Id))
564575
defer tracing.FinishSpan(span, &err)
565576

577+
if wsm.maintenance.IsEnabled() {
578+
return &wsmanapi.TakeSnapshotResponse{}, status.Error(codes.FailedPrecondition, "under maintenance")
579+
}
580+
566581
var ws workspacev1.Workspace
567582
err = wsm.Client.Get(ctx, types.NamespacedName{Namespace: wsm.Config.Namespace, Name: req.Id}, &ws)
568583
if errors.IsNotFound(err) {

0 commit comments

Comments
 (0)