Skip to content

[ws-manager-mk2] Maintenance mode #16702

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions components/ws-manager-api/go/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -569,3 +569,7 @@ type CpuResourceLimit struct {
MinLimit string `json:"min"`
BurstLimit string `json:"burst"`
}

// MaintenanceConfig is the JSON document that describes the desired
// maintenance mode state.
type MaintenanceConfig struct {
	// Enabled is decoded from the "enabled" JSON key; true requests
	// maintenance mode.
	Enabled bool `json:"enabled"`
}
104 changes: 104 additions & 0 deletions components/ws-manager-mk2/controllers/maintenance_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.

package controllers

import (
	"context"
	"encoding/json"
	"sync"

	"github.com/gitpod-io/gitpod/ws-manager/api/config"
	"github.com/go-logr/logr"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"
)

// LabelMaintenance is the label key that marks a ConfigMap as carrying the
// maintenance mode configuration.
const LabelMaintenance = "gitpod.io/maintenanceConfig"

// configMapName is the name of the only ConfigMap the maintenance reconciler
// acts on; requests for any other name are ignored.
const configMapName = "ws-manager-mk2-maintenance-mode"

// NewMaintenanceReconciler returns a MaintenanceReconciler that reads
// ConfigMaps through c. The returned error is always nil.
func NewMaintenanceReconciler(c client.Client) (*MaintenanceReconciler, error) {
	reconciler := new(MaintenanceReconciler)
	reconciler.Client = c

	// Start in maintenance mode and stay there until the ConfigMap holding
	// the real value has been observed. Otherwise the workspace reconciler
	// could run on startup before the maintenance ConfigMap is seen; it is
	// safer to hold back workspace reconciliation until it is certain that
	// maintenance mode is not enabled.
	reconciler.enabled = true

	return reconciler, nil
}

// MaintenanceReconciler tracks whether maintenance mode is enabled by
// reconciling the maintenance mode ConfigMap.
//
// The flag is written from Reconcile and read through IsEnabled, which is
// called from other goroutines (workspace reconciliation and gRPC request
// handling), so every access to it goes through mu.
type MaintenanceReconciler struct {
	client.Client

	// mu guards enabled.
	mu      sync.RWMutex
	enabled bool
}

// IsEnabled reports whether maintenance mode is currently considered enabled.
// It is safe for concurrent use.
func (r *MaintenanceReconciler) IsEnabled() bool {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.enabled
}

//+kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch

// Reconcile updates the maintenance mode flag from the ConfigMap named by req.
//
// Requests for any ConfigMap other than configMapName are ignored. Maintenance
// mode is disabled when the ConfigMap does not exist, has no "config.json"
// key, or that key does not hold valid JSON; otherwise the flag follows the
// decoded "enabled" value. An error is returned only when fetching the
// ConfigMap fails for a reason other than it not being found, in which case
// the flag is left untouched.
func (r *MaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// Named logger rather than log so the imported log package is not shadowed.
	logger := log.FromContext(ctx).WithValues("configMap", req.NamespacedName)

	if req.Name != configMapName {
		logger.Info("ignoring unexpected ConfigMap")
		return ctrl.Result{}, nil
	}

	var cm corev1.ConfigMap
	if err := r.Get(ctx, req.NamespacedName, &cm); err != nil {
		if errors.IsNotFound(err) {
			// ConfigMap does not exist, disable maintenance mode.
			r.setEnabled(logger, false)
			return ctrl.Result{}, nil
		}

		logger.Error(err, "unable to fetch configmap")
		return ctrl.Result{}, err
	}

	configJson, ok := cm.Data["config.json"]
	if !ok {
		logger.Info("missing config.json, setting maintenance mode as disabled")
		r.setEnabled(logger, false)
		return ctrl.Result{}, nil
	}

	var cfg config.MaintenanceConfig
	if err := json.Unmarshal([]byte(configJson), &cfg); err != nil {
		// A malformed config is not retried: no error is returned, so the
		// request is not requeued until the ConfigMap changes again.
		logger.Error(err, "failed to unmarshal maintenance config, setting maintenance mode as disabled")
		r.setEnabled(logger, false)
		return ctrl.Result{}, nil
	}

	r.setEnabled(logger, cfg.Enabled)
	return ctrl.Result{}, nil
}

// setEnabled stores the new maintenance mode state and logs only when the
// state actually changes.
func (r *MaintenanceReconciler) setEnabled(logger logr.Logger, enabled bool) {
	r.mu.Lock()
	changed := r.enabled != enabled
	r.enabled = enabled
	r.mu.Unlock()

	if !changed {
		// Nothing to do.
		return
	}

	// Logged outside the critical section so readers are never blocked on
	// log output.
	logger.Info("maintenance mode state change", "enabled", enabled)
}

// SetupWithManager registers this reconciler with mgr as a controller named
// "maintenance" that reconciles ConfigMaps.
func (r *MaintenanceReconciler) SetupWithManager(mgr ctrl.Manager) error {
	b := ctrl.NewControllerManagedBy(mgr)
	b = b.Named("maintenance")

	// The controller manager filters watch events only to ConfigMaps with the LabelMaintenance label set to "true".
	// See components/ws-manager-mk2/main.go's NewCache function in the manager options.
	b = b.For(&corev1.ConfigMap{})

	return b.Complete(r)
}
10 changes: 9 additions & 1 deletion components/ws-manager-mk2/controllers/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())

conf := newTestConfig()
wsReconciler, err := NewWorkspaceReconciler(k8sManager.GetClient(), k8sManager.GetScheme(), &conf, metrics.Registry)
wsReconciler, err := NewWorkspaceReconciler(k8sManager.GetClient(), k8sManager.GetScheme(), &conf, metrics.Registry, &fakeMaintenance{enabled: false})
wsMetrics = wsReconciler.metrics
Expect(err).ToNot(HaveOccurred())
Expect(wsReconciler.SetupWithManager(k8sManager)).To(Succeed())
Expand Down Expand Up @@ -148,6 +148,14 @@ func newTestConfig() config.Configuration {
}
}

// fakeMaintenance is a test double that reports a fixed maintenance mode
// state.
type fakeMaintenance struct {
	// enabled is the value IsEnabled returns.
	enabled bool
}

// IsEnabled returns the state the fake was constructed with.
func (fake *fakeMaintenance) IsEnabled() bool {
	return fake.enabled
}

var _ = AfterSuite(func() {
cancel()
By("tearing down the test environment")
Expand Down
18 changes: 14 additions & 4 deletions components/ws-manager-mk2/controllers/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"

wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
config "github.com/gitpod-io/gitpod/ws-manager/api/config"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
"github.com/prometheus/client_golang/prometheus"
Expand All @@ -31,13 +32,15 @@ const (
metricsWorkspaceSubsystem = "ws_manager_mk2"
// kubernetesOperationTimeout is the time we give Kubernetes operations in general.
kubernetesOperationTimeout = 5 * time.Second
maintenanceRequeue = 1 * time.Minute
)

func NewWorkspaceReconciler(c client.Client, scheme *runtime.Scheme, cfg *config.Configuration, reg prometheus.Registerer) (*WorkspaceReconciler, error) {
func NewWorkspaceReconciler(c client.Client, scheme *runtime.Scheme, cfg *config.Configuration, reg prometheus.Registerer, maintenance maintenance.Maintenance) (*WorkspaceReconciler, error) {
reconciler := &WorkspaceReconciler{
Client: c,
Scheme: scheme,
Config: cfg,
Client: c,
Scheme: scheme,
Config: cfg,
maintenance: maintenance,
}

metrics, err := newControllerMetrics(reconciler)
Expand All @@ -57,6 +60,7 @@ type WorkspaceReconciler struct {

Config *config.Configuration
metrics *controllerMetrics
maintenance maintenance.Maintenance
OnReconcile func(ctx context.Context, ws *workspacev1.Workspace)
}

Expand Down Expand Up @@ -94,6 +98,12 @@ func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
}

log.Info("reconciling workspace", "ws", req.NamespacedName)
if r.maintenance.IsEnabled() {
// Don't reconcile workspaces in maintenance mode, to prevent Pod creation and deletion.
// Requeue after some time to ensure we do still reconcile this workspace when
// maintenance mode ends.
return ctrl.Result{RequeueAfter: maintenanceRequeue}, nil
}

var workspacePods corev1.PodList
err := r.List(ctx, &workspacePods, client.InNamespace(req.Namespace), client.MatchingFields{wsOwnerKey: req.Name})
Expand Down
32 changes: 28 additions & 4 deletions components/ws-manager-mk2/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,17 @@ import (
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/credentials/insecure"
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"

grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
Expand All @@ -42,6 +46,7 @@ import (

"github.com/gitpod-io/gitpod/ws-manager-mk2/controllers"
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
imgproxy "github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/proxy"
"github.com/gitpod-io/gitpod/ws-manager-mk2/service"
//+kubebuilder:scaffold:imports
Expand Down Expand Up @@ -105,13 +110,28 @@ func main() {
LeaderElection: enableLeaderElection,
LeaderElectionID: "ws-manager-mk2-leader.gitpod.io",
Namespace: cfg.Manager.Namespace,
NewCache: func(conf *rest.Config, opts cache.Options) (cache.Cache, error) {
// Only watch the maintenance mode ConfigMap.
opts.SelectorsByObject = cache.SelectorsByObject{
&corev1.ConfigMap{}: cache.ObjectSelector{
Label: labels.SelectorFromSet(labels.Set{controllers.LabelMaintenance: "true"}),
},
}
return cache.New(conf, opts)
},
})
if err != nil {
setupLog.Error(err, "unable to start manager")
os.Exit(1)
}

reconciler, err := controllers.NewWorkspaceReconciler(mgr.GetClient(), mgr.GetScheme(), &cfg.Manager, metrics.Registry)
maintenance, err := controllers.NewMaintenanceReconciler(mgr.GetClient())
if err != nil {
setupLog.Error(err, "unable to create maintenance controller", "controller", "Maintenance")
os.Exit(1)
}

reconciler, err := controllers.NewWorkspaceReconciler(mgr.GetClient(), mgr.GetScheme(), &cfg.Manager, metrics.Registry, maintenance)
if err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Workspace")
os.Exit(1)
Expand All @@ -124,7 +144,7 @@ func main() {
os.Exit(1)
}

wsmanService, err := setupGRPCService(cfg, mgr.GetClient(), activity)
wsmanService, err := setupGRPCService(cfg, mgr.GetClient(), activity, maintenance)
if err != nil {
setupLog.Error(err, "unable to start manager service")
os.Exit(1)
Expand All @@ -139,6 +159,10 @@ func main() {
setupLog.Error(err, "unable to setup timeout controller with manager", "controller", "Timeout")
os.Exit(1)
}
if err = maintenance.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to setup maintenance controller with manager", "controller", "Maintenance")
os.Exit(1)
}

// if err = (&workspacev1.Workspace{}).SetupWebhookWithManager(mgr); err != nil {
// setupLog.Error(err, "unable to create webhook", "webhook", "Workspace")
Expand All @@ -163,7 +187,7 @@ func main() {
}
}

func setupGRPCService(cfg *config.ServiceConfiguration, k8s client.Client, activity *activity.WorkspaceActivity) (*service.WorkspaceManagerServer, error) {
func setupGRPCService(cfg *config.ServiceConfiguration, k8s client.Client, activity *activity.WorkspaceActivity, maintenance maintenance.Maintenance) (*service.WorkspaceManagerServer, error) {
// TODO(cw): remove use of common-go/log

if len(cfg.RPCServer.RateLimits) > 0 {
Expand Down Expand Up @@ -219,7 +243,7 @@ func setupGRPCService(cfg *config.ServiceConfiguration, k8s client.Client, activ
imgbldr.RegisterImageBuilderServer(grpcServer, imgproxy.ImageBuilder{D: imgbldr.NewImageBuilderClient(conn)})
}

srv := service.NewWorkspaceManagerServer(k8s, &cfg.Manager, metrics.Registry, activity)
srv := service.NewWorkspaceManagerServer(k8s, &cfg.Manager, metrics.Registry, activity, maintenance)

grpc_prometheus.Register(grpcServer)
wsmanapi.RegisterWorkspaceManagerServer(grpcServer, srv)
Expand Down
12 changes: 12 additions & 0 deletions components/ws-manager-mk2/pkg/maintenance/maintenance.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.

package maintenance

// Maintenance reports whether ws-manager-mk2 is in maintenance mode.
//
// While maintenance mode is enabled, pods are neither created nor deleted and
// no snapshots are taken, so that the cluster can be updated in-place.
type Maintenance interface {
	// IsEnabled returns true while maintenance mode is enabled.
	IsEnabled() bool
}
33 changes: 24 additions & 9 deletions components/ws-manager-mk2/service/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/gitpod-io/gitpod/common-go/tracing"
"github.com/gitpod-io/gitpod/common-go/util"
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
wsmanapi "github.com/gitpod-io/gitpod/ws-manager/api"
"github.com/gitpod-io/gitpod/ws-manager/api/config"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
Expand All @@ -53,26 +54,28 @@ const (
stopWorkspaceImmediatelyGracePeriod = 1 * time.Second
)

func NewWorkspaceManagerServer(clnt client.Client, cfg *config.Configuration, reg prometheus.Registerer, activity *activity.WorkspaceActivity) *WorkspaceManagerServer {
func NewWorkspaceManagerServer(clnt client.Client, cfg *config.Configuration, reg prometheus.Registerer, activity *activity.WorkspaceActivity, maintenance maintenance.Maintenance) *WorkspaceManagerServer {
metrics := newWorkspaceMetrics()
reg.MustRegister(metrics)

return &WorkspaceManagerServer{
Client: clnt,
Config: cfg,
metrics: metrics,
activity: activity,
Client: clnt,
Config: cfg,
metrics: metrics,
activity: activity,
maintenance: maintenance,
subs: subscriptions{
subscribers: make(map[string]chan *wsmanapi.SubscribeResponse),
},
}
}

type WorkspaceManagerServer struct {
Client client.Client
Config *config.Configuration
metrics *workspaceMetrics
activity *activity.WorkspaceActivity
Client client.Client
Config *config.Configuration
metrics *workspaceMetrics
activity *activity.WorkspaceActivity
maintenance maintenance.Maintenance

subs subscriptions
wsmanapi.UnimplementedWorkspaceManagerServer
Expand All @@ -93,6 +96,10 @@ func (wsm *WorkspaceManagerServer) StartWorkspace(ctx context.Context, req *wsma
tracing.ApplyOWI(span, owi)
defer tracing.FinishSpan(span, &err)

if wsm.maintenance.IsEnabled() {
return &wsmanapi.StartWorkspaceResponse{}, status.Error(codes.FailedPrecondition, "under maintenance")
}

if err := validateStartWorkspaceRequest(req); err != nil {
return nil, err
}
Expand Down Expand Up @@ -329,6 +336,10 @@ func (wsm *WorkspaceManagerServer) StopWorkspace(ctx context.Context, req *wsman
tracing.ApplyOWI(span, owi)
defer tracing.FinishSpan(span, &err)

if wsm.maintenance.IsEnabled() {
return &wsmanapi.StopWorkspaceResponse{}, status.Error(codes.FailedPrecondition, "under maintenance")
}

gracePeriod := stopWorkspaceNormallyGracePeriod
if req.Policy == wsmanapi.StopWorkspacePolicy_IMMEDIATELY {
span.LogKV("policy", "immediately")
Expand Down Expand Up @@ -563,6 +574,10 @@ func (wsm *WorkspaceManagerServer) TakeSnapshot(ctx context.Context, req *wsmana
tracing.ApplyOWI(span, log.OWI("", "", req.Id))
defer tracing.FinishSpan(span, &err)

if wsm.maintenance.IsEnabled() {
return &wsmanapi.TakeSnapshotResponse{}, status.Error(codes.FailedPrecondition, "under maintenance")
}

var ws workspacev1.Workspace
err = wsm.Client.Get(ctx, types.NamespacedName{Namespace: wsm.Config.Namespace, Name: req.Id}, &ws)
if errors.IsNotFound(err) {
Expand Down
Loading