Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add annotation that triggers a volume recreation #13

Merged
merged 1 commit into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/v1alpha1/types_etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ const (
Periodic CompactionMode = "periodic"
// Revision is a constant to set auto-compaction-mode 'revision' for revision number based retention.
Revision CompactionMode = "revision"

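// RecreateVolumesAnnotation is an annotation on the Etcd resource that triggers a one-time recreation of all
// persistent volumes. It is conventionally set to "true"; the controller reacts to the presence of the annotation.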
RecreateVolumesAnnotation = "druid.gardener.cloud/recreate-volumes"
// RecreatedAtAnnotation is used internally to track when the last PVC recreation was triggered.
RecreatedAtAnnotation = "druid.gardener.cloud/recreated-at"
)

// +genclient
Expand Down
1 change: 1 addition & 0 deletions charts/druid/templates/druid-clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ rules:
- get
- list
- watch
- delete
- apiGroups:
- coordination.k8s.io
resourceNames:
Expand Down
9 changes: 7 additions & 2 deletions controllers/etcd/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package etcd
import (
"context"
"fmt"
"time"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
ctrlutils "github.com/gardener/etcd-druid/controllers/utils"
Expand Down Expand Up @@ -175,15 +176,19 @@ func (r *Reconciler) reconcile(ctx context.Context, etcd *druidv1alpha1.Etcd) (c
}, err
}

if err = r.removeOperationAnnotation(ctx, logger, etcd); err != nil {
if err := r.updateAnnotations(ctx, logger, etcd); err != nil {
if apierrors.IsNotFound(err) {
return ctrl.Result{}, nil
}
return ctrl.Result{
Requeue: true,
RequeueAfter: 10 * time.Second,
}, err
}

if requeue, err := r.checkStatefulSetProgress(ctx, logger, etcd); requeue || err != nil {
return ctrl.Result{RequeueAfter: 10 * time.Second}, err
}

result := r.reconcileEtcd(ctx, logger, etcd)
if result.err != nil {
if updateEtcdErr := r.updateEtcdErrorStatus(ctx, etcd, result); updateEtcdErr != nil {
Expand Down
6 changes: 5 additions & 1 deletion controllers/etcd/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,15 @@ func (r *Reconciler) RegisterWithManager(mgr ctrl.Manager, ignoreOperationAnnota
// BuildPredicate builds the predicates used by Etcd controller.
func BuildPredicate(ignoreOperationAnnotation bool) predicate.Predicate {
if ignoreOperationAnnotation {
return predicate.GenerationChangedPredicate{}
return predicate.Or(
predicate.GenerationChangedPredicate{},
druidpredicates.HasRecreateVolumesAnnotation(),
)
}

return predicate.Or(
druidpredicates.HasOperationAnnotation(),
druidpredicates.HasRecreateVolumesAnnotation(),
druidpredicates.LastOperationNotSuccessful(),
predicateutils.IsDeleting(),
)
Expand Down
72 changes: 72 additions & 0 deletions controllers/etcd/volume.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package etcd

import (
"context"
"time"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
"github.com/gardener/etcd-druid/pkg/utils"
v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// updateAnnotations strips the one-shot trigger annotations from the given Etcd and persists the change.
//
// If the recreate-volumes annotation is present it is removed. When a backup store provider is configured, the
// recreated-at annotation is additionally stamped with the current UTC time; otherwise a warning event is recorded
// and no recreation is scheduled. The Gardener operation annotation is removed as well if present. A patch is only
// sent when at least one of the two annotations was found.
func (r *Reconciler) updateAnnotations(ctx context.Context, logger logr.Logger, etcd *druidv1alpha1.Etcd) error {
	// Snapshot the unmodified object first so that the patch only carries the annotation changes made below.
	patch := client.MergeFrom(etcd.DeepCopy())

	_, recreateRequested := etcd.Annotations[druidv1alpha1.RecreateVolumesAnnotation]
	if recreateRequested {
		logger.Info("Removing recreate-volumes annotation", "annotation", druidv1alpha1.RecreateVolumesAnnotation)
		delete(etcd.Annotations, druidv1alpha1.RecreateVolumesAnnotation)

		// Volume recreation is only supported when backups are enabled for this etcd.
		store := etcd.Spec.Backup.Store
		backupEnabled := store != nil && store.Provider != nil && len(*store.Provider) > 0
		if backupEnabled {
			etcd.Annotations[druidv1alpha1.RecreatedAtAnnotation] = time.Now().UTC().Format(time.RFC3339Nano)
		} else {
			r.recorder.Event(etcd, corev1.EventTypeWarning, "SkippingVolumeRecreation", "will not recreate volumes, because backup is not enabled")
		}
	}

	_, operationRequested := etcd.Annotations[v1beta1constants.GardenerOperation]
	if operationRequested {
		logger.Info("Removing operation annotation", "namespace", etcd.Namespace, "name", etcd.Name, "annotation", v1beta1constants.GardenerOperation)
		delete(etcd.Annotations, v1beta1constants.GardenerOperation)
	}

	// Neither trigger annotation was present: nothing changed, so skip the API call.
	if !recreateRequested && !operationRequested {
		return nil
	}
	return r.Patch(ctx, etcd, patch)
}

// checkStatefulSetProgress reports whether reconciliation should be postponed because a changed recreated-at
// annotation still has to be propagated to a StatefulSet that is not ready.
//
// It returns requeue=false when the Etcd has no recreated-at annotation, when the StatefulSet does not exist, when
// the StatefulSet pod template already carries the same annotation value, or when the StatefulSet is ready. Only if
// the annotation differs and the StatefulSet is not ready does it record a warning event and return requeue=true.
// An error is returned only if fetching the StatefulSet fails for a reason other than NotFound.
func (r *Reconciler) checkStatefulSetProgress(ctx context.Context, logger logr.Logger, etcd *druidv1alpha1.Etcd) (requeue bool, err error) {
	wantedTS := etcd.Annotations[druidv1alpha1.RecreatedAtAnnotation]
	if len(wantedTS) == 0 {
		return false, nil
	}

	// The StatefulSet is looked up under the same name and namespace as the Etcd resource.
	sts := &appsv1.StatefulSet{
		ObjectMeta: metav1.ObjectMeta{Namespace: etcd.Namespace, Name: etcd.Name},
	}
	if getErr := r.Client.Get(ctx, client.ObjectKeyFromObject(sts), sts); getErr != nil {
		return false, client.IgnoreNotFound(getErr)
	}

	// Nothing to wait for once the pod template already reflects the requested timestamp.
	if sts.Spec.Template.Annotations[druidv1alpha1.RecreatedAtAnnotation] == wantedTS {
		return false, nil
	}

	// The second return value of IsStatefulSetReady is discarded here.
	if stsReady, _ := utils.IsStatefulSetReady(etcd.Spec.Replicas, sts); !stsReady {
		const msg = "recreatedAt annotation needs to be propagated, but the Statefulset is not ready yet"
		r.recorder.Event(etcd, corev1.EventTypeWarning, "StatefulSetNotReady", msg)
		logger.Info(msg)
		return true, nil
	}
	return false, nil
}
44 changes: 44 additions & 0 deletions controllers/predicate/predicate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,50 @@ var _ = Describe("Druid Predicate", func() {
})
})

// Specs for the HasRecreateVolumesAnnotation predicate. The Context descriptions name the recreate-volumes
// annotation, which is the annotation these cases actually exercise.
Describe("#HasRecreateVolumesAnnotation", func() {
	var pred predicate.Predicate

	JustBeforeEach(func() {
		pred = HasRecreateVolumesAnnotation()
	})

	Context("when has no recreate-volumes annotation", func() {
		BeforeEach(func() {
			obj = &druidv1alpha1.Etcd{
				ObjectMeta: metav1.ObjectMeta{
					Annotations: make(map[string]string),
				},
			}
		})

		It("should return false", func() {
			gomega.Expect(pred.Create(createEvent)).To(gomega.BeFalse())
			gomega.Expect(pred.Update(updateEvent)).To(gomega.BeFalse())
			// Delete events are always let through by this predicate, independent of the annotation.
			gomega.Expect(pred.Delete(deleteEvent)).To(gomega.BeTrue())
			gomega.Expect(pred.Generic(genericEvent)).To(gomega.BeFalse())
		})
	})

	Context("when has recreate-volumes annotation", func() {
		BeforeEach(func() {
			obj = &druidv1alpha1.Etcd{
				ObjectMeta: metav1.ObjectMeta{
					Annotations: map[string]string{
						druidv1alpha1.RecreateVolumesAnnotation: "true",
					},
				},
			}
		})

		It("should return true", func() {
			gomega.Expect(pred.Create(createEvent)).To(gomega.BeTrue())
			gomega.Expect(pred.Update(updateEvent)).To(gomega.BeTrue())
			gomega.Expect(pred.Delete(deleteEvent)).To(gomega.BeTrue())
			gomega.Expect(pred.Generic(genericEvent)).To(gomega.BeTrue())
		})
	})
})

Describe("#OR", func() {
var pred predicate.Predicate

Expand Down
32 changes: 32 additions & 0 deletions controllers/predicate/volume.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package predicate

import (
druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"

"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)

// hasRecreateVolumesAnnotation reports whether obj carries the recreate-volumes annotation. Only the presence of
// the key is checked; its value is not inspected.
func hasRecreateVolumesAnnotation(obj client.Object) bool {
	annotations := obj.GetAnnotations()
	if _, present := annotations[druidv1alpha1.RecreateVolumesAnnotation]; present {
		return true
	}
	return false
}

// HasRecreateVolumesAnnotation is a predicate for the recreate volumes annotation.
//
// Create and generic events pass when the event object carries the annotation, update events pass when the new
// object carries it, and delete events always pass.
func HasRecreateVolumesAnnotation() predicate.Predicate {
	var funcs predicate.Funcs

	funcs.CreateFunc = func(e event.CreateEvent) bool {
		return hasRecreateVolumesAnnotation(e.Object)
	}
	funcs.UpdateFunc = func(e event.UpdateEvent) bool {
		// Only the updated state of the object is relevant.
		return hasRecreateVolumesAnnotation(e.ObjectNew)
	}
	funcs.GenericFunc = func(e event.GenericEvent) bool {
		return hasRecreateVolumesAnnotation(e.Object)
	}
	funcs.DeleteFunc = func(event.DeleteEvent) bool {
		return true
	}

	return funcs
}
55 changes: 55 additions & 0 deletions docs/operations/recreate-volumes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Recreating volumes

In our fork only, we added the ability to recreate volumes as a one-off operation. The recreation is triggered by
annotating the etcd CR with `druid.gardener.cloud/recreate-volumes=true`. This triggers the following sequence of
events:

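1. The annotation is removed, and a new annotation `druid.gardener.cloud/recreated-at` is added with the current timestamp. This only happens if a backup store is configured for the etcd; otherwise the annotation is just removed, a warning event is emitted, and no volumes are recreated.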
2. All PVCs that match the `.spec.labels` and are older than this timestamp are deleted.
   - They are not actually removed until the Pod using them restarts.
3. The annotation `druid.gardener.cloud/recreated-at` is added to the PodTemplate annotations, triggering a new rollout of the StatefulSet
4. The pods are recreated one-by-one by the statefulset-controller

Unfortunately, etcd-druid uses `podManagementPolicy: Parallel` for the StatefulSet, so if any other change is applied
during the recreation, etcd could be in trouble. There are some safeguards in place, but it is best to ensure there are
no further updates during that time.

## Script for recreating etcd-main and blocking changes at the same time

In our environment, we can block customer changes by setting a `stackit.cloud/readonly` annotation on the shoot:

```bash
#!/usr/bin/env bash

# Recreates the volumes of etcd-main for one shoot while the shoot is marked read-only.
#
# usage: ./recreate.sh prd project shoot
#   $1  garden name passed to `gardenctl target --garden`
#   $2  project name
#   $3  shoot name
#
# NOTE(review): with `set -e`, a failing step (e.g. a `kubectl wait` timeout) aborts the script before the
# readonly annotations are removed at the end; they then have to be removed manually.

# Abort on the first failing command, on unset variables and on failures inside pipelines.
set -euo pipefail

ENV="$1"
project="$2"
shoot="$3"

# Target the garden cluster to resolve the project namespace and annotate the shoot.
gardenctl target --garden "$ENV"
projectNS=$(kubectl get project "$project" -o jsonpath="{.spec.namespace}")

# Mark the shoot read-only for the duration of the recreation.
kubectl annotate shoot -n "$projectNS" "$shoot" stackit.cloud/readonly=true
kubectl annotate shoot -n "$projectNS" "$shoot" "stackit.cloud/readonly-message=System Maintenance in Progress"

# Switch to the shoot's control plane, where the etcd resource and its StatefulSet live.
gardenctl target --garden "$ENV" --project "$project" --shoot "$shoot" --control-plane

# Trigger the one-time volume recreation.
kubectl annotate etcd etcd-main druid.gardener.cloud/recreate-volumes=true

# The waits below hard-code a StatefulSet with 3 replicas.
echo "> Waiting for STS to be updated"
kubectl wait --for=jsonpath='{.status.updatedReplicas}'=1 sts etcd-main --timeout=1m

echo "> Waiting for STS to be rolled out"
kubectl wait --for=jsonpath='{.status.updatedReplicas}'=3 sts etcd-main --timeout=10m

echo "> Waiting for STS to be ready"
kubectl wait --for=jsonpath='{.status.readyReplicas}'=3 sts etcd-main --timeout=2m

# Back to the garden cluster: remove the readonly annotations again (trailing "-" deletes an annotation).
gardenctl target --garden "$ENV"
kubectl annotate shoot -n "$projectNS" "$shoot" stackit.cloud/readonly-
kubectl annotate shoot -n "$projectNS" "$shoot" stackit.cloud/readonly-message-

```
3 changes: 2 additions & 1 deletion pkg/component/etcd/statefulset/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ func (c *component) createDeployFlow(ctx context.Context) (*flow.Flow, error) {
// if sts recreation tasks for peer url tls have already been added then there is no need to additionally add tasks to explicitly handle immutable field updates.
taskID = c.addImmutableFieldUpdateTask(g, sts)
}
taskID = c.addTaintPVCsTask(g, sts, taskID)
}
c.addCreateOrPatchTask(g, sts, taskID)

Expand Down Expand Up @@ -419,7 +420,7 @@ func (c *component) createOrPatch(ctx context.Context, sts *appsv1.StatefulSet,
},
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Annotations: c.values.Annotations,
Annotations: getPodAnnotations(c.values, sts),
Labels: utils.MergeStringMaps(make(map[string]string), c.values.AdditionalPodLabels, c.values.Labels),
},
Spec: corev1.PodSpec{
Expand Down
3 changes: 3 additions & 0 deletions pkg/component/etcd/statefulset/values.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,7 @@ type Values struct {

// UseEtcdWrapper enables the use of etcd-wrapper image and a compatible version of etcd-backup-restore
UseEtcdWrapper bool

// RecreatedVolumesAt is the timestamp when the last volume recreation was requested.
RecreatedVolumesAt string
}
3 changes: 2 additions & 1 deletion pkg/component/etcd/statefulset/values_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ func GenerateValues(
ConfigMapName: etcd.GetConfigmapName(),
PeerTLSChangedToEnabled: peerTLSChangedToEnabled,

UseEtcdWrapper: useEtcdWrapper,
UseEtcdWrapper: useEtcdWrapper,
RecreatedVolumesAt: etcd.Annotations[druidv1alpha1.RecreatedAtAnnotation],
}

values.EtcdCommandArgs = getEtcdCommandArgs(values)
Expand Down
81 changes: 81 additions & 0 deletions pkg/component/etcd/statefulset/volumes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package statefulset

import (
"context"
"maps"
"time"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"

"github.com/gardener/gardener/pkg/utils/flow"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

func (c *component) addTaintPVCsTask(g *flow.Graph, sts *appsv1.StatefulSet, taskIDDependency *flow.TaskID) *flow.TaskID {
if c.values.RecreatedVolumesAt == "" ||
sts.Spec.Template.ObjectMeta.Annotations[druidv1alpha1.RecreatedAtAnnotation] == c.values.RecreatedVolumesAt {

return taskIDDependency
}

recreatedTS, err := time.Parse(time.RFC3339Nano, c.values.RecreatedVolumesAt)
if err != nil {
c.logger.Error(err, "recreated-at annotation does not contain a valid timestamp, will not recreate volumes")
// just ignore invalid timestamps, since that means someone messed with the annotations.
maboehm marked this conversation as resolved.
Show resolved Hide resolved
return taskIDDependency
}

var (
dependencies flow.TaskIDs
)
if taskIDDependency != nil {
dependencies = flow.NewTaskIDs(taskIDDependency)
}

taskID := g.Add(flow.Task{
Name: "taint PersistentVolumeClaims",
Fn: func(ctx context.Context) error {
return c.deletePVCs(ctx, recreatedTS)
},
Dependencies: dependencies,
})
c.logger.Info("added taint PersistentVolumeClaims task to the deploy flow", "taskID", taskID, "namespace", c.values.Namespace)

return &taskID
}

// deletePVCs deletes the PersistentVolumeClaims in the component's namespace that match the component's labels and
// were not created after recreatedTS. Claims that already have a deletion timestamp are skipped. NotFound errors
// from Delete are ignored; any other list or delete error is returned immediately, leaving the remaining claims
// untouched.
func (c *component) deletePVCs(ctx context.Context, recreatedTS time.Time) error {
	var claims corev1.PersistentVolumeClaimList
	listOpts := []client.ListOption{
		client.InNamespace(c.values.Namespace),
		client.MatchingLabels(c.values.Labels),
	}
	if err := c.client.List(ctx, &claims, listOpts...); err != nil {
		return err
	}

	for i := range claims.Items {
		claim := &claims.Items[i]

		// Skip claims that are already terminating or that were created after the recreation was requested.
		alreadyDeleting := !claim.DeletionTimestamp.IsZero()
		createdAfterRequest := claim.CreationTimestamp.Time.After(recreatedTS)
		if alreadyDeleting || createdAfterRequest {
			continue
		}

		if err := client.IgnoreNotFound(c.client.Delete(ctx, claim)); err != nil {
			return err
		}

		c.logger.Info("deleted old PersistentVolumeClaim", "namespace", c.values.Namespace, "name", c.values.Name, "pvc", claim.Name)
	}
	return nil
}

// getPodAnnotations returns a fresh map containing a copy of val.Annotations plus the recreated-at annotation.
// The recreated-at value is taken from val.RecreatedVolumesAt when that is non-empty; otherwise a value already
// present on the StatefulSet's pod template is preserved. If neither exists, the annotation is not added.
func getPodAnnotations(val Values, sts *appsv1.StatefulSet) map[string]string {
	const key = druidv1alpha1.RecreatedAtAnnotation

	podAnnotations := make(map[string]string, len(val.Annotations))
	maps.Copy(podAnnotations, val.Annotations)

	existing, found := sts.Spec.Template.ObjectMeta.Annotations[key]
	switch {
	case val.RecreatedVolumesAt != "":
		// A newly requested timestamp takes precedence over whatever the StatefulSet currently has.
		podAnnotations[key] = val.RecreatedVolumesAt
	case found:
		podAnnotations[key] = existing
	}

	return podAnnotations
}
Loading