Skip to content

Commit

Permalink
Handle Unknown -> Ready members
Browse files Browse the repository at this point in the history
  • Loading branch information
timuthy committed Jun 4, 2021
1 parent 82fce48 commit 134958d
Show file tree
Hide file tree
Showing 14 changed files with 363 additions and 105 deletions.
2 changes: 2 additions & 0 deletions api/v1alpha1/etcd_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,8 @@ type EtcdMemberStatus struct {
Name string `json:"name"`
// ID is the ID of the etcd member.
ID string `json:"id"`
// PodRef is the reference to the Pod which hosts the etcd member.
PodRef corev1.LocalObjectReference `json:"podRef"`
// Role is the role in the etcd cluster, either `Member` or `Learner`.
Role EtcdRole `json:"role"`
// Status of the condition, one of True, False, Unknown.
Expand Down
1 change: 1 addition & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions config/crd/bases/druid.gardener.cloud_etcds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,15 @@ spec:
name:
description: Name is the name of the etcd member.
type: string
podRef:
description: PodRef is the reference to the Pod which hosts
the etcd member.
properties:
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
type: object
reason:
description: The reason for the condition's last transition.
type: string
Expand All @@ -565,6 +574,7 @@ spec:
- lastTransitionTime
- lastUpdateTime
- name
- podRef
- reason
- role
- status
Expand Down
13 changes: 11 additions & 2 deletions controllers/config/custodian.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,15 @@ import "time"

// EtcdCustodianController contains configuration for the etcd custodian controller.
type EtcdCustodianController struct {
// NOTE(review): the next two lines appear to be the removed side of this diff
// hunk (SyncPeriod is declared again below) — confirm against the real file.
EtcdStaleMemberThreshold time.Duration
SyncPeriod time.Duration
// EtcdMember holds configuration related to etcd members.
EtcdMember EtcdMemberConfig
// SyncPeriod is the duration after which re-enqueuing happens.
SyncPeriod time.Duration
}

// EtcdMemberConfig holds configuration related to etcd members.
type EtcdMemberConfig struct {
// EtcdMemberUnknownThreshold is the duration after which an etcd member's state is considered `Unknown`.
EtcdMemberUnknownThreshold time.Duration
// EtcdMemberNotReadyThreshold is the duration after which an etcd member's state is considered `NotReady`.
EtcdMemberNotReadyThreshold time.Duration
}
9 changes: 6 additions & 3 deletions controllers/controllers_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (
"time"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
"github.com/gardener/etcd-druid/controllers/config"
controllersconfig "github.com/gardener/etcd-druid/controllers/config"

"github.com/gardener/gardener/pkg/utils/test"
. "github.com/onsi/ginkgo"
Expand Down Expand Up @@ -110,8 +110,11 @@ var _ = BeforeSuite(func(done Done) {
err = er.SetupWithManager(mgr, 1, true)
Expect(err).NotTo(HaveOccurred())

custodian := NewEtcdCustodian(mgr, config.EtcdCustodianController{
EtcdStaleMemberThreshold: 1 * time.Minute,
custodian := NewEtcdCustodian(mgr, controllersconfig.EtcdCustodianController{
EtcdMember: controllersconfig.EtcdMemberConfig{
EtcdMemberUnknownThreshold: 1 * time.Minute,
EtcdMemberNotReadyThreshold: 1 * time.Minute,
},
})

err = custodian.SetupWithManager(mgrCtx, mgr, 1)
Expand Down
10 changes: 5 additions & 5 deletions controllers/etcd_custodian_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/source"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
"github.com/gardener/etcd-druid/controllers/config"
controllersconfig "github.com/gardener/etcd-druid/controllers/config"
"github.com/gardener/etcd-druid/pkg/health/status"
druidmapper "github.com/gardener/etcd-druid/pkg/mapper"
druidpredicates "github.com/gardener/etcd-druid/pkg/predicate"
Expand All @@ -48,11 +48,11 @@ type EtcdCustodian struct {
client.Client
Scheme *runtime.Scheme
logger logr.Logger
config config.EtcdCustodianController
config controllersconfig.EtcdCustodianController
}

// NewEtcdCustodian creates a new EtcdCustodian object
func NewEtcdCustodian(mgr manager.Manager, config config.EtcdCustodianController) *EtcdCustodian {
func NewEtcdCustodian(mgr manager.Manager, config controllersconfig.EtcdCustodianController) *EtcdCustodian {
return &EtcdCustodian{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Expand Down Expand Up @@ -94,8 +94,8 @@ func (ec *EtcdCustodian) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
return ctrl.Result{}, err
}

statusCheck := status.NewChecker(ec.config)
if err := statusCheck.Check(ctx, &etcd.Status); err != nil {
statusCheck := status.NewChecker(ec.Client, ec.config)
if err := statusCheck.Check(ctx, etcd); err != nil {
logger.Error(err, "Error executing status checks")
return ctrl.Result{}, err
}
Expand Down
17 changes: 11 additions & 6 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
"github.com/gardener/etcd-druid/controllers"
"github.com/gardener/etcd-druid/controllers/config"
controllersconfig "github.com/gardener/etcd-druid/controllers/config"

"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
Expand Down Expand Up @@ -57,7 +57,8 @@ func main() {
custodianSyncPeriod time.Duration
ignoreOperationAnnotation bool

etcdStaleMemberThreshold time.Duration
etcdMemberUnknownThreshold time.Duration
etcdMemberNotReadyThreshold time.Duration

// TODO: migrate default to `leases` in one of the next releases
defaultLeaderElectionResourceLock = resourcelock.ConfigMapsLeasesResourceLock
Expand All @@ -75,7 +76,8 @@ func main() {
flag.StringVar(&leaderElectionResourceLock, "leader-election-resource-lock", defaultLeaderElectionResourceLock, "Which resource type to use for leader election. "+
"Supported options are 'endpoints', 'configmaps', 'leases', 'endpointsleases' and 'configmapsleases'.")
flag.BoolVar(&ignoreOperationAnnotation, "ignore-operation-annotation", true, "Ignore the operation annotation or not.")
flag.DurationVar(&etcdStaleMemberThreshold, "etcd-member-threshold", 1*time.Minute, "Threshold after which an etcd member status is considered unknown if no heartbeat happened.")
flag.DurationVar(&etcdMemberUnknownThreshold, "etcd-member-unknown-threshold", 60*time.Second, "Threshold after which an etcd member status is considered unknown if no heartbeat happened.")
flag.DurationVar(&etcdMemberNotReadyThreshold, "etcd-member-notready-threshold", 5*time.Minute, "Threshold after which an etcd member is considered not ready if the status was unknown before.")

flag.Parse()

Expand Down Expand Up @@ -107,9 +109,12 @@ func main() {
os.Exit(1)
}

custodian := controllers.NewEtcdCustodian(mgr, config.EtcdCustodianController{
EtcdStaleMemberThreshold: etcdStaleMemberThreshold,
SyncPeriod: custodianSyncPeriod,
custodian := controllers.NewEtcdCustodian(mgr, controllersconfig.EtcdCustodianController{
EtcdMember: controllersconfig.EtcdMemberConfig{
EtcdMemberUnknownThreshold: etcdMemberUnknownThreshold,
EtcdMemberNotReadyThreshold: etcdMemberNotReadyThreshold,
},
SyncPeriod: custodianSyncPeriod,
})

if err := custodian.SetupWithManager(ctx, mgr, custodianWorkers); err != nil {
Expand Down
63 changes: 52 additions & 11 deletions pkg/health/etcdmember/check_ready.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,42 +15,83 @@
package etcdmember

import (
"context"
"time"

"github.com/gardener/etcd-druid/controllers/config"
kutil "github.com/gardener/gardener/pkg/utils/kubernetes"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"sigs.k8s.io/controller-runtime/pkg/client"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
controllersconfig "github.com/gardener/etcd-druid/controllers/config"
)

// readyCheck is the value returned by ReadyCheck; its Check method derives
// status results for the members listed in an Etcd resource's status.
type readyCheck struct {
// NOTE(review): this field appears to be the removed side of the diff hunk;
// the `member`-based lines of Check read memberConfig instead — confirm.
etcdStaleMemberThreshold time.Duration
// memberConfig supplies the EtcdMemberUnknownThreshold and
// EtcdMemberNotReadyThreshold durations read by Check.
memberConfig controllersconfig.EtcdMemberConfig
// cl is the client used by checkPodIsRunning to fetch a member's Pod.
cl client.Client
}

// TimeNow is the function used by this check to get the current time.
var TimeNow = time.Now

// Check walks etcd.Status.Members and appends a Result for each member whose
// status it decides on. Per member, in this order:
//  1. status is Unknown and LastTransitionTime + EtcdMemberNotReadyThreshold is
//     before the check time -> NotReady;
//  2. LastUpdateTime + EtcdMemberUnknownThreshold is not before the check time
//     -> no Result (member is skipped);
//  3. the member's Pod is not in the Running phase or cannot be found -> NotReady;
//  4. anything else -> Unknown.
//
// NOTE(review): this hunk interleaves removed and added diff lines without
// markers (two signatures, two loop headers, duplicated result fields). The
// description above follows the lines that use `member`, `checkTime` and
// `r.memberConfig` — confirm against the real file.
func (r *readyCheck) Check(status druidv1alpha1.EtcdStatus) []Result {
func (r *readyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) []Result {
var (
results []Result
threshold = TimeNow().UTC().Add(-1 * r.etcdStaleMemberThreshold)
// checkTime is taken once so every member is compared against the same instant.
checkTime = TimeNow().UTC()
)

for _, etcd := range status.Members {
if etcd.LastUpdateTime.Time.Before(threshold) {
for _, member := range etcd.Status.Members {
// Check if status must be changed from Unknown to NotReady.
if member.Status == druidv1alpha1.EtcdMemeberStatusUnknown &&
member.LastTransitionTime.Time.Add(r.memberConfig.EtcdMemberNotReadyThreshold).Before(checkTime) {
results = append(results, &result{
id: etcd.ID,
status: druidv1alpha1.EtcdMemeberStatusUnknown,
reason: "UnknownMemberStatus",
id: member.ID,
status: druidv1alpha1.EtcdMemeberStatusNotReady,
// NOTE(review): "UnkownStateTimeout" looks like a misspelling of "Unknown".
// It is a runtime value, so it is left untouched here; fix it together
// with anything that matches on this reason.
reason: "UnkownStateTimeout",
})
continue
}

// Skip if status is not already Unknown and LastUpdateTime is within grace period.
if !member.LastUpdateTime.Time.Add(r.memberConfig.EtcdMemberUnknownThreshold).Before(checkTime) {
continue
}

// If pod is not running or cannot be found then we deduce that the status is NotReady.
ready, err := r.checkPodIsRunning(ctx, etcd.Namespace, member)
// Any lookup error other than NotFound, and also a Pod that is running,
// falls through to the Unknown result below.
if (err == nil && !ready) || apierrors.IsNotFound(err) {
results = append(results, &result{
id: member.ID,
status: druidv1alpha1.EtcdMemeberStatusNotReady,
reason: "PodNotRunning",
})
continue
}

// For every other reason the status is Unknown.
results = append(results, &result{
id: member.ID,
status: druidv1alpha1.EtcdMemeberStatusUnknown,
reason: "UnknownMemberStatus",
})
}

return results
}

// checkPodIsRunning fetches the Pod named by member.PodRef.Name in the given
// namespace and reports whether its phase is Running. A failed lookup is
// returned to the caller unchanged, together with false.
func (r *readyCheck) checkPodIsRunning(ctx context.Context, namespace string, member druidv1alpha1.EtcdMemberStatus) (bool, error) {
	var pod corev1.Pod

	podKey := kutil.Key(namespace, member.PodRef.Name)
	err := r.cl.Get(ctx, podKey, &pod)
	if err != nil {
		return false, err
	}

	isRunning := pod.Status.Phase == corev1.PodRunning
	return isRunning, nil
}

// ReadyCheck returns a check for the "Ready" condition.
// The returned readyCheck keeps cl for Pod lookups and config.EtcdMember for
// its member thresholds.
// NOTE(review): the first signature and the etcdStaleMemberThreshold line below
// appear to be the removed side of this diff hunk — confirm against the real file.
func ReadyCheck(config config.EtcdCustodianController) Checker {
func ReadyCheck(cl client.Client, config controllersconfig.EtcdCustodianController) Checker {
return &readyCheck{
etcdStaleMemberThreshold: config.EtcdStaleMemberThreshold,
cl: cl,
memberConfig: config.EtcdMember,
}
}
Loading

0 comments on commit 134958d

Please sign in to comment.