Skip to content

Commit

Permalink
Merge pull request #591 from ialidzhikov/enh/pdb-check
Browse files Browse the repository at this point in the history
Check for misconfigured PDBs on Node drain and set proper error description
  • Loading branch information
prashanth26 authored Jan 29, 2021
2 parents 6eb440c + 5afdbf0 commit e10d43a
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 4 deletions.
7 changes: 7 additions & 0 deletions kubernetes/deployment/out-of-tree/target-cluster-role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,10 @@ rules:
verbs:
- list
- watch
- apiGroups:
- policy
resources:
- poddisruptionbudgets
verbs:
- watch
- list
1 change: 1 addition & 0 deletions pkg/util/provider/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ func StartControllers(s *options.MCServer,
targetCoreInformerFactory.Core().V1().PersistentVolumes(),
controlCoreInformerFactory.Core().V1().Secrets(),
targetCoreInformerFactory.Core().V1().Nodes(),
targetCoreInformerFactory.Policy().V1beta1().PodDisruptionBudgets(),
machineSharedInformers.MachineClasses(),
machineSharedInformers.Machines(),
recorder,
Expand Down
57 changes: 54 additions & 3 deletions pkg/util/provider/drain/drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,15 @@ import (
"github.com/gardener/machine-controller-manager/pkg/util/provider/driver"
api "k8s.io/api/core/v1"
corev1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1beta1"
policyv1beta1 "k8s.io/api/policy/v1beta1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
corelisters "k8s.io/client-go/listers/core/v1"
policylisters "k8s.io/client-go/listers/policy/v1beta1"
"k8s.io/klog"
)

Expand All @@ -63,6 +64,7 @@ type Options struct {
Driver driver.Driver
pvcLister corelisters.PersistentVolumeClaimLister
pvLister corelisters.PersistentVolumeLister
pdbLister policylisters.PodDisruptionBudgetLister
drainStartedOn time.Time
drainEndedOn time.Time
}
Expand Down Expand Up @@ -133,6 +135,7 @@ func NewDrainOptions(
driver driver.Driver,
pvcLister corelisters.PersistentVolumeClaimLister,
pvLister corelisters.PersistentVolumeLister,
pdbLister policylisters.PodDisruptionBudgetLister,
) *Options {

return &Options{
Expand All @@ -151,6 +154,7 @@ func NewDrainOptions(
Driver: driver,
pvcLister: pvcLister,
pvLister: pvLister,
pdbLister: pdbLister,
}

}
Expand Down Expand Up @@ -335,7 +339,7 @@ func (o *Options) evictPod(pod *api.Pod, policyGroupVersion string) error {
gracePeriodSeconds := int64(o.GracePeriodSeconds)
deleteOptions.GracePeriodSeconds = &gracePeriodSeconds
}
eviction := &policy.Eviction{
eviction := &policyv1beta1.Eviction{
TypeMeta: metav1.TypeMeta{
APIVersion: policyGroupVersion,
Kind: EvictionKind,
Expand Down Expand Up @@ -631,7 +635,18 @@ func (o *Options) evictPodsWithPVInternal(attemptEvict bool, pods []*corev1.Pod,

if attemptEvict && apierrors.IsTooManyRequests(err) {
// Pod eviction failed because of PDB violation, we will retry one we are done with this list.
klog.V(3).Info("Pod ", pod.Namespace, "/", pod.Name, " from node ", pod.Spec.NodeName, " couldn't be evicted. This may also occur due to PDB violation. Will be retried. Error:", err)
klog.V(3).Infof("Pod %s/%s couldn't be evicted from node %s. This may also occur due to PDB violation. Will be retried. Error: %v", pod.Namespace, pod.Name, pod.Spec.NodeName, err)

pdb := getPdbForPod(o.pdbLister, pod)
if pdb != nil {
if isMisconfiguredPdb(pdb) {
pdbErr := fmt.Errorf("error while evicting pod %q: pod disruption budget %s/%s is misconfigured and requires zero voluntary evictions",
pod.Name, pdb.Namespace, pdb.Name)
returnCh <- pdbErr
continue
}
}

retryPods = append(retryPods, pod)
continue
} else if apierrors.IsNotFound(err) {
Expand Down Expand Up @@ -840,6 +855,18 @@ func (o *Options) evictPodWithoutPVInternal(attemptEvict bool, pod *corev1.Pod,
return
} else if attemptEvict && apierrors.IsTooManyRequests(err) {
// Pod couldn't be evicted because of PDB violation
klog.V(3).Infof("Pod %s/%s couldn't be evicted from node %s. This may also occur due to PDB violation. Will be retried. Error: %v", pod.Namespace, pod.Name, pod.Spec.NodeName, err)

pdb := getPdbForPod(o.pdbLister, pod)
if pdb != nil {
if isMisconfiguredPdb(pdb) {
pdbErr := fmt.Errorf("error while evicting pod %q: pod disruption budget %s/%s is misconfigured and requires zero voluntary evictions",
pod.Name, pdb.Namespace, pdb.Name)
returnCh <- pdbErr
return
}
}

time.Sleep(PodEvictionRetryInterval)
} else {
returnCh <- fmt.Errorf("error when evicting pod %q: %v scheduled on node %v", pod.Name, err, pod.Spec.NodeName)
Expand Down Expand Up @@ -956,3 +983,27 @@ func (o *Options) RunCordonOrUncordon(desired bool) error {
}
return nil
}

func getPdbForPod(pdbLister policylisters.PodDisruptionBudgetLister, pod *corev1.Pod) *policyv1beta1.PodDisruptionBudget {
// GetPodPodDisruptionBudgets returns an error only if no PodDisruptionBudgets are found.
// We don't return that as an error to the caller.
pdbs, err := pdbLister.GetPodPodDisruptionBudgets(pod)
if err != nil {
klog.V(4).Infof("No PodDisruptionBudgets found for pod %s/%s.", pod.Namespace, pod.Name)
return nil
}

if len(pdbs) > 1 {
klog.Warningf("Pod %s/%s matches multiple PodDisruptionBudgets. Chose %q arbitrarily.", pod.Namespace, pod.Name, pdbs[0].Name)
}

return pdbs[0]
}

func isMisconfiguredPdb(pdb *policyv1beta1.PodDisruptionBudget) bool {
if pdb.ObjectMeta.Generation != pdb.Status.ObservedGeneration {
return false
}

return pdb.Status.ExpectedPods > 0 && pdb.Status.CurrentHealthy >= pdb.Status.ExpectedPods && pdb.Status.PodDisruptionsAllowed == 0
}
9 changes: 8 additions & 1 deletion pkg/util/provider/machinecontroller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,11 @@ import (
runtimeutil "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
coreinformers "k8s.io/client-go/informers/core/v1"
policyinformers "k8s.io/client-go/informers/policy/v1beta1"
"k8s.io/client-go/kubernetes"
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
corelisters "k8s.io/client-go/listers/core/v1"
policylisters "k8s.io/client-go/listers/policy/v1beta1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
Expand Down Expand Up @@ -72,6 +74,7 @@ func NewController(
pvInformer coreinformers.PersistentVolumeInformer,
secretInformer coreinformers.SecretInformer,
nodeInformer coreinformers.NodeInformer,
pdbInformer policyinformers.PodDisruptionBudgetInformer,
machineClassInformer machineinformers.MachineClassInformer,
machineInformer machineinformers.MachineInformer,
recorder record.EventRecorder,
Expand Down Expand Up @@ -115,12 +118,14 @@ func NewController(
controller.pvcLister = pvcInformer.Lister()
controller.pvLister = pvInformer.Lister()
controller.secretLister = secretInformer.Lister()
controller.pdbLister = pdbInformer.Lister()
controller.machineClassLister = machineClassInformer.Lister()
controller.nodeLister = nodeInformer.Lister()
controller.machineLister = machineInformer.Lister()

// Controller syncs
controller.secretSynced = secretInformer.Informer().HasSynced
controller.pdbSynced = pdbInformer.Informer().HasSynced
controller.machineClassSynced = machineClassInformer.Informer().HasSynced
controller.nodeSynced = nodeInformer.Informer().HasSynced
controller.machineSynced = machineInformer.Informer().HasSynced
Expand Down Expand Up @@ -207,6 +212,7 @@ type controller struct {
pvLister corelisters.PersistentVolumeLister
secretLister corelisters.SecretLister
nodeLister corelisters.NodeLister
pdbLister policylisters.PodDisruptionBudgetLister
machineClassLister machinelisters.MachineClassLister
machineLister machinelisters.MachineLister
// queues
Expand All @@ -218,6 +224,7 @@ type controller struct {
machineSafetyAPIServerQueue workqueue.RateLimitingInterface
// syncs
secretSynced cache.InformerSynced
pdbSynced cache.InformerSynced
nodeSynced cache.InformerSynced
machineClassSynced cache.InformerSynced
machineSynced cache.InformerSynced
Expand All @@ -237,7 +244,7 @@ func (c *controller) Run(workers int, stopCh <-chan struct{}) {
defer c.machineSafetyOrphanVMsQueue.ShutDown()
defer c.machineSafetyAPIServerQueue.ShutDown()

if !cache.WaitForCacheSync(stopCh, c.secretSynced, c.nodeSynced, c.machineClassSynced, c.machineSynced) {
if !cache.WaitForCacheSync(stopCh, c.secretSynced, c.pdbSynced, c.nodeSynced, c.machineClassSynced, c.machineSynced) {
runtimeutil.HandleError(fmt.Errorf("Timed out waiting for caches to sync"))
return
}
Expand Down
1 change: 1 addition & 0 deletions pkg/util/provider/machinecontroller/machine_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -1007,6 +1007,7 @@ func (c *controller) drainNode(deleteMachineRequest *driver.DeleteMachineRequest
c.driver,
c.pvcLister,
c.pvLister,
c.pdbLister,
)
err = drainOptions.RunDrain()
if err == nil {
Expand Down

0 comments on commit e10d43a

Please sign in to comment.