Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add new extra component to --wait=all to validate a healthy cluster #10424

Merged
merged 6 commits into from
Feb 16, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions pkg/minikube/bootstrapper/bsutil/kverify/kverify.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,18 @@ const (
NodeReadyKey = "node_ready"
// KubeletKey is the name used in the flags for waiting for the kubelet status to be ready
KubeletKey = "kubelet"
// ExtraKey is the name used for extra waiting for pods in CorePodsList to be Ready
ExtraKey = "extra"
)

// vars related to the --wait flag
var (
// DefaultComponents is a map of the default components to wait for
DefaultComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true}
// NoComponents is the map of components to wait for if 'none' or 'false' is specified
NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false, KubeletKey: false}
NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false, KubeletKey: false, ExtraKey: false}
// AllComponents is map for waiting for all components.
AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true, NodeReadyKey: true, KubeletKey: true}
AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true, NodeReadyKey: true, KubeletKey: true, ExtraKey: true}
// DefaultWaitList is list of all default components to wait for. only names to be used for start flags.
DefaultWaitList = []string{APIServerWaitKey, SystemPodsWaitKey}
// AllComponentsList is the list of all valid component keys to wait for; only names to be used for start flags.
Expand All @@ -60,6 +62,15 @@ var (
"kube-proxy",
"kube-scheduler",
}
// CorePodsList is a list of essential pods for running Kubernetes that the extra wait requires to be Ready
CorePodsList = []string{
"kube-dns", // coredns
"etcd",
"kube-apiserver",
"kube-controller-manager",
"kube-proxy",
"kube-scheduler",
}
)

// ShouldWait will return true if the config says need to wait
Expand Down
175 changes: 175 additions & 0 deletions pkg/minikube/bootstrapper/bsutil/kverify/pod_ready.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
/*
Copyright 2021 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package kverify verifies a running Kubernetes cluster is healthy
package kverify

import (
"fmt"
"strings"
"time"

"github.com/pkg/errors"
core "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
)

// WaitExtra calls WaitForPodReadyByLabel in the "kube-system" namespace for
// each entry of labels, in order, and reports every failure it encountered.
//
// timeout is handed unchanged to each WaitForPodReadyByLabel call, so it
// bounds every label individually rather than the loop as a whole. A failure
// for one label does not stop the remaining labels from being checked.
//
// It returns nil when every call succeeded; otherwise it returns one error
// whose message lists each failing label together with its error text.
func WaitExtra(cs *kubernetes.Clientset, labels []string, timeout time.Duration) error {
	klog.Infof("extra waiting for kube-system core pods %s to be Ready ...", labels)
	start := time.Now()
	defer func() {
		klog.Infof("duration metric: took %s for extra waiting for kube-system core pods to be Ready ...", time.Since(start))
	}()

	var errs []string
	for _, label := range labels {
		if err := WaitForPodReadyByLabel(cs, label, "kube-system", timeout); err != nil {
			errs = append(errs, fmt.Sprintf("%q: %q", label, err.Error()))
		}
	}
	if len(errs) > 0 {
		// The joined text embeds arbitrary error messages, so it is passed as
		// an argument; using it as the format string would mangle any '%' in it.
		return fmt.Errorf("%s", strings.Join(errs, ", "))
	}

	return nil
}

// WaitForPodReadyByLabel waits for pod with label ([key:]val) in a namespace to be in Ready condition.
// If namespace is not provided, it defaults to "kube-system".
// If label key is not provided, it will try with "component" and "k8s-app".
//
// label is split on ":"; one part is taken as the value only, two parts as
// key and value (both trimmed), and anything else is rejected as malformed.
// On every poll all pods of the namespace are listed and matched client-side;
// the first pod carrying a matching label decides that poll's outcome.
// It returns nil once such a pod is Ready, or an error for a malformed label
// or when waiting ends without a Ready pod.
func WaitForPodReadyByLabel(cs *kubernetes.Clientset, label, namespace string, timeout time.Duration) error {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this func is not used outside, make private.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done!

// NOTE(review): this message is emitted before the namespace default below is
// applied, so an empty namespace argument is logged as "".
klog.Infof("waiting %v for pod with %q label in %q namespace to be Ready ...", timeout, label, namespace)
start := time.Now()
defer func() {
klog.Infof("duration metric: took %v to run WaitForPodReadyByLabel for pod with %q label in %q namespace ...", time.Since(start), label, namespace)
}()

if namespace == "" {
namespace = "kube-system"
}

// Parse "[key:]val" into lkey (may stay empty) and lval.
lkey := ""
lval := ""
l := strings.Split(label, ":")
switch len(l) {
case 1: // treat as no label key provided, just val
lval = strings.TrimSpace(l[0])
case 2:
lkey = strings.TrimSpace(l[0])
lval = strings.TrimSpace(l[1])
default:
return fmt.Errorf("pod label %q is malformed", label)
}

// lap records when the last "not ready" reason was logged, to throttle logging.
lap := time.Now()
// checkReady is the poll condition: (true, nil) stops successfully,
// (false, nil) retries, and a non-nil error aborts the poll.
checkReady := func() (bool, error) {
// Enforce the caller-supplied timeout, measured from function entry.
if time.Since(start) > timeout {
return false, fmt.Errorf("wait for pod with %q label in %q namespace to be Ready timed out", label, namespace)
}
pods, err := cs.CoreV1().Pods(namespace).List(meta.ListOptions{})
if err != nil {
// Listing errors are logged and retried rather than treated as fatal.
klog.Infof("error listing pods in %q namespace, will retry: %v", namespace, err)
return false, nil
}
for _, pod := range pods.Items {
for k, v := range pod.ObjectMeta.Labels {
// Without an explicit key, only the "component" and "k8s-app" label keys are considered.
if ((lkey == "" && (k == "component" || k == "k8s-app")) || lkey == k) && v == lval {
ready, reason := IsPodReady(&pod)
if ready {
klog.Info(reason)
return true, nil
}
// reduce log spam: log the not-ready reason at most once per second
if time.Since(lap) > (1 * time.Second) {
klog.Info(reason)
lap = time.Now()
}
// The first matching pod is not Ready: retry without inspecting other pods.
return false, nil
}
}
}
klog.Infof("pod with %q label in %q namespace was not found, will retry", label, namespace)
return false, nil
}
// NOTE(review): the poll itself is bounded by kconst.DefaultControlPlaneTimeout,
// so a larger timeout argument is effectively capped at that value - confirm this is intended.
if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, checkReady); err != nil {
return errors.Wrapf(err, "wait pod Ready")
}

return nil
}

// WaitForPodReadyByName waits for the pod called name in namespace to be in
// Ready condition, as judged by IsPodReady.
//
// If namespace is empty, it defaults to "kube-system". The pod is fetched
// every kconst.APICallRetryInterval for at most timeout; errors while
// fetching it are logged and retried rather than treated as fatal.
//
// It returns nil once the pod is Ready, or an error wrapped with
// "wait pod Ready" when timeout elapses first.
func WaitForPodReadyByName(cs *kubernetes.Clientset, name, namespace string, timeout time.Duration) error {
	// Resolve the default first so that the log lines below name the
	// namespace that is actually queried.
	if namespace == "" {
		namespace = "kube-system"
	}

	klog.Infof("waiting %v for pod %q in %q namespace to be Ready ...", timeout, name, namespace)
	start := time.Now()
	defer func() {
		klog.Infof("duration metric: took %v to run WaitForPodReadyByName for pod %q in %q namespace ...", time.Since(start), name, namespace)
	}()

	// lap records when the last "not ready" reason was logged, to throttle logging.
	lap := time.Now()
	// checkReady is the poll condition: (true, nil) stops successfully,
	// (false, nil) retries, and a non-nil error aborts the poll.
	checkReady := func() (bool, error) {
		if time.Since(start) > timeout {
			return false, fmt.Errorf("wait for pod %q in %q namespace to be Ready timed out", name, namespace)
		}
		pod, err := cs.CoreV1().Pods(namespace).Get(name, meta.GetOptions{})
		if err != nil {
			klog.Infof("error getting pod %q in %q namespace, will retry: %v", name, namespace, err)
			return false, nil
		}
		ready, reason := IsPodReady(pod)
		if ready {
			klog.Info(reason)
			return true, nil
		}
		// reduce log spam: log the not-ready reason at most once per second
		if time.Since(lap) > (1 * time.Second) {
			klog.Info(reason)
			lap = time.Now()
		}
		return false, nil
	}
	// Poll for the caller's timeout; a fixed control-plane timeout here would
	// silently cut short any longer timeout the caller asked for.
	if err := wait.PollImmediate(kconst.APICallRetryInterval, timeout, checkReady); err != nil {
		return errors.Wrapf(err, "wait pod Ready")
	}

	return nil
}

// IsPodReady reports whether pod is Ready, together with a verbose,
// human-readable explanation of the verdict.
//
// A pod counts as Ready only when its phase is Running and its PodReady
// condition has status True. A pod that is not Running, whose PodReady
// condition is not True, or that has no PodReady condition at all is
// reported as not ready, with the reason naming which of these applied.
func IsPodReady(pod *core.Pod) (ready bool, reason string) {
	status := pod.Status
	if status.Phase != core.PodRunning {
		return false, fmt.Sprintf("pod %q in %q namespace is not Running: %+v", pod.Name, pod.Namespace, status)
	}

	for i := range status.Conditions {
		cond := status.Conditions[i]
		if cond.Type != core.PodReady {
			continue
		}
		// Only the first PodReady condition found is consulted.
		if cond.Status == core.ConditionTrue {
			return true, fmt.Sprintf("pod %q in %q namespace is Ready: %+v", pod.Name, pod.Namespace, cond)
		}
		return false, fmt.Sprintf("pod %q in %q namespace is not Ready: %+v", pod.Name, pod.Namespace, cond)
	}

	return false, fmt.Sprintf("pod %q in %q namespace does not have %q status: %+v", pod.Name, pod.Namespace, core.PodReady, status)
}
39 changes: 38 additions & 1 deletion pkg/minikube/bootstrapper/kubeadm/kubeadm.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"github.com/docker/machine/libmachine"
"github.com/docker/machine/libmachine/state"
"github.com/pkg/errors"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
Expand Down Expand Up @@ -470,6 +471,12 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
return nil
}

if cfg.VerifyComponents[kverify.ExtraKey] {
if err := kverify.WaitExtra(client, kverify.CorePodsList, timeout); err != nil {
return errors.Wrap(err, "extra waiting")
}
}

cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return errors.Wrapf(err, "create runtme-manager %s", cfg.KubernetesConfig.ContainerRuntime)
Expand Down Expand Up @@ -504,11 +511,11 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
}
}
}

if cfg.VerifyComponents[kverify.KubeletKey] {
if err := kverify.WaitForService(k.c, "kubelet", timeout); err != nil {
return errors.Wrap(err, "waiting for kubelet")
}

}

if cfg.VerifyComponents[kverify.NodeReadyKey] {
Expand Down Expand Up @@ -658,6 +665,35 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
}
}

if cfg.VerifyComponents[kverify.ExtraKey] {
// after kubelet is restarted (with 'kubeadm init phase kubelet-start' above),
// it appears as to be immediately Ready as well as all kube-system pods,
// then (after ~10sec) it realises it has some changes to apply, implying also pods restarts,
// and by that time we would exit completely, so we wait until kubelet begins restarting pods
klog.Info("waiting for restarted kubelet to initialise ...")
start := time.Now()
wait := func() error {
pods, err := client.CoreV1().Pods("kube-system").List(meta.ListOptions{})
if err != nil {
return err
}
for _, pod := range pods.Items {
if pod.Labels["tier"] == "control-plane" {
if ready, _ := kverify.IsPodReady(&pod); !ready {
return nil
}
}
}
return fmt.Errorf("kubelet not initialised")
}
_ = retry.Expo(wait, 250*time.Millisecond, 1*time.Minute)
klog.Infof("kubelet initialised")
klog.Infof("duration metric: took %s waiting for restarted kubelet to initialise ...", time.Since(start))
if err := kverify.WaitExtra(client, kverify.CorePodsList, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "extra")
}
}

cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return errors.Wrap(err, "runtime")
Expand Down Expand Up @@ -698,6 +734,7 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
if err := bsutil.AdjustResourceLimits(k.c); err != nil {
klog.Warningf("unable to adjust resource limits: %v", err)
}

return nil
}

Expand Down
26 changes: 17 additions & 9 deletions test/integration/functional_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,6 @@ func validateDockerEnv(ctx context.Context, t *testing.T, profile string) {
if !strings.Contains(rr.Output(), expectedImgInside) {
t.Fatalf("expected 'docker images' to have %q inside minikube. but the output is: *%s*", expectedImgInside, rr.Output())
}

}

func validateStartWithProxy(ctx context.Context, t *testing.T, profile string) {
Expand All @@ -269,7 +268,7 @@ func validateStartWithProxy(ctx context.Context, t *testing.T, profile string) {

// Use more memory so that we may reliably fit MySQL and nginx
// changing api server so later in soft start we verify it didn't change
startArgs := append([]string{"start", "-p", profile, "--memory=4000", fmt.Sprintf("--apiserver-port=%d", apiPortTest), "--wait=true"}, StartArgs()...)
startArgs := append([]string{"start", "-p", profile, "--memory=4000", fmt.Sprintf("--apiserver-port=%d", apiPortTest), "--wait=all"}, StartArgs()...)
c := exec.CommandContext(ctx, Target(), startArgs...)
env := os.Environ()
env = append(env, fmt.Sprintf("HTTP_PROXY=%s", srv.Addr))
Expand Down Expand Up @@ -401,15 +400,14 @@ func validateMinikubeKubectlDirectCall(ctx context.Context, t *testing.T, profil
if err != nil {
t.Fatalf("failed to run kubectl directly. args %q: %v", rr.Command(), err)
}

}

func validateExtraConfig(ctx context.Context, t *testing.T, profile string) {
defer PostMortemLogs(t, profile)

start := time.Now()
// The tests before this already created a profile, starting minikube with different --extra-config cmdline option.
startArgs := []string{"start", "-p", profile, "--extra-config=apiserver.enable-admission-plugins=NamespaceAutoProvision"}
startArgs := []string{"start", "-p", profile, "--extra-config=apiserver.enable-admission-plugins=NamespaceAutoProvision", "--wait=all"}
c := exec.CommandContext(ctx, Target(), startArgs...)
rr, err := Run(t, c)
if err != nil {
Expand All @@ -427,7 +425,6 @@ func validateExtraConfig(ctx context.Context, t *testing.T, profile string) {
if !strings.Contains(afterCfg.Config.KubernetesConfig.ExtraOptions.String(), expectedExtraOptions) {
t.Errorf("expected ExtraOptions to contain %s but got %s", expectedExtraOptions, afterCfg.Config.KubernetesConfig.ExtraOptions.String())
}

}

// imageID returns a docker image id for image `image` and current architecture
Expand All @@ -451,6 +448,7 @@ func imageID(image string) string {
}

// validateComponentHealth asserts that all Kubernetes components are healthy
// note: it expects all components to be Ready, so it makes sense to run it close after only those tests that include '--wait=all' start flag (ie, with extra wait)
func validateComponentHealth(ctx context.Context, t *testing.T, profile string) {
defer PostMortemLogs(t, profile)

Expand All @@ -474,12 +472,22 @@ func validateComponentHealth(ctx context.Context, t *testing.T, profile string)

for _, i := range cs.Items {
for _, l := range i.Labels {
t.Logf("%s phase: %s", l, i.Status.Phase)
_, ok := found[l]
if ok {
if _, ok := found[l]; ok { // skip irrelevant (eg, repeating/redundant '"tier": "control-plane"') labels
found[l] = true
if i.Status.Phase != "Running" {
t.Logf("%s phase: %s", l, i.Status.Phase)
if i.Status.Phase != api.PodRunning {
t.Errorf("%s is not Running: %+v", l, i.Status)
continue
}
for _, c := range i.Status.Conditions {
if c.Type == api.PodReady {
if c.Status != api.ConditionTrue {
t.Errorf("%s is not Ready: %+v", l, i.Status)
} else {
t.Logf("%s status: %s", l, c.Type)
}
break
}
}
}
}
Expand Down