From e88891f1cd43826b79899a3743925ea13b1b9631 Mon Sep 17 00:00:00 2001
From: Jack Francis
Date: Tue, 2 Feb 2021 20:29:30 -0800
Subject: [PATCH] test: incorporate kured + auto mode into kamino
 vmss-prototype tests

---
 test/e2e/kubernetes/kubernetes_test.go        |  31 ++++-
 test/e2e/kubernetes/node/node.go              |  64 +++++++--
 .../workloads/kured-annotations.yaml          | 129 ++++++++++++++++++
 3 files changed, 209 insertions(+), 15 deletions(-)
 create mode 100644 test/e2e/kubernetes/workloads/kured-annotations.yaml

diff --git a/test/e2e/kubernetes/kubernetes_test.go b/test/e2e/kubernetes/kubernetes_test.go
index 8f18600d19..f2561e3515 100644
--- a/test/e2e/kubernetes/kubernetes_test.go
+++ b/test/e2e/kubernetes/kubernetes_test.go
@@ -2798,6 +2798,11 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
 		It("should be able to install vmss node prototype", func() {
 			if cfg.RunVMSSNodePrototype {
 				if eng.ExpandedDefinition.Properties.HasVMSSAgentPool() {
+					By("Installing kured with node annotations configuration")
+					cmd := exec.Command("k", "apply", "-f", filepath.Join(WorkloadDir, "kured-annotations.yaml"))
+					util.PrintCommand(cmd)
+					_, err := cmd.CombinedOutput()
+					Expect(err).NotTo(HaveOccurred())
 					nodes, err := node.GetReadyWithRetry(1*time.Second, cfg.Timeout)
 					Expect(err).NotTo(HaveOccurred())
 					var numAgentNodes int
@@ -2824,6 +2829,24 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
 						elapsed := time.Since(start)
 						log.Printf("Took %s to schedule %d Pods with large containers via DaemonSet\n", elapsed, numLargeContainerPods)
 					}
+					By("Marking all nodes as needing reboots")
+					for _, n := range nodes {
+						if n.IsLinux() && !controlPlaneNodeRegexp.MatchString(n.Metadata.Name) {
+							err = sshConn.ExecuteRemoteWithRetry(n.Metadata.Name, fmt.Sprintf("\"sudo touch /var/run/reboot-required\""), false, 30*time.Second, cfg.Timeout)
+							Expect(err).NotTo(HaveOccurred())
+						}
+					}
+					By("Waiting for one node to be marked as SchedulingDisabled by kured")
+					ready := node.WaitOnReadyMax(len(nodes)-1, 5*time.Second, cfg.Timeout)
+					Expect(ready).To(BeTrue())
+					By("Waiting for nodes to be rebooted and annotated correctly")
+					_, err = node.WaitForNodesWithAnnotation(numAgentNodes, "weave.works/kured-most-recent-reboot-needed", "", 5*time.Second, cfg.Timeout)
+					Expect(err).NotTo(HaveOccurred())
+					_, err = node.WaitForNodesWithAnnotation(0, "weave.works/kured-reboot-in-progress", "", 1*time.Minute, cfg.Timeout)
+					Expect(err).NotTo(HaveOccurred())
+					By("Waiting for all nodes to be Ready again")
+					ready = node.WaitOnReady(len(nodes), 30*time.Second, cfg.Timeout)
+					Expect(ready).To(Equal(true))
 					By("Choosing a target VMSS node to use as the prototype")
 					var targetNode string
 					for _, n := range nodes {
@@ -2878,7 +2901,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
 						timeToLargeContainerDaemonsetRunningBaseline = time.Since(start)
 						log.Printf("Took %s for large-container-daemonset pod to reach Running state on new node\n", timeToLargeContainerDaemonsetRunningBaseline)
 					}
-					cmd := exec.Command("helm", "status", "vmss-prototype")
+					cmd = exec.Command("helm", "status", "vmss-prototype")
 					out, err := cmd.CombinedOutput()
 					if err == nil {
 						By("Found pre-existing 'vmss-prototype' helm release, deleting it...")
@@ -2893,7 +2916,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
 					} else {
 						commandArgsSlice = append(commandArgsSlice, []string{"vmss-prototype", cfg.KaminoVMSSPrototypeLocalChartPath}...)
 					}
-					commandArgsSlice = append(commandArgsSlice, []string{"--namespace", "default", "--set", "kamino.scheduleOnControlPlane=true", "--set", "kamino.newUpdatedNodes=2", "--set", "kamino.logLevel=DEBUG"}...)
+					commandArgsSlice = append(commandArgsSlice, []string{"--namespace", "default", "--set", "kamino.scheduleOnControlPlane=true", "--set", "kamino.newUpdatedNodes=2", "--set", "kamino.logLevel=DEBUG", "--set", fmt.Sprintf("kamino.targetVMSS=%s", vmssName), "--set", "kamino.auto.lastPatchAnnotation=weave.works/kured-most-recent-reboot-needed", "--set", "kamino.auto.pendingRebootAnnotation=weave.works/kured-reboot-in-progress", "--set", "kamino.auto.minimumReadyTime=1s"}...)
 					if cfg.KaminoVMSSPrototypeImageRegistry != "" {
 						commandArgsSlice = append(commandArgsSlice, []string{"--set", fmt.Sprintf("kamino.container.imageRegistry=%s", cfg.KaminoVMSSPrototypeImageRegistry)}...)
 					}
@@ -2904,9 +2927,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
 						commandArgsSlice = append(commandArgsSlice, []string{"--set", fmt.Sprintf("kamino.container.imageTag=%s", cfg.KaminoVMSSPrototypeImageTag), "--set", "kamino.container.pullByHash=false"}...)
 					}
 					if cfg.KaminoVMSSPrototypeDryRun {
-						commandArgsSlice = append(commandArgsSlice, []string{"--set", fmt.Sprintf("kamino.targetVMSS=%s", vmssName), "--set", "kamino.auto.lastPatchAnnotation=weave.works/kured-most-recent-reboot-needed", "--set", "kamino.auto.pendingRebootAnnotation=weave.works/kured-reboot-in-progress", "--set", "kamino.auto.dryRun=true"}...)
-					} else {
-						commandArgsSlice = append(commandArgsSlice, []string{"--set", fmt.Sprintf("kamino.targetNode=%s", targetNode)}...)
+						commandArgsSlice = append(commandArgsSlice, []string{"--set", "kamino.auto.dryRun=true"}...)
 					}
 					cmd = exec.Command("helm", commandArgsSlice...)
 					util.PrintCommand(cmd)
diff --git a/test/e2e/kubernetes/node/node.go b/test/e2e/kubernetes/node/node.go
index 31ce109f95..c8836a2e22 100644
--- a/test/e2e/kubernetes/node/node.go
+++ b/test/e2e/kubernetes/node/node.go
@@ -384,17 +384,12 @@ func AreMaxNodesReady(nodeCount int) bool {
 	var ready int
 	if list != nil {
 		for _, node := range list.Nodes {
-			nodeReady := node.IsReady()
-			if !nodeReady {
-				return false
+			if node.IsReady() {
+				ready++
 			}
-			ready++
 		}
 	}
-	if ready <= nodeCount {
-		return true
-	}
-	return false
+	return ready <= nodeCount
 }
 
 // WaitOnReady will block until all nodes are in ready state
@@ -484,6 +479,40 @@ func WaitOnReadyMax(nodeCount int, sleep, timeout time.Duration) bool {
 	}
 }
 
+// WaitForNodesWithAnnotation will wait until the desired number of nodes have a particular annotation
+func WaitForNodesWithAnnotation(nodeCount int, key, val string, sleep, timeout time.Duration) ([]Node, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	ch := make(chan GetNodesResult)
+	var mostRecentWaitForNodesWithAnnotationError error
+	var nodes []Node
+	go func() {
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			default:
+				ch <- GetByAnnotationsAsync(key, val)
+				time.Sleep(sleep)
+			}
+		}
+	}()
+	for {
+		select {
+		case result := <-ch:
+			mostRecentWaitForNodesWithAnnotationError = result.Err
+			nodes = result.Nodes
+			if mostRecentWaitForNodesWithAnnotationError == nil {
+				if len(nodes) == nodeCount {
+					return nodes, nil
+				}
+			}
+		case <-ctx.Done():
+			return nil, errors.Errorf("WaitForNodesWithAnnotation timed out: %s\n", mostRecentWaitForNodesWithAnnotationError)
+		}
+	}
+}
+
 // Get returns the current nodes for a given kubeconfig
 func Get() (*List, error) {
 	cmd := exec.Command("k", "get", "nodes", "-o", "json")
@@ -724,6 +753,15 @@ func GetByLabel(label string) ([]Node, error) {
 	return nodes, nil
 }
 
+// GetByAnnotationsAsync wraps GetByAnnotations with a struct response for goroutine + channel usage
+func GetByAnnotationsAsync(key, value string) GetNodesResult {
+	nodes, err := GetByAnnotations(key, value)
+	return GetNodesResult{
+		Nodes: nodes,
+		Err:   err,
+	}
+}
+
 // GetByAnnotations will return a []Node of all nodes that have a matching annotation
 func GetByAnnotations(key, value string) ([]Node, error) {
 	list, err := Get()
@@ -733,8 +771,14 @@ func GetByAnnotations(key, value string) ([]Node, error) {
 
 	nodes := make([]Node, 0)
 	for _, n := range list.Nodes {
-		if n.Metadata.Annotations[key] == value {
-			nodes = append(nodes, n)
+		if value != "" {
+			if n.Metadata.Annotations[key] == value {
+				nodes = append(nodes, n)
+			}
+		} else {
+			if _, ok := n.Metadata.Annotations[key]; ok {
+				nodes = append(nodes, n)
+			}
 		}
 	}
 	return nodes, nil
diff --git a/test/e2e/kubernetes/workloads/kured-annotations.yaml b/test/e2e/kubernetes/workloads/kured-annotations.yaml
new file mode 100644
index 0000000000..06f307243f
--- /dev/null
+++ b/test/e2e/kubernetes/workloads/kured-annotations.yaml
@@ -0,0 +1,129 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kured
+rules:
+# Allow kured to read spec.unschedulable
+# Allow kubectl to drain/uncordon
+#
+# NB: These permissions are tightly coupled to the bundled version of kubectl; the ones below
+# match https://github.com/kubernetes/kubernetes/blob/v1.19.4/staging/src/k8s.io/kubectl/pkg/cmd/drain/drain.go
+#
+- apiGroups: [""]
+  resources: ["nodes"]
+  verbs: ["get", "patch"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["list","delete","get"]
+- apiGroups: ["apps"]
resources: ["daemonsets"] + verbs: ["get"] +- apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kured +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kured +subjects: +- kind: ServiceAccount + name: kured + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: kube-system + name: kured +rules: +# Allow kured to lock/unlock itself +- apiGroups: ["apps"] + resources: ["daemonsets"] + resourceNames: ["kured"] + verbs: ["update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + namespace: kube-system + name: kured +subjects: +- kind: ServiceAccount + namespace: kube-system + name: kured +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kured +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kured + namespace: kube-system +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kured # Must match `--ds-name` + namespace: kube-system # Must match `--ds-namespace` +spec: + selector: + matchLabels: + name: kured + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: kured + spec: + serviceAccountName: kured + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + hostPID: true # Facilitate entering the host mount namespace via init + restartPolicy: Always + containers: + - name: kured + image: docker.io/jackfrancis/kured:node-annotations-chart-e9de81b + # If you find yourself here wondering why there is no + # :latest tag on Docker Hub,see the FAQ in the README + imagePullPolicy: IfNotPresent + securityContext: + privileged: true # Give permission to nsenter /proc/1/ns/mnt + env: + # Pass in the name of the node on which this pod is scheduled + # for use with drain/uncordon operations and lock acquisition + - name: KURED_NODE_ID + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - /usr/bin/kured +# - --alert-filter-regexp=^RebootRequired$ +# - --blocking-pod-selector=runtime=long,cost=expensive +# - --blocking-pod-selector=name=temperamental +# - --blocking-pod-selector=... +# - --ds-name=kured +# - --ds-namespace=kube-system +# - --end-time=23:59:59 +# - --lock-annotation=weave.works/kured-node-lock + - --period=1m +# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local +# - --reboot-days=sun,mon,tue,wed,thu,fri,sat +# - --reboot-sentinel=/var/run/reboot-required +# - --slack-hook-url=https://hooks.slack.com/... +# - --slack-username=prod +# - --slack-channel=alerting +# - --message-template-drain=Draining node %s +# - --message-template-drain=Rebooting node %s +# - --start-time=0:00 +# - --time-zone=UTC + - --annotate-nodes=true