From 359da188036cf31d52bcdb476998d5d5b6f288ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Mattrat?= Date: Tue, 4 May 2021 19:05:08 +0200 Subject: [PATCH 1/6] Advertise the readiness of the driver on each node. Set the label gcs.csi.ofek.dev/driver-ready={true,false} on each node running a controller --- deploy/base/rbac.yaml | 2 +- pkg/driver/driver.go | 13 ++++++++++++- pkg/util/common.go | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/deploy/base/rbac.yaml b/deploy/base/rbac.yaml index 2a0c44a..7e92387 100644 --- a/deploy/base/rbac.yaml +++ b/deploy/base/rbac.yaml @@ -16,7 +16,7 @@ rules: verbs: ["delete"] - apiGroups: [""] resources: ["nodes"] - verbs: ["get", "list", "update"] + verbs: ["get", "list", "update", "patch"] - apiGroups: [""] resources: ["namespaces"] verbs: ["get", "list"] diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index e60a954..835c9b6 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -38,6 +38,11 @@ func NewGCSDriver(name, node, endpoint string, version string, deleteOrphanedPod } func (d *GCSDriver) Run() error { + // set the driver-ready label to false at the beginning to handle edge-case where the controller didn't terminated gracefully + if err := util.SetDriverReadyLabel(d.nodeName, false); err != nil { + klog.V(4).Infof("Unable to set driver-ready=false label on the node, error: %v", err) + } + if len(d.mountPoint) == 0 { return errors.New("--bucket-mount-path is required") } @@ -75,11 +80,17 @@ func (d *GCSDriver) Run() error { csi.RegisterIdentityServer(d.server, d) csi.RegisterNodeServer(d.server, d) csi.RegisterControllerServer(d.server, d) + if err = util.SetDriverReadyLabel(d.nodeName, true); err != nil { + klog.V(4).Infof("unable to set driver-ready=true label on the node, error: %v", err) + } return d.server.Serve(listener) } func (d *GCSDriver) stop() { d.server.Stop() + if err := util.SetDriverReadyLabel(d.nodeName, false); err != nil { + klog.V(4).Infof("Unable to set driver-ready=false label on the node, error: %v", err) + } klog.V(1).Info("CSI driver stopped") } @@ -93,7 +104,7 @@ func (d *GCSDriver) RunPodCleanup() (err error) { // Killing Pod because its Volume is no longer mounted err = util.DeletePod(publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name) if err == nil { - klog.V(4).Infof("Deleted Pod %s/%s bacause its volume was no longer mounted", publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name) + klog.V(4).Infof("Deleted Pod %s/%s because its volume was no longer mounted", publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name) } else { klog.Errorf("Could not delete pod %s/%s because it was no longer mounted because of error: %v", publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name, err) } diff --git a/pkg/util/common.go b/pkg/util/common.go index d98dc5b..9c739ce 100644 --- a/pkg/util/common.go +++ b/pkg/util/common.go @@ -2,6 +2,7 @@ package util import ( "context" + "encoding/json" "fmt" "hash/crc32" "io/ioutil" @@ -20,6 +21,7 @@ import ( "google.golang.org/grpc/status" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/klog" @@ -185,6 +187,38 @@ func GetPvcAnnotations(pvcName string, pvcNamespace string) (annotations map[str return pvc.ObjectMeta.Annotations, nil } +// SetDriverReadyLabel set the label gcs.csi.ofek.dev/driver-ready= on the given node. +func SetDriverReadyLabel(nodeName string, isReady bool) (err error) { + config, err := rest.InClusterConfig() + if err != nil { + return err + } + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return err + } + + patch := []struct { + Op string `json:"op"` + Path string `json:"path"` + Value string `json:"value"` + }{{ + Op: "replace", + Path: "/metadata/labels/gcs.csi.ofek.dev~1driver-ready", + Value: strconv.FormatBool(isReady), + }} + patchBytes, err := json.Marshal(patch) + if err != nil { + return err + } + + _, err = clientset.CoreV1().Nodes().Patch(nodeName, types.JSONPatchType, patchBytes) + if err != nil { + return err + } + return nil +} + func DeletePod(namespace string, name string) (err error) { config, err := rest.InClusterConfig() if err != nil { From 5f9e056912c9d78d256619b7c3ad221c1bb4af49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Mattrat?= Date: Tue, 4 May 2021 20:54:10 +0200 Subject: [PATCH 2/6] upgrade driver-ready set failure to warns --- pkg/driver/driver.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index 835c9b6..af9d371 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -40,7 +40,7 @@ func NewGCSDriver(name, node, endpoint string, version string, deleteOrphanedPod func (d *GCSDriver) Run() error { // set the driver-ready label to false at the beginning to handle edge-case where the controller didn't terminated gracefully if err := util.SetDriverReadyLabel(d.nodeName, false); err != nil { - klog.V(4).Infof("Unable to set driver-ready=false label on the node, error: %v", err) + klog.Warningf("Unable to set driver-ready=false label on the node, error: %v", err) } if len(d.mountPoint) == 0 { @@ -81,7 +81,7 @@ func (d *GCSDriver) Run() error { csi.RegisterNodeServer(d.server, d) csi.RegisterControllerServer(d.server, d) if err = util.SetDriverReadyLabel(d.nodeName, true); err != nil { - klog.V(4).Infof("unable to set driver-ready=true label on the node, error: %v", err) + klog.Warningf("unable to set driver-ready=true label on the node, error: %v", err) } return d.server.Serve(listener) } @@ -89,7 +89,7 @@ func (d *GCSDriver) Run() error { func (d *GCSDriver) stop() { d.server.Stop() if err := util.SetDriverReadyLabel(d.nodeName, false); err != nil { - klog.V(4).Infof("Unable to set driver-ready=false label on the node, error: %v", err) + klog.Warningf("Unable to set driver-ready=false label on the node, error: %v", err) } klog.V(1).Info("CSI driver stopped") } From 8c2f50d446cf486503dc2edb02e5ef81bf4ea2d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Mattrat?= Date: Sat, 8 May 2021 16:37:24 +0200 Subject: [PATCH 3/6] Document how gcs.csi.ofek.dev/driver-ready can be used. --- docs/troubleshooting.md | 45 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 8548324..90e59da 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -1,5 +1,46 @@ # Troubleshooting +## Warnings events from pods scheduled on newly started nodes when mounting csi-gcs volumes ------ +Warnings, like the one below, can be seen from pods scheduled on newly started nodes. -## TODO +``` +MountVolume.MountDevice failed for volume "xxxx" : kubernetes.io/csi: attacher.MountDevice failed to create newCsiDriverClient: driver name gcs.csi.ofek.dev not found in the list of registered CSI drivers +``` + +Those warnings are temporary and reflect the csi-gcs driver is still starting. Kubernetes will retry until the csi-gcs driver is ready. + +It's possible to avoid those warnings by adding a node selector or affinity using the node label `gcs.csi.ofek.dev/driver-ready=true`. + +> Adding such node selector or affinity will trade the time spend waiting for volume mounting retries against time waiting for scheduling. + + +``` +apiVersion: v1 +kind: Pod +metadata: + name: pod-mount-csi-gcs-volume +spec: + // ... + nodeSelector: + gcs.csi.ofek.dev/driver-ready: "true" +``` + +``` +apiVersion: v1 +kind: Pod +metadata: + name: pod-mount-csi-gcs-volume +spec: + // ... + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: gcs.csi.ofek.dev/driver-ready + operator: In + values: + - "true" +``` + +You can also add an admission mutating webhook to automatically inject such node selector or affinity in all pods mounting csi-gcs volumes. From b5463b9ad61417a39f0a20ecca512e38750ca6d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Mattrat?= Date: Sat, 8 May 2021 16:40:01 +0200 Subject: [PATCH 4/6] documentation reword --- docs/troubleshooting.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 90e59da..8466d74 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -11,7 +11,7 @@ Those warnings are temporary and reflect the csi-gcs driver is still starting. K It's possible to avoid those warnings by adding a node selector or affinity using the node label `gcs.csi.ofek.dev/driver-ready=true`. -> Adding such node selector or affinity will trade the time spend waiting for volume mounting retries against time waiting for scheduling. +> Adding such node selector or affinity will trade the time spend waiting for volume mounting retries with time waiting for scheduling. ``` From b13fb3c307c2123deb64641cf38654eea9de6f1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Mattrat?= Date: Sun, 9 May 2021 17:48:54 +0200 Subject: [PATCH 5/6] According to JSON patch RFC, add should be used here, not replace even if k8s implementation seems forgiveful. --- pkg/util/common.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/util/common.go b/pkg/util/common.go index 9c739ce..c238660 100644 --- a/pkg/util/common.go +++ b/pkg/util/common.go @@ -203,7 +203,7 @@ func SetDriverReadyLabel(nodeName string, isReady bool) (err error) { Path string `json:"path"` Value string `json:"value"` }{{ - Op: "replace", + Op: "add", Path: "/metadata/labels/gcs.csi.ofek.dev~1driver-ready", Value: strconv.FormatBool(isReady), }} From 994483a4b995fee68480b9ce4f888440de688d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Mattrat?= Date: Sun, 9 May 2021 17:50:02 +0200 Subject: [PATCH 6/6] Honnor driver-name flag when setting the driver-ready label on nodes. --- docs/troubleshooting.md | 1 + pkg/driver/driver.go | 6 +++--- pkg/util/common.go | 18 ++++++++++++++---- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 8466d74..e86e21a 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -13,6 +13,7 @@ It's possible to avoid those warnings by adding a node selector or affinity usin > Adding such node selector or affinity will trade the time spend waiting for volume mounting retries with time waiting for scheduling. +> The exact label added is `/driver-ready`, by default `` is `gcs.csi.ofek.dev` ``` apiVersion: v1 diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index af9d371..e3fe965 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -39,7 +39,7 @@ func NewGCSDriver(name, node, endpoint string, version string, deleteOrphanedPod func (d *GCSDriver) Run() error { // set the driver-ready label to false at the beginning to handle edge-case where the controller didn't terminated gracefully - if err := util.SetDriverReadyLabel(d.nodeName, false); err != nil { + if err := util.SetDriverReadyLabel(d.name, d.nodeName, false); err != nil { klog.Warningf("Unable to set driver-ready=false label on the node, error: %v", err) } @@ -80,7 +80,7 @@ func (d *GCSDriver) Run() error { csi.RegisterIdentityServer(d.server, d) csi.RegisterNodeServer(d.server, d) csi.RegisterControllerServer(d.server, d) - if err = util.SetDriverReadyLabel(d.nodeName, true); err != nil { + if err = util.SetDriverReadyLabel(d.name, d.nodeName, true); err != nil { klog.Warningf("unable to set driver-ready=true label on the node, error: %v", err) } return d.server.Serve(listener) @@ -88,7 +88,7 @@ func (d *GCSDriver) Run() error { func (d *GCSDriver) stop() { d.server.Stop() - if err := util.SetDriverReadyLabel(d.nodeName, false); err != nil { + if err := util.SetDriverReadyLabel(d.name, d.nodeName, false); err != nil { klog.Warningf("Unable to set driver-ready=false label on the node, error: %v", err) } klog.V(1).Info("CSI driver stopped") diff --git a/pkg/util/common.go b/pkg/util/common.go index c238660..41de297 100644 --- a/pkg/util/common.go +++ b/pkg/util/common.go @@ -187,8 +187,18 @@ func GetPvcAnnotations(pvcName string, pvcNamespace string) (annotations map[str return pvc.ObjectMeta.Annotations, nil } -// SetDriverReadyLabel set the label gcs.csi.ofek.dev/driver-ready= on the given node. -func SetDriverReadyLabel(nodeName string, isReady bool) (err error) { +// DriverReadyLabel returns the driver-ready label according to the driver name. +func DriverReadyLabel(driverName string) string { + return driverName + "/driver-ready" +} + +// DriverReadyLabelJSONPatchEscaped returns the driver-ready label according to the driver name but espcaed to be used in a JSONPatch path. +func DriverReadyLabelJSONPatchEscaped(driverName string) string { + return strings.ReplaceAll(DriverReadyLabel(driverName), "/", "~1") +} + +// SetDriverReadyLabel set the label /driver-ready= on the given node. +func SetDriverReadyLabel(driverName string, nodeName string, isReady bool) (err error) { config, err := rest.InClusterConfig() if err != nil { return err @@ -204,14 +214,14 @@ func SetDriverReadyLabel(nodeName string, isReady bool) (err error) { Value string `json:"value"` }{{ Op: "add", - Path: "/metadata/labels/gcs.csi.ofek.dev~1driver-ready", + Path: "/metadata/labels/" + DriverReadyLabelJSONPatchEscaped(driverName), Value: strconv.FormatBool(isReady), }} patchBytes, err := json.Marshal(patch) if err != nil { return err } - + _, err = clientset.CoreV1().Nodes().Patch(nodeName, types.JSONPatchType, patchBytes) if err != nil { return err