Skip to content

Commit

Permalink
Advertise the readiness of the driver on each node. (#72)
Browse files Browse the repository at this point in the history
* Advertise the readiness of the driver on each node.

Set the label gcs.csi.ofek.dev/driver-ready={true,false}
on each node running a controller

* upgrade driver-ready set failure to warns

* Document how gcs.csi.ofek.dev/driver-ready can be used.

* documentation reword

* According to the JSON Patch RFC, add should be used here, not replace, even if the k8s implementation seems forgiving.

* Honor driver-name flag when setting the driver-ready label on nodes.
  • Loading branch information
dysosmus authored May 14, 2021
1 parent de2828f commit 5ebbc41
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 4 deletions.
2 changes: 1 addition & 1 deletion deploy/base/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ rules:
verbs: ["delete"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "update"]
verbs: ["get", "list", "update", "patch"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list"]
Expand Down
46 changes: 44 additions & 2 deletions docs/troubleshooting.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,47 @@
# Troubleshooting
## Warning events from pods scheduled on newly started nodes when mounting csi-gcs volumes

-----
Warnings, like the one below, can be seen from pods scheduled on newly started nodes.

## TODO
```
MountVolume.MountDevice failed for volume "xxxx" : kubernetes.io/csi: attacher.MountDevice failed to create newCsiDriverClient: driver name gcs.csi.ofek.dev not found in the list of registered CSI drivers
```

Those warnings are temporary and indicate that the csi-gcs driver is still starting. Kubernetes will retry until the csi-gcs driver is ready.

It's possible to avoid those warnings by adding a node selector or affinity using the node label `gcs.csi.ofek.dev/driver-ready=true`.

> Adding such a node selector or affinity trades the time spent waiting for volume mounting retries for time spent waiting for scheduling.
> The exact label added is `<driver name>/driver-ready`, by default `<driver name>` is `gcs.csi.ofek.dev`
```
apiVersion: v1
kind: Pod
metadata:
name: pod-mount-csi-gcs-volume
spec:
# ...
nodeSelector:
gcs.csi.ofek.dev/driver-ready: "true"
```

```
apiVersion: v1
kind: Pod
metadata:
name: pod-mount-csi-gcs-volume
spec:
# ...
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: gcs.csi.ofek.dev/driver-ready
operator: In
values:
- "true"
```

You can also add a mutating admission webhook to automatically inject such a node selector or affinity into all pods mounting csi-gcs volumes.
13 changes: 12 additions & 1 deletion pkg/driver/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ func NewGCSDriver(name, node, endpoint string, version string, deleteOrphanedPod
}

func (d *GCSDriver) Run() error {
// set the driver-ready label to false at the beginning to handle edge-case where the controller didn't terminated gracefully
if err := util.SetDriverReadyLabel(d.name, d.nodeName, false); err != nil {
klog.Warningf("Unable to set driver-ready=false label on the node, error: %v", err)
}

if len(d.mountPoint) == 0 {
return errors.New("--bucket-mount-path is required")
}
Expand Down Expand Up @@ -75,11 +80,17 @@ func (d *GCSDriver) Run() error {
csi.RegisterIdentityServer(d.server, d)
csi.RegisterNodeServer(d.server, d)
csi.RegisterControllerServer(d.server, d)
if err = util.SetDriverReadyLabel(d.name, d.nodeName, true); err != nil {
klog.Warningf("unable to set driver-ready=true label on the node, error: %v", err)
}
return d.server.Serve(listener)
}

// stop shuts the driver's server down and then flags the node as no longer
// ready for this driver by setting the driver-ready label to false.
func (d *GCSDriver) stop() {
	d.server.Stop()

	// A failure to update the label is only logged: the shutdown itself
	// carries on regardless.
	labelErr := util.SetDriverReadyLabel(d.name, d.nodeName, false)
	if labelErr != nil {
		klog.Warningf("Unable to set driver-ready=false label on the node, error: %v", labelErr)
	}

	klog.V(1).Info("CSI driver stopped")
}

Expand All @@ -93,7 +104,7 @@ func (d *GCSDriver) RunPodCleanup() (err error) {
// Killing Pod because its Volume is no longer mounted
err = util.DeletePod(publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name)
if err == nil {
klog.V(4).Infof("Deleted Pod %s/%s bacause its volume was no longer mounted", publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name)
klog.V(4).Infof("Deleted Pod %s/%s because its volume was no longer mounted", publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name)
} else {
klog.Errorf("Could not delete pod %s/%s because it was no longer mounted because of error: %v", publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name, err)
}
Expand Down
44 changes: 44 additions & 0 deletions pkg/util/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package util

import (
"context"
"encoding/json"
"fmt"
"hash/crc32"
"io/ioutil"
Expand All @@ -20,6 +21,7 @@ import (
"google.golang.org/grpc/status"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/klog"
Expand Down Expand Up @@ -185,6 +187,48 @@ func GetPvcAnnotations(pvcName string, pvcNamespace string) (annotations map[str
return pvc.ObjectMeta.Annotations, nil
}

// DriverReadyLabel builds the name of the node label used to advertise driver
// readiness: the driver name followed by "/driver-ready".
func DriverReadyLabel(driverName string) string {
	const readySuffix = "/driver-ready"
	return driverName + readySuffix
}

// DriverReadyLabelJSONPatchEscaped returns the driver-ready label for the given
// driver name, escaped so it can be used as a single reference token in a
// JSON Patch path.
//
// JSON Pointer (RFC 6901) reserves two characters inside a token: "~" must be
// written as "~0" and "/" as "~1". Both are replaced in a single pass so that
// the "~" introduced by "~1" is never escaped a second time.
func DriverReadyLabelJSONPatchEscaped(driverName string) string {
	escaper := strings.NewReplacer("~", "~0", "/", "~1")
	return escaper.Replace(DriverReadyLabel(driverName))
}

// SetDriverReadyLabel sets the label <driver name>/driver-ready=<isReady> on
// the given node.
//
// It builds a client from the in-cluster configuration and sends a JSON Patch
// containing a single "add" operation on the node's
// /metadata/labels/<escaped label> path, with the boolean rendered as the
// string "true" or "false".
//
// The returned error is nil on success; otherwise it wraps the underlying
// error with the step that failed (loading the config, creating the client,
// encoding the patch, or patching the node).
func SetDriverReadyLabel(driverName string, nodeName string, isReady bool) (err error) {
	config, err := rest.InClusterConfig()
	if err != nil {
		return fmt.Errorf("loading in-cluster config: %w", err)
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		return fmt.Errorf("creating kubernetes client: %w", err)
	}

	// "add" is used rather than "replace" so the patch also applies when the
	// label is not present on the node yet.
	// NOTE(review): the path targets a member of /metadata/labels, so this
	// presumably relies on the node already having a labels map — confirm.
	patch := []struct {
		Op    string `json:"op"`
		Path  string `json:"path"`
		Value string `json:"value"`
	}{{
		Op:    "add",
		Path:  "/metadata/labels/" + DriverReadyLabelJSONPatchEscaped(driverName),
		Value: strconv.FormatBool(isReady),
	}}
	patchBytes, err := json.Marshal(patch)
	if err != nil {
		return fmt.Errorf("encoding driver-ready patch: %w", err)
	}

	if _, err = clientset.CoreV1().Nodes().Patch(nodeName, types.JSONPatchType, patchBytes); err != nil {
		return fmt.Errorf("patching labels of node %q: %w", nodeName, err)
	}
	return nil
}

func DeletePod(namespace string, name string) (err error) {
config, err := rest.InClusterConfig()
if err != nil {
Expand Down

0 comments on commit 5ebbc41

Please sign in to comment.