Advertise the readiness of the driver on each node. #72

Merged · 6 commits · May 14, 2021
Changes from 4 commits
2 changes: 1 addition & 1 deletion deploy/base/rbac.yaml
@@ -16,7 +16,7 @@ rules:
verbs: ["delete"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "update"]
verbs: ["get", "list", "update", "patch"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list"]
45 changes: 43 additions & 2 deletions docs/troubleshooting.md
@@ -1,5 +1,46 @@
# Troubleshooting
## Warning events from pods scheduled on newly started nodes when mounting csi-gcs volumes

-----
Warnings, like the one below, can be seen from pods scheduled on newly started nodes.

## TODO
```
MountVolume.MountDevice failed for volume "xxxx" : kubernetes.io/csi: attacher.MountDevice failed to create newCsiDriverClient: driver name gcs.csi.ofek.dev not found in the list of registered CSI drivers
```

Those warnings are temporary and indicate that the csi-gcs driver is still starting. Kubernetes will retry until the csi-gcs driver is ready.

You can avoid these warnings by adding a node selector or affinity that uses the node label `gcs.csi.ofek.dev/driver-ready=true`.

> Adding such a node selector or affinity trades the time spent waiting for volume mount retries for time spent waiting for scheduling.


```
apiVersion: v1
kind: Pod
metadata:
  name: pod-mount-csi-gcs-volume
spec:
  # ...
  nodeSelector:
    gcs.csi.ofek.dev/driver-ready: "true"
```

```
apiVersion: v1
kind: Pod
metadata:
  name: pod-mount-csi-gcs-volume
spec:
  # ...
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: gcs.csi.ofek.dev/driver-ready
                operator: In
                values:
                  - "true"
```

You can also add a mutating admission webhook to automatically inject such a node selector or affinity into all pods that mount csi-gcs volumes.
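
As a rough illustration only (not something shipped with csi-gcs), the registration side of such a webhook could look like the sketch below. The configuration name, service name, namespace, path, and CA bundle are placeholders, and the webhook server that performs the actual injection has to be written and deployed separately.

```
apiVersion: admissionregistration.k8s.io/v1
kind: MutatingWebhookConfiguration
metadata:
  name: csi-gcs-node-selector-injector        # hypothetical name
webhooks:
  - name: inject-driver-ready.gcs.csi.ofek.dev
    admissionReviewVersions: ["v1"]
    sideEffects: None
    failurePolicy: Ignore                      # do not block pod creation if the webhook is unavailable
    clientConfig:
      service:
        namespace: kube-system                 # placeholder: wherever the webhook server runs
        name: csi-gcs-webhook                  # placeholder: a Service you deploy yourself
        path: /mutate
      caBundle: <base64-encoded CA certificate>
    rules:
      - apiGroups: [""]
        apiVersions: ["v1"]
        operations: ["CREATE"]
        resources: ["pods"]
```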
13 changes: 12 additions & 1 deletion pkg/driver/driver.go
@@ -38,6 +38,11 @@ func NewGCSDriver(name, node, endpoint string, version string, deleteOrphanedPod
}

func (d *GCSDriver) Run() error {
// set the driver-ready label to false at startup to handle the edge case where the controller didn't terminate gracefully
if err := util.SetDriverReadyLabel(d.nodeName, false); err != nil {
klog.Warningf("Unable to set driver-ready=false label on the node, error: %v", err)
}

if len(d.mountPoint) == 0 {
return errors.New("--bucket-mount-path is required")
}
@@ -75,11 +80,17 @@ func (d *GCSDriver) Run() error {
csi.RegisterIdentityServer(d.server, d)
csi.RegisterNodeServer(d.server, d)
csi.RegisterControllerServer(d.server, d)
if err = util.SetDriverReadyLabel(d.nodeName, true); err != nil {
klog.Warningf("unable to set driver-ready=true label on the node, error: %v", err)
}
return d.server.Serve(listener)
}

func (d *GCSDriver) stop() {
d.server.Stop()
if err := util.SetDriverReadyLabel(d.nodeName, false); err != nil {
klog.Warningf("Unable to set driver-ready=false label on the node, error: %v", err)
}
klog.V(1).Info("CSI driver stopped")
}

@@ -93,7 +104,7 @@ func (d *GCSDriver) RunPodCleanup() (err error) {
// Killing Pod because its Volume is no longer mounted
err = util.DeletePod(publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name)
if err == nil {
klog.V(4).Infof("Deleted Pod %s/%s bacause its volume was no longer mounted", publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name)
klog.V(4).Infof("Deleted Pod %s/%s because its volume was no longer mounted", publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name)
} else {
klog.Errorf("Could not delete pod %s/%s because it was no longer mounted because of error: %v", publishedVolume.Spec.Pod.Namespace, publishedVolume.Spec.Pod.Name, err)
}
34 changes: 34 additions & 0 deletions pkg/util/common.go
@@ -2,6 +2,7 @@ package util

import (
"context"
"encoding/json"
"fmt"
"hash/crc32"
"io/ioutil"
@@ -20,6 +21,7 @@ import (
"google.golang.org/grpc/status"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/klog"
@@ -185,6 +187,38 @@ func GetPvcAnnotations(pvcName string, pvcNamespace string) (annotations map[str
return pvc.ObjectMeta.Annotations, nil
}

// SetDriverReadyLabel sets the label gcs.csi.ofek.dev/driver-ready=<isReady> on the given node.
func SetDriverReadyLabel(nodeName string, isReady bool) (err error) {
config, err := rest.InClusterConfig()
if err != nil {
return err
}
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return err
}

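// "/" in the label key has to be escaped as "~1" in the JSON Pointer path (RFC 6901),
// so gcs.csi.ofek.dev/driver-ready becomes gcs.csi.ofek.dev~1driver-ready below.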
patch := []struct {
Op string `json:"op"`
Path string `json:"path"`
Value string `json:"value"`
}{{
Op: "replace",
Path: "/metadata/labels/gcs.csi.ofek.dev~1driver-ready",
Value: strconv.FormatBool(isReady),
}}
patchBytes, err := json.Marshal(patch)
if err != nil {
return err
}

_, err = clientset.CoreV1().Nodes().Patch(nodeName, types.JSONPatchType, patchBytes)
if err != nil {
return err
}
return nil
}

func DeletePod(namespace string, name string) (err error) {
config, err := rest.InClusterConfig()
if err != nil {