diff --git a/README.md b/README.md index 1e674a54ee..fdf8f12a46 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ Note that the external-provisioner does not scale with more replicas. Only one e * `--cloning-protection-threads `: Number of simultaneously running threads, handling cloning finalizer removal. Defaults to `1`. -* `--metrics-address`: The TCP network address where the prometheus metrics endpoint will run (example: `:8080` which corresponds to port 8080 on local host). The default is empty string, which means metrics endpoint is disabled. +* `--http-endpoint`: The TCP network address where the HTTP server for diagnostics, including metrics and leader election health check, will listen (example: `:8080` which corresponds to port 8080 on local host). The default is an empty string, which means the server is disabled. Only one of `--metrics-address` and `--http-endpoint` can be set. * `--metrics-path`: The HTTP path where prometheus metrics will be exposed. Default is `/metrics`. @@ -95,6 +95,8 @@ See the [storage capacity section](#capacity-support) below for details. * `--master `: Master URL to build a client config from. When omitted, default token provided by Kubernetes will be used. This option is useful only when the external-provisioner does not run as a Kubernetes pod, e.g. for debugging. Either this or `--kubeconfig` needs to be set if the external-provisioner is being run out of cluster. +* `--metrics-address`: (deprecated, use `--http-endpoint` instead) The TCP network address where the prometheus metrics endpoint will run (example: `:8080` which corresponds to port 8080 on local host). The default is an empty string, which means the metrics endpoint is disabled. Only one of `--metrics-address` and `--http-endpoint` can be set. + * `--volume-name-prefix `: Prefix of PersistentVolume names created by the external-provisioner. Default value is "pvc", i.e. created PersistentVolume objects will have name `pvc-`. * `--volume-name-uuid-length`: Length of UUID to be added to `--volume-name-prefix`. Default behavior is to NOT truncate the UUID. 
@@ -233,6 +235,13 @@ Details of error handling of individual CSI calls: * `Probe`: The external-provisioner retries calling Probe until the driver reports it's ready. It retries also when it receives timeout from `Probe` call. The external-provisioner has no limit of retries. It is expected that ReadinessProbe on the driver container will catch case when the driver takes too long time to get ready. * `GetPluginInfo`, `GetPluginCapabilitiesRequest`, `ControllerGetCapabilities`: The external-provisioner expects that these calls are quick and does not retry them on any error, including timeout. Instead, it assumes that the driver is faulty and exits. Note that Kubernetes will likely start a new provisioner container and it will start with `Probe` call. +### HTTP endpoint + +The external-provisioner optionally exposes an HTTP endpoint at the address:port specified by the `--http-endpoint` argument. When set, these two paths are exposed: + +* Metrics path, as set by the `--metrics-path` argument (default is `/metrics`). +* Leader election health check at `/healthz/leader-election`. When leader election is used, it is recommended to run a liveness probe against this endpoint so that an external-provisioner leader that fails to connect to the API server to renew its leadership is killed. See https://github.com/kubernetes-csi/csi-lib-utils/issues/66 for details. + ## Community, discussion, contribution, and support Learn how to engage with the Kubernetes community on the [community page](http://kubernetes.io/community/). 
diff --git a/cmd/csi-provisioner/csi-provisioner.go b/cmd/csi-provisioner/csi-provisioner.go index 81791f9b13..f8f3628737 100644 --- a/cmd/csi-provisioner/csi-provisioner.go +++ b/cmd/csi-provisioner/csi-provisioner.go @@ -21,6 +21,7 @@ import ( goflag "flag" "fmt" "math/rand" + "net/http" "os" "strconv" "strings" @@ -71,9 +72,9 @@ var ( strictTopology = flag.Bool("strict-topology", false, "Late binding: pass only selected node topology to CreateVolume Request, unlike default behavior of passing aggregated cluster topologies that match with topology keys of the selected node.") immediateTopology = flag.Bool("immediate-topology", true, "Immediate binding: pass aggregated cluster topologies for all nodes where the CSI driver is available (enabled, the default) or no topology requirements (if disabled).") extraCreateMetadata = flag.Bool("extra-create-metadata", false, "If set, add pv/pvc metadata to plugin create requests as parameters.") - - metricsAddress = flag.String("metrics-address", "", "The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means metrics endpoint is disabled.") - metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.") + metricsAddress = flag.String("metrics-address", "", "(deprecated) The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means metrics endpoint is disabled. Only one of `--metrics-address` and `--http-endpoint` can be set.") + httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the HTTP server for diagnostics, including metrics and leader election health check, will listen (example: `:8080`). The default is empty string, which means the server is disabled. 
Only one of `--metrics-address` and `--http-endpoint` can be set.") + metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.") defaultFSType = flag.String("default-fstype", "", "The default filesystem type of the volume to provision when fstype is unspecified in the StorageClass. If the default is not set and fstype is unset in the StorageClass, then no fstype will be set") @@ -121,6 +122,15 @@ func main() { } klog.Infof("Version: %s", version) + if *metricsAddress != "" && *httpEndpoint != "" { + klog.Error("only one of `--metrics-address` and `--http-endpoint` can be set.") + os.Exit(1) + } + addr := *metricsAddress + if addr == "" { + addr = *httpEndpoint + } + // get the KUBECONFIG from env if specified (useful for local/debug cluster) kubeconfigEnv := os.Getenv("KUBECONFIG") @@ -181,8 +191,20 @@ func main() { klog.Fatalf("Error getting CSI driver name: %s", err) } klog.V(2).Infof("Detected CSI driver %s", provisionerName) - metricsManager.SetDriverName(provisionerName) - metricsManager.StartMetricsEndpoint(*metricsAddress, *metricsPath) + + // Prepare http endpoint for metrics + leader election healthz + mux := http.NewServeMux() + if addr != "" { + metricsManager.RegisterToServer(mux, *metricsPath) + metricsManager.SetDriverName(provisionerName) + go func() { + klog.Infof("ServeMux listening at %q", addr) + err := http.ListenAndServe(addr, mux) + if err != nil { + klog.Fatalf("Failed to start HTTP server at specified address (%q) and metrics path (%q): %s", addr, *metricsPath, err) + } + }() + } pluginCapabilities, controllerCapabilities, err := ctrl.GetDriverCapabilities(grpcClient, *operationTimeout) if err != nil { @@ -373,6 +395,9 @@ func main() { } le := leaderelection.NewLeaderElection(leClientset, lockName, run) + if *httpEndpoint != "" { + le.PrepareHealthCheck(mux, leaderelection.DefaultHealthCheckTimeout) + } if *leaderElectionNamespace != "" { 
le.WithNamespace(*leaderElectionNamespace) diff --git a/deploy/kubernetes/deployment.yaml b/deploy/kubernetes/deployment.yaml index 77a10348f0..f6f7e1ca06 100644 --- a/deploy/kubernetes/deployment.yaml +++ b/deploy/kubernetes/deployment.yaml @@ -19,10 +19,11 @@ spec: serviceAccount: csi-provisioner containers: - name: csi-provisioner - image: quay.io/k8scsi/csi-provisioner:canary + image: gcr.io/k8s-staging-sig-storage/csi-provisioner:canary args: - "--csi-address=$(ADDRESS)" - "--leader-election" + - "--http-endpoint=:8080" env: - name: ADDRESS value: /var/lib/csi/sockets/pluginproxy/mock.socket @@ -30,7 +31,18 @@ spec: volumeMounts: - name: socket-dir mountPath: /var/lib/csi/sockets/pluginproxy/ - + ports: + - containerPort: 8080 + name: http-endpoint + protocol: TCP + livenessProbe: + failureThreshold: 1 + httpGet: + path: /healthz/leader-election + port: http-endpoint + initialDelaySeconds: 10 + timeoutSeconds: 10 + periodSeconds: 20 - name: mock-driver image: quay.io/k8scsi/mock-driver:canary env: