From cdcd97244c4eb381449acd9572a73ceb2434835e Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Thu, 17 Oct 2019 11:36:25 +0300 Subject: [PATCH 1/3] Add the metrics server field to CRD --- artifacts/flagger/crd.yaml | 3 +++ charts/flagger/templates/crd.yaml | 3 +++ kustomize/base/flagger/crd.yaml | 3 +++ pkg/apis/flagger/v1alpha3/types.go | 4 ++++ 4 files changed, 13 insertions(+) diff --git a/artifacts/flagger/crd.yaml b/artifacts/flagger/crd.yaml index 3adf4e073..1a19c283f 100644 --- a/artifacts/flagger/crd.yaml +++ b/artifacts/flagger/crd.yaml @@ -68,6 +68,9 @@ spec: provider: description: Traffic managent provider type: string + metricsServer: + description: Prometheus URL + type: string progressDeadlineSeconds: description: Deployment progress deadline type: number diff --git a/charts/flagger/templates/crd.yaml b/charts/flagger/templates/crd.yaml index e6bc20b95..944386f61 100644 --- a/charts/flagger/templates/crd.yaml +++ b/charts/flagger/templates/crd.yaml @@ -69,6 +69,9 @@ spec: provider: description: Traffic managent provider type: string + metricsServer: + description: Prometheus URL + type: string progressDeadlineSeconds: description: Deployment progress deadline type: number diff --git a/kustomize/base/flagger/crd.yaml b/kustomize/base/flagger/crd.yaml index 3adf4e073..1a19c283f 100644 --- a/kustomize/base/flagger/crd.yaml +++ b/kustomize/base/flagger/crd.yaml @@ -68,6 +68,9 @@ spec: provider: description: Traffic managent provider type: string + metricsServer: + description: Prometheus URL + type: string progressDeadlineSeconds: description: Deployment progress deadline type: number diff --git a/pkg/apis/flagger/v1alpha3/types.go b/pkg/apis/flagger/v1alpha3/types.go index a8004b2b4..adf4296f5 100644 --- a/pkg/apis/flagger/v1alpha3/types.go +++ b/pkg/apis/flagger/v1alpha3/types.go @@ -51,6 +51,10 @@ type CanarySpec struct { // +optional Provider string `json:"provider,omitempty"` + // if specified overrides the -metrics-server flag for this particular 
canary + // +optional + MetricsServer string `json:"metricsServer,omitempty"` + // reference to target resource TargetRef hpav1.CrossVersionObjectReference `json:"targetRef"` From d6c5bdd24102146ae34a825a5a4767dc3ca50411 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Thu, 17 Oct 2019 11:37:54 +0300 Subject: [PATCH 2/3] Implement metrics server override --- pkg/controller/scheduler.go | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index 8eeb4c611..62b2ed18f 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -2,6 +2,7 @@ package controller import ( "fmt" + "github.com/weaveworks/flagger/pkg/metrics" "strings" "time" @@ -749,7 +750,21 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { } // create observer based on the mesh provider - observer := c.observerFactory.Observer(metricsProvider) + observerFactory := c.observerFactory + observer := observerFactory.Observer(metricsProvider) + + // override the global metrics server if one is specified in the canary spec + metricsServer := c.observerFactory.Client.GetMetricsServer() + if r.Spec.MetricsServer != "" { + metricsServer = r.Spec.MetricsServer + var err error + observerFactory, err = metrics.NewFactory(metricsServer, metricsProvider, 5*time.Second) + if err != nil { + c.recordEventErrorf(r, "Error building Prometheus client for %s %v", r.Spec.MetricsServer, err) + return false + } + observer = observerFactory.Observer(metricsProvider) + } // run metrics checks for _, metric := range r.Spec.CanaryAnalysis.Metrics { @@ -764,7 +779,7 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic", metric.Name, r.Spec.TargetRef.Name, r.Namespace) } else { - c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observerFactory.Client.GetMetricsServer(), 
err) + c.recordEventErrorf(r, "Metrics server %s query failed: %v", metricsServer, err) } return false } @@ -784,7 +799,7 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic", metric.Name, r.Spec.TargetRef.Name, r.Namespace) } else { - c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observerFactory.Client.GetMetricsServer(), err) + c.recordEventErrorf(r, "Metrics server %s query failed: %v", metricsServer, err) } return false } @@ -800,13 +815,13 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { // custom checks if metric.Query != "" { - val, err := c.observerFactory.Client.RunQuery(metric.Query) + val, err := observerFactory.Client.RunQuery(metric.Query) if err != nil { if strings.Contains(err.Error(), "no values found") { c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic", metric.Name, r.Spec.TargetRef.Name, r.Namespace) } else { - c.recordEventErrorf(r, "Metrics server %s query failed for %s: %v", c.observerFactory.Client.GetMetricsServer(), metric.Name, err) + c.recordEventErrorf(r, "Metrics server %s query failed for %s: %v", metricsServer, metric.Name, err) } return false } From 5e434df6ea7887cb0829a121750f7aa9e5ac2fb3 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Thu, 17 Oct 2019 12:35:55 +0300 Subject: [PATCH 3/3] Exclude high cardinality cAdvisor metrics --- charts/flagger/templates/prometheus.yaml | 30 ++++++++------------ kustomize/base/prometheus/prometheus.yml | 36 +++++++++--------------- 2 files changed, 24 insertions(+), 42 deletions(-) diff --git a/charts/flagger/templates/prometheus.yaml b/charts/flagger/templates/prometheus.yaml index c49d3f96b..8b5740a9d 100644 --- a/charts/flagger/templates/prometheus.yaml +++ b/charts/flagger/templates/prometheus.yaml @@ -133,38 +133,22 @@ data: scheme: https tls_config: ca_file: 
/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: kubernetes;https - # Scrape config for nodes - - job_name: 'kubernetes-nodes' - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics - # scrape config for cAdvisor - job_name: 'kubernetes-cadvisor' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - - role: node + - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) @@ -174,6 +158,14 @@ data: regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + # exclude high cardinality metrics + metric_relabel_configs: + - source_labels: [__name__] + regex: (container|machine)_(cpu|memory|network|fs)_(.+) + action: keep + - source_labels: [__name__] + regex: container_memory_failures_total + action: drop # scrape config for pods - job_name: kubernetes-pods diff --git a/kustomize/base/prometheus/prometheus.yml b/kustomize/base/prometheus/prometheus.yml index 2cb993e53..a73b09e75 100644 --- a/kustomize/base/prometheus/prometheus.yml +++ b/kustomize/base/prometheus/prometheus.yml @@ -2,12 +2,11 @@ global: scrape_interval: 5s scrape_configs: -# Scrape config for AppMesh Envoy 
sidecar +# scrape config for AppMesh Envoy sidecar - job_name: 'appmesh-envoy' metrics_path: /stats/prometheus kubernetes_sd_configs: - role: pod - relabel_configs: - source_labels: [__meta_kubernetes_pod_container_name] action: keep @@ -25,8 +24,7 @@ scrape_configs: - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name - - # Exclude high cardinality metrics + # exclude high cardinality metrics metric_relabel_configs: - source_labels: [ cluster_name ] regex: '(outbound|inbound|prometheus_stats).*' @@ -56,7 +54,7 @@ scrape_configs: regex: 'envoy_cluster_(lb|retry|bind|internal|max|original).*' action: drop -# Scrape config for API servers +# scrape config for API servers - job_name: 'kubernetes-apiservers' kubernetes_sd_configs: - role: endpoints @@ -66,35 +64,19 @@ scrape_configs: scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: kubernetes;https -# Scrape config for nodes -- job_name: 'kubernetes-nodes' - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics - # scrape config for cAdvisor - job_name: 'kubernetes-cadvisor' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 
kubernetes_sd_configs: - role: node @@ -107,6 +89,14 @@ scrape_configs: regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + # exclude high cardinality metrics + metric_relabel_configs: + - source_labels: [__name__] + regex: (container|machine)_(cpu|memory|network|fs)_(.+) + action: keep + - source_labels: [__name__] + regex: container_memory_failures_total + action: drop # scrape config for pods - job_name: kubernetes-pods