Add active and queued requests to async dashboard (#2326)

deliahu · Miguel Varela Ramos · commit 37d70b398052 · 2021-07-16T15:36:39.000+02:00
diff --git a/dev/prometheus.md b/dev/prometheus.md
@@ -14,7 +14,13 @@ The following is a list of metrics that are currently in use.
     1. api_name
     1. api_kind
     1. status_code
-1. cortex_async_queue_length with the following labels:
+1. cortex_async_active with the following labels:
+    1. api_name
+    1. api_kind
+1. cortex_async_queued with the following labels:
+    1. api_name
+    1. api_kind
+1. cortex_async_in_flight with the following labels:
     1. api_name
     1. api_kind
 1. cortex_async_latency_bucket with the following labels:
diff --git a/manager/manifests/grafana/grafana-dashboard-async.yaml b/manager/manifests/grafana/grafana-dashboard-async.yaml
@@ -36,7 +36,7 @@ data:
       "editable": true,
       "gnetId": null,
       "graphTooltip": 0,
-      "iteration": 1625168772532,
+      "iteration": 1625805144458,
       "links": [],
       "panels": [
         {
@@ -61,10 +61,6 @@ data:
         {
           "collapsed": false,
           "datasource": null,
-          "fieldConfig": {
-            "defaults": {},
-            "overrides": []
-          },
           "gridPos": {
             "h": 1,
             "w": 24,
@@ -175,7 +171,7 @@ data:
           "dashLength": 10,
           "dashes": false,
           "datasource": null,
-          "description": "Active in-flight requests for an API.\n\nNote: In-flight requests are recorded every 10 seconds, which will correspond to the minimum resolution.",
+          "description": "In-flight requests for an API.\n\nNote: In-flight requests are recorded every 10 seconds, which will correspond to the minimum resolution.",
           "fill": 1,
           "fillGradient": 0,
           "gridPos": {
@@ -213,10 +209,28 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(cortex_async_queue_length{api_kind=\"AsyncAPI\",api_name=\"$api_name\"}) by (api_name)",
+              "exemplar": true,
+              "expr": "sum(cortex_async_active{api_kind=\"AsyncAPI\",api_name=\"$api_name\"}) by (api_name)",
+              "hide": false,
               "interval": "",
-              "legendFormat": "{{api_name}}",
-              "refId": "A"
+              "legendFormat": "active",
+              "refId": "Active"
+            },
+            {
+              "exemplar": true,
+              "expr": "sum(cortex_async_queued{api_kind=\"AsyncAPI\",api_name=\"$api_name\"}) by (api_name)",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "queued",
+              "refId": "Queued"
+            },
+            {
+              "exemplar": true,
+              "expr": "sum(cortex_async_in_flight{api_kind=\"AsyncAPI\",api_name=\"$api_name\"}) by (api_name)",
+              "hide": true,
+              "interval": "",
+              "legendFormat": "in flight",
+              "refId": "In Flight"
             }
           ],
           "thresholds": [],
@@ -1014,10 +1028,6 @@ data:
         {
           "collapsed": false,
           "datasource": null,
-          "fieldConfig": {
-            "defaults": {},
-            "overrides": []
-          },
           "gridPos": {
             "h": 1,
             "w": 24,
@@ -1445,10 +1455,6 @@ data:
         {
           "collapsed": false,
           "datasource": null,
-          "fieldConfig": {
-            "defaults": {},
-            "overrides": []
-          },
           "gridPos": {
             "h": 1,
             "w": 24,
diff --git a/pkg/autoscaler/async_scaler.go b/pkg/autoscaler/async_scaler.go
@@ -70,11 +70,11 @@ func (s *AsyncScaler) GetInFlightRequests(apiName string, window time.Duration)
 	windowSeconds := int64(window.Seconds())
 
 	// PromQL query:
-	// 	sum(sum_over_time(cortex_async_queue_length{api_name="<apiName>"}[60s])) /
-	//	sum(count_over_time(cortex_async_queue_length{api_name="<apiName>"}[60s]))
+	// 	sum(sum_over_time(cortex_async_in_flight{api_name="<apiName>"}[60s])) /
+	//	max(count_over_time(cortex_async_in_flight{api_name="<apiName>"}[60s]))
 	query := fmt.Sprintf(
-		"sum(sum_over_time(cortex_async_queue_length{api_name=\"%s\"}[%ds])) / "+
-			"max(count_over_time(cortex_async_queue_length{api_name=\"%s\"}[%ds]))",
+		"sum(sum_over_time(cortex_async_in_flight{api_name=\"%s\"}[%ds])) / "+
+			"max(count_over_time(cortex_async_in_flight{api_name=\"%s\"}[%ds]))",
 		apiName, windowSeconds,
 		apiName, windowSeconds,
 	)
diff --git a/pkg/health/health.go b/pkg/health/health.go
@@ -215,9 +215,7 @@ func Check(awsClient *awslib.Client, k8sClient *k8s.Client, clusterName string)
 }
 
 func GetWarnings(k8sClient *k8s.Client) (ClusterWarnings, error) {
-	var (
-		prometheusMemorySaturationWarn string
-	)
+	var prometheusMemorySaturationWarn string
 
 	saturation, err := getPodMemorySaturation(k8sClient, "prometheus-prometheus-0", "default")
 	if err != nil {
diff --git a/pkg/operator/resources/asyncapi/queue_metrics.go b/pkg/operator/resources/asyncapi/queue_metrics.go
@@ -33,10 +33,26 @@ const (
 	_sqsQueryTimeoutSeconds = 10
 )
 
-var queueLengthGauge = promauto.NewGaugeVec(
+var activeGauge = promauto.NewGaugeVec(
 	prometheus.GaugeOpts{
-		Name:        "cortex_async_queue_length",
-		Help:        "The number of in-queue messages for a cortex AsyncAPI",
+		Name:        "cortex_async_active",
+		Help:        "The number of messages that are actively being processed by an AsyncAPI",
+		ConstLabels: map[string]string{"api_kind": userconfig.AsyncAPIKind.String()},
+	}, []string{"api_name"},
+)
+
+var queuedGauge = promauto.NewGaugeVec(
+	prometheus.GaugeOpts{
+		Name:        "cortex_async_queued",
+		Help:        "The number of queued messages for an AsyncAPI",
+		ConstLabels: map[string]string{"api_kind": userconfig.AsyncAPIKind.String()},
+	}, []string{"api_name"},
+)
+
+var inFlightGauge = promauto.NewGaugeVec(
+	prometheus.GaugeOpts{
+		Name:        "cortex_async_in_flight",
+		Help:        "The number of in-flight messages for an AsyncAPI (including active and queued)",
 		ConstLabels: map[string]string{"api_kind": userconfig.AsyncAPIKind.String()},
 	}, []string{"api_name"},
 )
@@ -74,8 +90,9 @@ func updateQueueLengthMetricsFn(apiName, queueURL string) func() error {
 			return err
 		}
 
-		queueLength := visibleMessages + invisibleMessages
-		queueLengthGauge.WithLabelValues(apiName).Set(queueLength)
+		activeGauge.WithLabelValues(apiName).Set(invisibleMessages)
+		queuedGauge.WithLabelValues(apiName).Set(visibleMessages)
+		inFlightGauge.WithLabelValues(apiName).Set(invisibleMessages + visibleMessages)
 
 		return nil
 	}

Original file line number	Diff line number	Diff line change
`@@ -215,9 +215,7 @@ func Check(awsClient *awslib.Client, k8sClient *k8s.Client, clusterName string)`
`215`	`215`	`}`
`216`	`216`
`217`	`217`	`func GetWarnings(k8sClient *k8s.Client) (ClusterWarnings, error) {`
`218`		`- var (`
`219`		`- prometheusMemorySaturationWarn string`
`220`		`- )`
	`218`	`+ var prometheusMemorySaturationWarn string`
`221`	`219`
`222`	`220`	`saturation, err := getPodMemorySaturation(k8sClient, "prometheus-prometheus-0", "default")`
`223`	`221`	`if err != nil {`