This repository has been archived by the owner on Feb 27, 2023. It is now read-only.

monitoring: use histograms for api latency and cycle time metrics #164

Merged · 2 commits · Jun 28, 2018
221 changes: 205 additions & 16 deletions deployment/grafana/02-grafana-configmap.yaml
@@ -61,7 +61,7 @@ data:
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1528906914777,
"iteration": 1529959091609,
"links": [],
"panels": [
{
@@ -1224,10 +1224,10 @@ data:
"x": 6,
"y": 20
},
"id": 13,
"id": 25,
"legend": {
"alignAsTable": false,
"avg": false,
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
@@ -1250,18 +1250,24 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "avg(gimbal_discoverer_api_latency_ms{backendname=~\"$Backend\"}) by (backendname)",
"expr": "histogram_quantile(0.5, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*loadbalancers.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
"legendFormat": "{{backendname}}",
"legendFormat": "{{kubernetes_pod_name}} 50%",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*loadbalancers.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 99%",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Openstack API Latency",
"title": "Openstack API Latency: Load Balancers Endpoint",
"tooltip": {
"shared": true,
"sort": 0,
@@ -1300,18 +1306,196 @@ data:
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"description": "Time to process all items within a cluster",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 12,
"y": 20
},
"id": 13,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.5, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*pools.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 50%",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*pools.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 99%",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Openstack API Latency: Pools Endpoint",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ms",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"y": 20
},
"id": 26,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.5, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*listeners.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 50%",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*listeners.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 99%",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Openstack API Latency: Listeners",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ms",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"description": "Time to process all items within a cluster",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 27
},
"id": 11,
"legend": {
"alignAsTable": true,
"avg": false,
"avg": true,
"current": true,
"max": false,
"min": false,
@@ -1333,13 +1517,18 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "gimbal_discoverer_cycle_duration_ms{backendname=~\"$Backend\"} ",
"expr": "histogram_quantile(0.5, sum(rate(gimbal_discoverer_cycle_duration_seconds_bucket{backendname=~\"$Backend\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{backendname}}",
"legendFormat": "{{kubernetes_pod_name}} 50%",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(gimbal_discoverer_cycle_duration_seconds_bucket{backendname=~\"$Backend\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 99%",
"refId": "B"
}
],
"thresholds": [],
@@ -1361,7 +1550,7 @@ data:
},
"yaxes": [
{
"format": "ms",
"format": "s",
"label": null,
"logBase": 1,
"max": null,
@@ -1459,7 +1648,7 @@ data:
"timezone": "",
"title": "Gimbal Discovery",
"uid": "ex4WqmZmk",
"version": 21
"version": 2
}
envoy.json: |
{
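All of the new latency panels follow the same pattern: take the per-second rate of each `_bucket` counter over a 5-minute window, sum it by the bucket boundary label `le` plus the dimension to keep (here the pod name), and pass the result to `histogram_quantile`. A minimal sketch of that pattern is below; the second query, which derives a mean from the histogram's `_sum` and `_count` series, is an illustration only and is not one of the panels in this dashboard.

```promql
# p99 OpenStack API latency per discoverer pod (same shape as the panel queries above)
histogram_quantile(
  0.99,
  sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~".*loadbalancers.*"}[5m])) by (le, kubernetes_pod_name)
)

# Mean latency over the same window, computed from _sum and _count
# (illustrative only; not part of this dashboard)
  sum(rate(gimbal_discoverer_api_latency_milliseconds_sum[5m])) by (kubernetes_pod_name)
/
  sum(rate(gimbal_discoverer_api_latency_milliseconds_count[5m])) by (kubernetes_pod_name)
```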
56 changes: 29 additions & 27 deletions discovery/pkg/metrics/metrics.go
@@ -30,21 +30,21 @@ type DiscovererMetrics struct {
}

const (
ServiceEventTimestampGauge = "gimbal_service_event_timestamp"
EndpointsEventTimestampGauge = "gimbal_endpoints_event_timestamp"
ServiceErrorTotalCounter = "gimbal_service_error_total"
EndpointsErrorTotalCounter = "gimbal_endpoints_error_total"
QueueSizeGauge = "gimbal_queuesize"
DiscovererAPILatencyMSGauge = "gimbal_discoverer_api_latency_ms"
DiscovererCycleDurationMSGauge = "gimbal_discoverer_cycle_duration_ms"
DiscovererErrorTotal = "gimbal_discoverer_error_total"
DiscovererUpstreamServicesGauge = "gimbal_discoverer_upstream_services_total"
DiscovererReplicatedServicesGauge = "gimbal_discoverer_replicated_services_total"
DiscovererInvalidServicesGauge = "gimbal_discoverer_invalid_services_total"
DiscovererUpstreamEndpointsGauge = "gimbal_discoverer_upstream_endpoints_total"
DiscovererReplicatedEndpointsGauge = "gimbal_discoverer_replicated_endpoints_total"
DiscovererInvalidEndpointsGauge = "gimbal_discoverer_invalid_endpoints_total"
DiscovererInfoGauge = "gimbal_discoverer_info"
ServiceEventTimestampGauge = "gimbal_service_event_timestamp"
EndpointsEventTimestampGauge = "gimbal_endpoints_event_timestamp"
ServiceErrorTotalCounter = "gimbal_service_error_total"
EndpointsErrorTotalCounter = "gimbal_endpoints_error_total"
QueueSizeGauge = "gimbal_queuesize"
DiscovererAPILatencyMsHistogram = "gimbal_discoverer_api_latency_milliseconds"
DiscovererCycleDurationSecondsHistogram = "gimbal_discoverer_cycle_duration_seconds"
DiscovererErrorTotal = "gimbal_discoverer_error_total"
DiscovererUpstreamServicesGauge = "gimbal_discoverer_upstream_services_total"
DiscovererReplicatedServicesGauge = "gimbal_discoverer_replicated_services_total"
DiscovererInvalidServicesGauge = "gimbal_discoverer_invalid_services_total"
DiscovererUpstreamEndpointsGauge = "gimbal_discoverer_upstream_endpoints_total"
DiscovererReplicatedEndpointsGauge = "gimbal_discoverer_replicated_endpoints_total"
DiscovererInvalidEndpointsGauge = "gimbal_discoverer_invalid_endpoints_total"
DiscovererInfoGauge = "gimbal_discoverer_info"
)

// NewMetrics returns a map of Prometheus metrics
@@ -89,17 +89,19 @@ func NewMetrics(BackendType, BackendName string) DiscovererMetrics {
},
[]string{"backendname", "backendtype"},
),
DiscovererAPILatencyMSGauge: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: DiscovererAPILatencyMSGauge,
Help: "The milliseconds it takes for requests to return from a remote discoverer api",
DiscovererAPILatencyMsHistogram: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: DiscovererAPILatencyMsHistogram,
Help: "The milliseconds it takes for requests to return from a remote discoverer api",
Buckets: []float64{20, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 20000, 50000, 120000}, // milliseconds. largest bucket is 2 minutes.
},
[]string{"backendname", "backendtype", "path"},
),
DiscovererCycleDurationMSGauge: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: DiscovererCycleDurationMSGauge,
Help: "The milliseconds it takes for all objects to be synced from a remote discoverer api",
DiscovererCycleDurationSecondsHistogram: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: DiscovererCycleDurationSecondsHistogram,
Help: "The seconds it takes for all objects to be synced from a remote backend",
Buckets: prometheus.LinearBuckets(60, 60, 10), // 10 buckets, each 60 seconds wide, from 60s to 600s
},
[]string{"backendname", "backendtype"},
),
@@ -228,17 +230,17 @@ func (d *DiscovererMetrics) QueueSizeGaugeMetric(size int) {

// CycleDurationMetric formats a cycle duration gauge prometheus metric
func (d *DiscovererMetrics) CycleDurationMetric(duration time.Duration) {
m, ok := d.Metrics[DiscovererCycleDurationMSGauge].(*prometheus.GaugeVec)
m, ok := d.Metrics[DiscovererCycleDurationSecondsHistogram].(*prometheus.HistogramVec)
if ok {
m.WithLabelValues(d.BackendName, d.BackendType).Set(math.Floor(duration.Seconds() * 1e3))
m.WithLabelValues(d.BackendName, d.BackendType).Observe(math.Floor(duration.Seconds()))
}
}

// APILatencyMetric formats a cycle duration gauge prometheus metric
func (d *DiscovererMetrics) APILatencyMetric(path string, duration time.Duration) {
m, ok := d.Metrics[DiscovererAPILatencyMSGauge].(*prometheus.GaugeVec)
m, ok := d.Metrics[DiscovererAPILatencyMsHistogram].(*prometheus.HistogramVec)
if ok {
m.WithLabelValues(d.BackendName, d.BackendType, path).Set(math.Floor(duration.Seconds() * 1e3))
m.WithLabelValues(d.BackendName, d.BackendType, path).Observe(math.Floor(duration.Seconds() * 1e3))
}
}

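For context on the client_golang side of this change, here is a minimal, self-contained sketch of creating, registering, and observing a latency `HistogramVec` with explicit millisecond buckets and backend/path labels, the same shape as the histogram introduced above. The metric name, bucket values, and label values in the sketch are illustrative assumptions, not copied from Gimbal.

```go
package main

import (
	"log"
	"math/rand"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// apiLatency mirrors the shape of the histogram introduced in this PR:
// explicit millisecond buckets plus backend and path labels. The metric
// name, buckets, and label values here are illustrative, not Gimbal's.
var apiLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "example_api_latency_milliseconds",
		Help:    "Time taken for remote API requests to return, in milliseconds.",
		Buckets: []float64{20, 50, 100, 250, 500, 1000, 2000, 5000},
	},
	[]string{"backendname", "backendtype", "path"},
)

func main() {
	prometheus.MustRegister(apiLatency)

	// Simulate an instrumented API call: time it, then record the duration
	// in milliseconds so the unit matches the metric name.
	go func() {
		for {
			start := time.Now()
			time.Sleep(time.Duration(rand.Intn(200)) * time.Millisecond) // stand-in for the real call
			elapsed := time.Since(start)
			apiLatency.WithLabelValues("openstack-backend", "openstack", "/v2.0/lbaas/loadbalancers").
				Observe(float64(elapsed) / float64(time.Millisecond))
		}
	}()

	// Expose /metrics so Prometheus can scrape the _bucket, _sum, and _count
	// series that the dashboard queries consume.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```

The switch from a gauge to a histogram is what allows the updated Grafana panels to compute 50th and 99th percentiles at query time and to aggregate observations across discoverer pods; a per-scrape gauge only ever exposes the last recorded value.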
4 changes: 2 additions & 2 deletions docs/monitoring.md
@@ -92,11 +92,11 @@ Detailed documentation on stats within Envoy is available on their site: https:/
- **gimbal_queuesize (gauge):** Number of items in process queue with the following labels:
- backendname
- backendtype
- **gimbal_discoverer_api_latency_ms (gauge):** The milliseconds it takes for requests to return from a remote discoverer api (for example Openstack)
- **gimbal_discoverer_api_latency_milliseconds (histogram):** The milliseconds it takes for requests to return from a remote discoverer api (for example OpenStack)
- backendname
- backendtype
- path: API request path
- **gimbal_discoverer_cycle_duration_ms (gauge):** The milliseconds it takes for all objects to be synced from a remote discoverer api (for example Openstack)
- **gimbal_discoverer_cycle_duration_seconds (histogram):** The seconds it takes for all objects to be synced from a remote backend (for example OpenStack)
- backendname
- backendtype
- **gimbal_discoverer_api_error_total (counter):** Number of errors that have occurred when accessing the OpenStack API
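As a usage note for the two histogram metrics documented above, percentiles are computed at query time from the exported `_bucket` series. A sketch for the cycle-duration metric, assuming only the `backendname` label listed in the docs:

```promql
# p99 time for a discoverer to sync all objects from a backend, per backend
histogram_quantile(
  0.99,
  sum(rate(gimbal_discoverer_cycle_duration_seconds_bucket[5m])) by (le, backendname)
)
```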