Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve prometheus scalability, reduce labels/metrics and create an additional node group for prometheus only #2307

Merged
merged 30 commits into from
Jul 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
cc1842f
Experiment with dropping metrics/labels
RobertLucian Jun 28, 2021
ee7c31a
Fix the dropping of metrics/labels for the node exporter
RobertLucian Jun 28, 2021
4ed459e
Drop unnecessary metrics/labels from kubelet
RobertLucian Jun 28, 2021
146c4e8
Drop unnecessary kube-state-metrics metrics/labels
RobertLucian Jun 28, 2021
2f26497
Drop unnecessary metrics/labels from DCGM exporter
RobertLucian Jun 28, 2021
39a639a
Remove unnecessary metrics/labels from istio
RobertLucian Jun 29, 2021
d4739a0
Change labeldrop to labelkeep
RobertLucian Jun 29, 2021
488af33
Add development docs
RobertLucian Jun 29, 2021
ccc80ce
Fixes for node-exporter & prom monitoring
RobertLucian Jun 29, 2021
b382a1f
Fixes to the nodes dashboard
RobertLucian Jun 29, 2021
6527b40
Add missing `le` label for `istio_request_duration_milliseconds_bucke…
RobertLucian Jun 30, 2021
35c5bb7
Add required label for kube-state-metrics exporter
RobertLucian Jun 30, 2021
6ba99b9
Fix batch grafana dashboard
RobertLucian Jun 30, 2021
1526346
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 2, 2021
416bed9
Keep cortex_* metrics
RobertLucian Jul 2, 2021
f465bc1
Separate prometheus and operator workloads
RobertLucian Jul 2, 2021
4289fc6
Validate operator/prometheus node group quotas
RobertLucian Jul 2, 2021
a5be71f
Address cluster info pricing
RobertLucian Jul 2, 2021
d0b22fc
Let the node exporter run on every node
RobertLucian Jul 2, 2021
dc79cbf
Fix istio hpa
RobertLucian Jul 2, 2021
d2c26e5
Change resource requests/limits
RobertLucian Jul 2, 2021
25f5166
Have the prometheus instance type configurable
RobertLucian Jul 2, 2021
21a5d9c
Nits
RobertLucian Jul 2, 2021
65781cb
Update create.md
deliahu Jul 2, 2021
8fc36b8
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 2, 2021
dcff049
Address some PR comments
RobertLucian Jul 2, 2021
f64d0fa
Address PR comments
RobertLucian Jul 2, 2021
5641de5
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 3, 2021
b1fd479
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 3, 2021
5496bc6
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 3, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 18 additions & 12 deletions cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -770,11 +770,12 @@ func cmdInfo(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig, st
infoInterface = infoResponse.ClusterConfig.Config
} else {
infoInterface = map[string]interface{}{
"cluster_config": infoResponse.ClusterConfig.Config,
"cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata,
"node_infos": infoResponse.NodeInfos,
"endpoint_operator": operatorEndpoint,
"endpoint_api": apiEndpoint,
"cluster_config": infoResponse.ClusterConfig.Config,
"cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata,
"worker_node_infos": infoResponse.WorkerNodeInfos,
"operator_node_infos": infoResponse.OperatorNodeInfos,
"endpoint_operator": operatorEndpoint,
"endpoint_api": apiEndpoint,
}
}

Expand Down Expand Up @@ -848,6 +849,8 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
eksPrice := aws.EKSPrices[clusterConfig.Region]
operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price
Expand Down Expand Up @@ -891,17 +894,20 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
totalNodeGroupsPrice += totalEBSPrice + totalInstancePrice
}

operatorNodeGroupPrice := float64(len(infoResponse.OperatorNodeInfos)) * (operatorInstancePrice + operatorEBSPrice)
prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice

var natTotalPrice float64
if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
natTotalPrice = natUnitPrice
} else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway {
natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones))
}
totalPrice := eksPrice + totalNodeGroupsPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + nlbPrice*2 + natTotalPrice
totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + nlbPrice*2 + natTotalPrice
fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))

operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice
rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)})
rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"})

if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
Expand All @@ -919,11 +925,11 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
}

func printInfoNodes(infoResponse *schema.InfoResponse) {
numAPIInstances := len(infoResponse.NodeInfos)
numAPIInstances := len(infoResponse.WorkerNodeInfos)

var totalReplicas int
var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool
for _, nodeInfo := range infoResponse.NodeInfos {
for _, nodeInfo := range infoResponse.WorkerNodeInfos {
totalReplicas += nodeInfo.NumReplicas
if nodeInfo.ComputeUserCapacity.GPU > 0 {
doesClusterHaveGPUs = true
Expand All @@ -946,7 +952,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {

fmt.Printf(console.Bold("\nyour cluster has %d API %s running across %d %s%s\n"), totalReplicas, s.PluralS("replica", totalReplicas), numAPIInstances, s.PluralS("instance", numAPIInstances), pendingReplicasStr)

if len(infoResponse.NodeInfos) == 0 {
if len(infoResponse.WorkerNodeInfos) == 0 {
return
}

Expand All @@ -963,7 +969,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
}

var rows [][]interface{}
for _, nodeInfo := range infoResponse.NodeInfos {
for _, nodeInfo := range infoResponse.WorkerNodeInfos {
lifecycle := "on-demand"
if nodeInfo.IsSpot {
lifecycle = "spot"
Expand Down
10 changes: 7 additions & 3 deletions cli/cmd/lib_cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,9 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste
func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient *aws.Client, disallowPrompt bool) {
eksPrice := aws.EKSPrices[clusterConfig.Region]
operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price
Expand All @@ -184,7 +186,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)})

ngNameToSpotInstancesUsed := map[string]int{}
fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + 2*nlbPrice + natTotalPrice
fixedPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice
totalMinPrice := fixedPrice
totalMaxPrice := fixedPrice
for _, ng := range clusterConfig.NodeGroups {
Expand Down Expand Up @@ -223,8 +225,10 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
rows = append(rows, []interface{}{workerInstanceStr, workerPriceStr})
}

operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice
rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)})
operatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice
prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
rows = append(rows, []interface{}{"1 t3.medium instance (cortex system)", s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"})

if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
Expand Down
211 changes: 211 additions & 0 deletions dev/prometheus.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
# Metrics

## Updating metrics

When new metrics/labels/exporters are added to be scraped by prometheus, make sure the following list **is updated** as well to keep track of what metrics/labels are needed or not.

The following is a list of metrics that are currently in use.

#### Cortex metrics

1. cortex_in_flight_requests with the following labels:
1. api_name
1. cortex_async_request_count with the following labels:
1. api_name
1. api_kind
1. status_code
1. cortex_async_queue_length with the following labels:
1. api_name
1. api_kind
1. cortex_async_latency_bucket with the following labels:
1. api_name
1. api_kind
1. cortex_batch_succeeded with the following labels:
1. api_name
1. cortex_batch_failed with the following labels:
1. api_name
1. cortex_time_per_batch_sum with the following labels:
1. api_name
1. cortex_time_per_batch_count with the following labels:
1. api_name

#### Istio metrics

1. istio_requests_total with the following labels:
1. destination_service
1. response_code
1. istio_request_duration_milliseconds_bucket with the following labels:
1. destination_service
1. le
1. istio_request_duration_milliseconds_sum with the following labels:
1. destination_service
1. istio_request_duration_milliseconds_count with the following labels:
1. destination_service

#### Kubelet metrics
1. container_cpu_usage_seconds_total with the following labels:
1. pod
1. container
1. name
1. container_memory_working_set_bytes with the following labels:
1. pod
1. name
1. container

#### Kube-state-metrics metrics

1. kube_pod_container_resource_requests with the following labels:
1. exported_pod
1. resource
    1. exported_container (required to avoid dropping the values for each container of each pod)
1. kube_pod_info with the following labels:
1. exported_pod
1. kube_deployment_status_replicas_available with the following labels:
1. deployment
1. kube_job_status_active with the following labels:
1. job_name

#### DCGM metrics

1. DCGM_FI_DEV_GPU_UTIL with the following labels:
1. exported_pod
1. DCGM_FI_DEV_FB_USED with the following labels:
1. exported_pod
1. DCGM_FI_DEV_FB_FREE with the following labels:
1. exported_pod

#### Node metrics

1. node_cpu_seconds_total with the following labels:
1. job
1. mode
1. instance
1. cpu
1. node_load1 with the following labels:
1. job
1. instance
1. node_load5 with the following labels:
1. job
1. instance
1. node_load15 with the following labels:
1. job
1. instance
1. node_exporter_build_info with the following labels:
1. job
1. instance
1. node_memory_MemTotal_bytes with the following labels:
1. job
1. instance
1. node_memory_MemFree_bytes with the following labels:
1. job
1. instance
1. node_memory_Buffers_bytes with the following labels:
1. job
1. instance
1. node_memory_Cached_bytes with the following labels:
1. job
1. instance
1. node_memory_MemAvailable_bytes with the following labels:
1. job
1. instance
1. node_disk_read_bytes_total with the following labels:
1. job
1. instance
1. device
1. node_disk_written_bytes_total with the following labels:
1. job
1. instance
1. device
1. node_disk_io_time_seconds_total with the following labels:
1. job
1. instance
1. device
1. node_filesystem_size_bytes with the following labels:
1. job
1. instance
1. fstype
1. mountpoint
1. device
1. node_filesystem_avail_bytes with the following labels:
1. job
1. instance
1. fstype
1. device
1. node_network_receive_bytes_total with the following labels:
1. job
1. instance
1. device
1. node_network_transmit_bytes_total with the following labels:
1. job
1. instance
1. device

##### Prometheus rules for the node exporter

1. instance:node_cpu_utilisation:rate1m from the following metrics:
1. node_cpu_seconds_total with the following labels:
1. job
1. mode
1. instance:node_num_cpu:sum from the following metrics:
1. node_cpu_seconds_total with the following labels:
1. job
1. instance:node_load1_per_cpu:ratio from the following metrics:
1. node_load1 with the following labels:
1. job
1. instance:node_memory_utilisation:ratio from the following metrics:
1. node_memory_MemTotal_bytes with the following labels:
1. job
1. node_memory_MemAvailable_bytes with the following labels:
1. job
1. instance:node_vmstat_pgmajfault:rate1m with the following metrics:
1. node_vmstat_pgmajfault with the following labels:
1. job
1. instance_device:node_disk_io_time_seconds:rate1m with the following metrics:
1. node_disk_io_time_seconds_total with the following labels:
1. job
1. device
1. instance_device:node_disk_io_time_weighted_seconds:rate1m with the following metrics:
1. node_disk_io_time_weighted_seconds with the following labels:
1. job
1. device
1. instance:node_network_receive_bytes_excluding_lo:rate1m with the following metrics:
1. node_network_receive_bytes_total with the following labels:
1. job
1. device
1. instance:node_network_transmit_bytes_excluding_lo:rate1m with the following metrics:
1. node_network_transmit_bytes_total with the following labels:
1. job
1. device
1. instance:node_network_receive_drop_excluding_lo:rate1m with the following metrics:
1. node_network_receive_drop_total with the following labels:
1. job
1. device
1. instance:node_network_transmit_drop_excluding_lo:rate1m with the following metrics:
1. node_network_transmit_drop_total with the following labels:
1. job
1. device

## Re-introducing dropped metrics/labels

If you need to add some metrics/labels back for a particular use case, comment out every `metricRelabelings:` section (except the one from the `prometheus-operator.yaml` file), determine which metrics/labels you want to add back (e.g. by using the explorer in Grafana), and then re-edit the appropriate `metricRelabelings:` sections to account for the un-dropped metrics/labels.

## Prometheus Analysis

### Go Pprof

To analyse the memory allocations of prometheus, run `kubectl port-forward prometheus-prometheus-0 9090:9090`, and then run `go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap`. Once you are in the interactive pprof shell, you can run `top` or `dot` for a more detailed hierarchy of the memory usage.

### TSDB

To analyse the TSDB of prometheus, exec into the `prometheus-prometheus-0` pod, `cd` into `/tmp`, and run the following code-block:

```bash
wget https://github.com/prometheus/prometheus/releases/download/v1.7.3/prometheus-1.7.3.linux-amd64.tar.gz
tar -xzf prometheus-*
cd prometheus-*
./tsdb analyze /prometheus | less
```

*Useful link: https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality*

Or you can go to `localhost:9090` -> `Status` -> `TSDB Status`, but it's not as complete as running a binary analysis.
5 changes: 4 additions & 1 deletion docs/clusters/management/create.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,14 @@ tags: # <string>: <string> map of key/value pairs
# SSL certificate ARN (only necessary when using a custom domain)
ssl_certificate_arn:

# List of IAM policies to attach to your Cortex APIs
# list of IAM policies to attach to your Cortex APIs
iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"]

# primary CIDR block for the cluster's VPC
vpc_cidr: 192.168.0.0/16

# instance type for prometheus (use a larger instance for clusters exceeding 500 nodes)
prometheus_instance_type: "t3.medium"
```

The docker images used by the cluster can also be overridden. They can be configured by adding any of these keys to your cluster configuration file (default values are shown):
Expand Down
27 changes: 23 additions & 4 deletions manager/generate_eks.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def generate_eks(
cluster_config = yaml.safe_load(cluster_config_file)
region = cluster_config["region"]
name = cluster_config["cluster_name"]
prometheus_instance_type = cluster_config["prometheus_instance_type"]
ami_map = json.load(ami_json_file)[K8S_VERSION][region]

eks = {
Expand Down Expand Up @@ -309,16 +310,34 @@ def generate_eks(
"ami": get_ami(ami_map, "t3.medium"),
"name": "cx-operator",
"instanceType": "t3.medium",
"minSize": 2,
"maxSize": 2,
"desiredCapacity": 2,
"minSize": 1,
"maxSize": 25,
"desiredCapacity": 1,
"volumeType": "gp3",
"volumeSize": 20,
"volumeIOPS": 3000,
"volumeThroughput": 125,
"labels": {"operator": "true"},
}
operator_nodegroup = merge_override(operator_nodegroup, operator_settings)

prometheus_nodegroup = default_nodegroup(cluster_config)
prometheus_settings = {
"ami": get_ami(ami_map, prometheus_instance_type),
"name": "cx-prometheus",
"instanceType": prometheus_instance_type,
"minSize": 1,
"maxSize": 1,
"desiredCapacity": 1,
"volumeType": "gp3",
"volumeSize": 20,
"volumeIOPS": 3000,
"volumeThroughput": 125,
"labels": {"prometheus": "true"},
"taints": {"prometheus": "true:NoSchedule"},
}
prometheus_nodegroup = merge_override(prometheus_nodegroup, prometheus_settings)

worker_nodegroups = get_all_worker_nodegroups(ami_map, cluster_config)

nat_gateway = "Disable"
Expand All @@ -337,7 +356,7 @@ def generate_eks(
"tags": cluster_config["tags"],
},
"vpc": {"nat": {"gateway": nat_gateway}},
"nodeGroups": [operator_nodegroup] + worker_nodegroups,
"nodeGroups": [operator_nodegroup, prometheus_nodegroup] + worker_nodegroups,
"addons": [
{
"name": "vpc-cni",
Expand Down
Loading