Commit d21d50d

Improve prometheus scalability, reduce labels/metrics and create an additional node group for prometheus only (#2307)
1 parent d3aeb5e commit d21d50d

28 files changed: +594 −100 lines

cli/cmd/cluster.go (+18 −12)
```diff
@@ -770,11 +770,12 @@ func cmdInfo(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig, st
         infoInterface = infoResponse.ClusterConfig.Config
     } else {
         infoInterface = map[string]interface{}{
-            "cluster_config":    infoResponse.ClusterConfig.Config,
-            "cluster_metadata":  infoResponse.ClusterConfig.OperatorMetadata,
-            "node_infos":        infoResponse.NodeInfos,
-            "endpoint_operator": operatorEndpoint,
-            "endpoint_api":      apiEndpoint,
+            "cluster_config":      infoResponse.ClusterConfig.Config,
+            "cluster_metadata":    infoResponse.ClusterConfig.OperatorMetadata,
+            "worker_node_infos":   infoResponse.WorkerNodeInfos,
+            "operator_node_infos": infoResponse.OperatorNodeInfos,
+            "endpoint_operator":   operatorEndpoint,
+            "endpoint_api":        apiEndpoint,
         }
     }

@@ -848,6 +849,8 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
     eksPrice := aws.EKSPrices[clusterConfig.Region]
     operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
     operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
+    prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
+    prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
     metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
     nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
     natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price
@@ -891,17 +894,20 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
         totalNodeGroupsPrice += totalEBSPrice + totalInstancePrice
     }

+    operatorNodeGroupPrice := float64(len(infoResponse.OperatorNodeInfos)) * (operatorInstancePrice + operatorEBSPrice)
+    prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
+
     var natTotalPrice float64
     if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
         natTotalPrice = natUnitPrice
     } else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway {
         natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones))
     }
-    totalPrice := eksPrice + totalNodeGroupsPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + nlbPrice*2 + natTotalPrice
+    totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + nlbPrice*2 + natTotalPrice
     fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))

-    operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice
-    rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)})
+    rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
+    rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
     rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"})

     if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
@@ -919,11 +925,11 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
     }

 func printInfoNodes(infoResponse *schema.InfoResponse) {
-    numAPIInstances := len(infoResponse.NodeInfos)
+    numAPIInstances := len(infoResponse.WorkerNodeInfos)

     var totalReplicas int
     var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool
-    for _, nodeInfo := range infoResponse.NodeInfos {
+    for _, nodeInfo := range infoResponse.WorkerNodeInfos {
         totalReplicas += nodeInfo.NumReplicas
         if nodeInfo.ComputeUserCapacity.GPU > 0 {
             doesClusterHaveGPUs = true
@@ -946,7 +952,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {

     fmt.Printf(console.Bold("\nyour cluster has %d API %s running across %d %s%s\n"), totalReplicas, s.PluralS("replica", totalReplicas), numAPIInstances, s.PluralS("instance", numAPIInstances), pendingReplicasStr)

-    if len(infoResponse.NodeInfos) == 0 {
+    if len(infoResponse.WorkerNodeInfos) == 0 {
         return
     }

@@ -963,7 +969,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
     }

     var rows [][]interface{}
-    for _, nodeInfo := range infoResponse.NodeInfos {
+    for _, nodeInfo := range infoResponse.WorkerNodeInfos {
         lifecycle := "on-demand"
         if nodeInfo.IsSpot {
             lifecycle = "spot"
```
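With `cmdInfo` now splitting node details into `WorkerNodeInfos` and `OperatorNodeInfos`, the structured output of `cortex cluster info` exposes `worker_node_infos` and `operator_node_infos` keys in place of the old `node_infos`, and the pricing table itemizes the prometheus node group. A minimal usage sketch; the `--name`/`--region` flags are assumptions based on typical cortex CLI invocations, so check `cortex cluster info --help` on your release:

```bash
# print the cluster summary, including the new per-nodegroup pricing rows
# (flag names are assumed; adjust to your CLI version)
cortex cluster info --name cortex --region us-east-1
```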

cli/cmd/lib_cluster_config.go (+7 −3)
```diff
@@ -163,7 +163,9 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste
 func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient *aws.Client, disallowPrompt bool) {
     eksPrice := aws.EKSPrices[clusterConfig.Region]
     operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
+    prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
     operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
+    prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
     metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
     nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
     natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price
@@ -184,7 +186,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
     rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)})

     ngNameToSpotInstancesUsed := map[string]int{}
-    fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + 2*nlbPrice + natTotalPrice
+    fixedPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice
     totalMinPrice := fixedPrice
     totalMaxPrice := fixedPrice
     for _, ng := range clusterConfig.NodeGroups {
@@ -223,8 +225,10 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
         rows = append(rows, []interface{}{workerInstanceStr, workerPriceStr})
     }

-    operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice
-    rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)})
+    operatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice
+    prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
+    rows = append(rows, []interface{}{"1 t3.medium instance (cortex system)", s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
+    rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
     rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"})

     if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
```

dev/prometheus.md (+211, new file)

# Metrics

## Updating metrics

When new metrics/labels/exporters are added for prometheus to scrape, make sure the following list **is updated** as well, so that it keeps track of which metrics/labels are needed and which are not.

The following is a list of metrics that are currently in use.

#### Cortex metrics

1. cortex_in_flight_requests with the following labels:
   1. api_name
1. cortex_async_request_count with the following labels:
   1. api_name
   1. api_kind
   1. status_code
1. cortex_async_queue_length with the following labels:
   1. api_name
   1. api_kind
1. cortex_async_latency_bucket with the following labels:
   1. api_name
   1. api_kind
1. cortex_batch_succeeded with the following labels:
   1. api_name
1. cortex_batch_failed with the following labels:
   1. api_name
1. cortex_time_per_batch_sum with the following labels:
   1. api_name
1. cortex_time_per_batch_count with the following labels:
   1. api_name

#### Istio metrics

1. istio_requests_total with the following labels:
   1. destination_service
   1. response_code
1. istio_request_duration_milliseconds_bucket with the following labels:
   1. destination_service
   1. le
1. istio_request_duration_milliseconds_sum with the following labels:
   1. destination_service
1. istio_request_duration_milliseconds_count with the following labels:
   1. destination_service

#### Kubelet metrics

1. container_cpu_usage_seconds_total with the following labels:
   1. pod
   1. container
   1. name
1. container_memory_working_set_bytes with the following labels:
   1. pod
   1. name
   1. container

#### Kube-state-metrics metrics

1. kube_pod_container_resource_requests with the following labels:
   1. exported_pod
   1. resource
   1. exported_container (required to avoid dropping the values for each container of each pod)
1. kube_pod_info with the following labels:
   1. exported_pod
1. kube_deployment_status_replicas_available with the following labels:
   1. deployment
1. kube_job_status_active with the following labels:
   1. job_name

#### DCGM metrics

1. DCGM_FI_DEV_GPU_UTIL with the following labels:
   1. exported_pod
1. DCGM_FI_DEV_FB_USED with the following labels:
   1. exported_pod
1. DCGM_FI_DEV_FB_FREE with the following labels:
   1. exported_pod

#### Node metrics

1. node_cpu_seconds_total with the following labels:
   1. job
   1. mode
   1. instance
   1. cpu
1. node_load1 with the following labels:
   1. job
   1. instance
1. node_load5 with the following labels:
   1. job
   1. instance
1. node_load15 with the following labels:
   1. job
   1. instance
1. node_exporter_build_info with the following labels:
   1. job
   1. instance
1. node_memory_MemTotal_bytes with the following labels:
   1. job
   1. instance
1. node_memory_MemFree_bytes with the following labels:
   1. job
   1. instance
1. node_memory_Buffers_bytes with the following labels:
   1. job
   1. instance
1. node_memory_Cached_bytes with the following labels:
   1. job
   1. instance
1. node_memory_MemAvailable_bytes with the following labels:
   1. job
   1. instance
1. node_disk_read_bytes_total with the following labels:
   1. job
   1. instance
   1. device
1. node_disk_written_bytes_total with the following labels:
   1. job
   1. instance
   1. device
1. node_disk_io_time_seconds_total with the following labels:
   1. job
   1. instance
   1. device
1. node_filesystem_size_bytes with the following labels:
   1. job
   1. instance
   1. fstype
   1. mountpoint
   1. device
1. node_filesystem_avail_bytes with the following labels:
   1. job
   1. instance
   1. fstype
   1. device
1. node_network_receive_bytes_total with the following labels:
   1. job
   1. instance
   1. device
1. node_network_transmit_bytes_total with the following labels:
   1. job
   1. instance
   1. device

##### Prometheus rules for the node exporter

1. instance:node_cpu_utilisation:rate1m from the following metrics:
   1. node_cpu_seconds_total with the following labels:
      1. job
      1. mode
1. instance:node_num_cpu:sum from the following metrics:
   1. node_cpu_seconds_total with the following labels:
      1. job
1. instance:node_load1_per_cpu:ratio from the following metrics:
   1. node_load1 with the following labels:
      1. job
1. instance:node_memory_utilisation:ratio from the following metrics:
   1. node_memory_MemTotal_bytes with the following labels:
      1. job
   1. node_memory_MemAvailable_bytes with the following labels:
      1. job
1. instance:node_vmstat_pgmajfault:rate1m from the following metrics:
   1. node_vmstat_pgmajfault with the following labels:
      1. job
1. instance_device:node_disk_io_time_seconds:rate1m from the following metrics:
   1. node_disk_io_time_seconds_total with the following labels:
      1. job
      1. device
1. instance_device:node_disk_io_time_weighted_seconds:rate1m from the following metrics:
   1. node_disk_io_time_weighted_seconds with the following labels:
      1. job
      1. device
1. instance:node_network_receive_bytes_excluding_lo:rate1m from the following metrics:
   1. node_network_receive_bytes_total with the following labels:
      1. job
      1. device
1. instance:node_network_transmit_bytes_excluding_lo:rate1m from the following metrics:
   1. node_network_transmit_bytes_total with the following labels:
      1. job
      1. device
1. instance:node_network_receive_drop_excluding_lo:rate1m from the following metrics:
   1. node_network_receive_drop_total with the following labels:
      1. job
      1. device
1. instance:node_network_transmit_drop_excluding_lo:rate1m from the following metrics:
   1. node_network_transmit_drop_total with the following labels:
      1. job
      1. device

## Re-introducing dropped metrics/labels

If you need to add some metrics/labels back for a particular use case, comment out every `metricRelabelings:` section (except the one in the `prometheus-operator.yaml` file), determine which metrics/labels you want to add back (e.g. by using the explorer in Grafana), and then re-edit the appropriate `metricRelabelings:` sections to account for the un-dropped metrics/labels.
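A quick way to locate every `metricRelabelings:` section before commenting them out is a repo-wide grep; this is a sketch that assumes it is run from the repository root:

```bash
# list each manifest (with line numbers) that defines a metricRelabelings: section
grep -rn --include='*.yaml' 'metricRelabelings:' .
```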

## Prometheus Analysis

### Go Pprof

To analyse the memory allocations of prometheus, run `kubectl port-forward prometheus-prometheus-0 9090:9090`, and then run `go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap`. Once you get the interactive interpreter, you can run `top` or `dot` for a more detailed hierarchy of the memory usage.
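The same steps, collected into one copy-pasteable block (this assumes the prometheus pod is named `prometheus-prometheus-0`, as elsewhere in this document):

```bash
# expose prometheus on localhost:9090 (leave this running in another shell)
kubectl port-forward prometheus-prometheus-0 9090:9090 &

# fetch the heap profile and drop into the interactive pprof interpreter;
# `top` ranks allocation sites, `dot` renders the full call-graph hierarchy
go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap
```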

### TSDB

To analyse the TSDB of prometheus, exec into the `prometheus-prometheus-0` pod, `cd` into `/tmp`, and run the following code block:

```bash
wget https://github.com/prometheus/prometheus/releases/download/v1.7.3/prometheus-1.7.3.linux-amd64.tar.gz
tar -xzf prometheus-*
cd prometheus-*
./tsdb analyze /prometheus | less
```

*Useful link: https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality*

Alternatively, you can go to `localhost:9090` -> `Status` -> `TSDB Status`, but it's not as complete as running the binary analysis.
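Reaching that page also requires port-forwarding the prometheus pod first, e.g.:

```bash
kubectl port-forward prometheus-prometheus-0 9090:9090
# then open http://localhost:9090 in a browser and navigate to Status -> TSDB Status
```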

docs/clusters/management/create.md (+4 −1)
````diff
@@ -86,11 +86,14 @@ tags: # <string>: <string> map of key/value pairs
 # SSL certificate ARN (only necessary when using a custom domain)
 ssl_certificate_arn:

-# List of IAM policies to attach to your Cortex APIs
+# list of IAM policies to attach to your Cortex APIs
 iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"]

 # primary CIDR block for the cluster's VPC
 vpc_cidr: 192.168.0.0/16
+
+# instance type for prometheus (use a larger instance for clusters exceeding 500 nodes)
+prometheus_instance_type: "t3.medium"
 ```

 The docker images used by the cluster can also be overridden. They can be configured by adding any of these keys to your cluster configuration file (default values are shown):
````
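The new field takes effect when the cluster is created or updated; a hedged usage sketch, assuming the configuration above is saved as `cluster.yaml` (older CLI releases pass the file via `--config` instead of positionally):

```bash
# spin up a cluster whose dedicated prometheus node runs on the configured instance type
cortex cluster up cluster.yaml
```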

manager/generate_eks.py (+23 −4)
```diff
@@ -273,6 +273,7 @@ def generate_eks(
     cluster_config = yaml.safe_load(cluster_config_file)
     region = cluster_config["region"]
     name = cluster_config["cluster_name"]
+    prometheus_instance_type = cluster_config["prometheus_instance_type"]
     ami_map = json.load(ami_json_file)[K8S_VERSION][region]

     eks = {
@@ -309,16 +310,34 @@ def generate_eks(
         "ami": get_ami(ami_map, "t3.medium"),
         "name": "cx-operator",
         "instanceType": "t3.medium",
-        "minSize": 2,
-        "maxSize": 2,
-        "desiredCapacity": 2,
+        "minSize": 1,
+        "maxSize": 25,
+        "desiredCapacity": 1,
         "volumeType": "gp3",
         "volumeSize": 20,
         "volumeIOPS": 3000,
         "volumeThroughput": 125,
+        "labels": {"operator": "true"},
     }
     operator_nodegroup = merge_override(operator_nodegroup, operator_settings)

+    prometheus_nodegroup = default_nodegroup(cluster_config)
+    prometheus_settings = {
+        "ami": get_ami(ami_map, prometheus_instance_type),
+        "name": "cx-prometheus",
+        "instanceType": prometheus_instance_type,
+        "minSize": 1,
+        "maxSize": 1,
+        "desiredCapacity": 1,
+        "volumeType": "gp3",
+        "volumeSize": 20,
+        "volumeIOPS": 3000,
+        "volumeThroughput": 125,
+        "labels": {"prometheus": "true"},
+        "taints": {"prometheus": "true:NoSchedule"},
+    }
+    prometheus_nodegroup = merge_override(prometheus_nodegroup, prometheus_settings)
+
     worker_nodegroups = get_all_worker_nodegroups(ami_map, cluster_config)

     nat_gateway = "Disable"
@@ -337,7 +356,7 @@ def generate_eks(
             "tags": cluster_config["tags"],
         },
         "vpc": {"nat": {"gateway": nat_gateway}},
-        "nodeGroups": [operator_nodegroup] + worker_nodegroups,
+        "nodeGroups": [operator_nodegroup, prometheus_nodegroup] + worker_nodegroups,
         "addons": [
             {
                 "name": "vpc-cni",
```
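Once a cluster is up, the dedicated node group can be checked via the label and taint set in `generate_eks.py` above; a sketch using standard kubectl selectors:

```bash
# nodes in the cx-prometheus node group carry the prometheus=true label
kubectl get nodes -l prometheus=true

# and the prometheus=true:NoSchedule taint keeps non-tolerating pods off them
kubectl get nodes -l prometheus=true \
  -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.taints}{"\n"}{end}'
```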