Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve prometheus scalability, reduce labels/metrics and create an additional node group for prometheus only #2307

Merged
merged 30 commits into from
Jul 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
cc1842f
Experiment with dropping metrics/labels
RobertLucian Jun 28, 2021
ee7c31a
Fix the dropping of metrics/labels for the node exporter
RobertLucian Jun 28, 2021
4ed459e
Drop unnecessary metrics/labels from kubelet
RobertLucian Jun 28, 2021
146c4e8
Drop unnecessary kube-state-metrics metrics/labels
RobertLucian Jun 28, 2021
2f26497
Drop unnecessary metrics/labels from DCGM exporter
RobertLucian Jun 28, 2021
39a639a
Remove unnecessary metrics/labels from istio
RobertLucian Jun 29, 2021
d4739a0
Change labeldrop to labelkeep
RobertLucian Jun 29, 2021
488af33
Add development docs
RobertLucian Jun 29, 2021
ccc80ce
Fixes for node-exporter & prom monitoring
RobertLucian Jun 29, 2021
b382a1f
Fixes to the nodes dashboard
RobertLucian Jun 29, 2021
6527b40
Add missing `le` label for `istio_request_duration_milliseconds_bucke…
RobertLucian Jun 30, 2021
35c5bb7
Add required label for kube-state-metrics exporter
RobertLucian Jun 30, 2021
6ba99b9
Fix batch grafana dashboard
RobertLucian Jun 30, 2021
1526346
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 2, 2021
416bed9
Keep cortex_* metrics
RobertLucian Jul 2, 2021
f465bc1
Separate prometheus and operator workloads
RobertLucian Jul 2, 2021
4289fc6
Validate operator/prometheus node group quotas
RobertLucian Jul 2, 2021
a5be71f
Address cluster info pricing
RobertLucian Jul 2, 2021
d0b22fc
Let the node exporter run on every node
RobertLucian Jul 2, 2021
dc79cbf
Fix istio hpa
RobertLucian Jul 2, 2021
d2c26e5
Change resource requests/limits
RobertLucian Jul 2, 2021
25f5166
Have the prometheus instance type configurable
RobertLucian Jul 2, 2021
21a5d9c
Nits
RobertLucian Jul 2, 2021
65781cb
Update create.md
deliahu Jul 2, 2021
8fc36b8
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 2, 2021
dcff049
Address some PR comments
RobertLucian Jul 2, 2021
f64d0fa
Address PR comments
RobertLucian Jul 2, 2021
5641de5
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 3, 2021
b1fd479
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 3, 2021
5496bc6
Merge branch 'master' into fix/prometheus-oom
RobertLucian Jul 3, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 18 additions & 12 deletions cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -770,11 +770,12 @@ func cmdInfo(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig, st
infoInterface = infoResponse.ClusterConfig.Config
} else {
infoInterface = map[string]interface{}{
"cluster_config": infoResponse.ClusterConfig.Config,
"cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata,
"node_infos": infoResponse.NodeInfos,
"endpoint_operator": operatorEndpoint,
"endpoint_api": apiEndpoint,
"cluster_config": infoResponse.ClusterConfig.Config,
"cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata,
"worker_node_infos": infoResponse.WorkerNodeInfos,
"operator_node_infos": infoResponse.OperatorNodeInfos,
"endpoint_operator": operatorEndpoint,
"endpoint_api": apiEndpoint,
}
}

Expand Down Expand Up @@ -848,6 +849,8 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
eksPrice := aws.EKSPrices[clusterConfig.Region]
operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price
Expand Down Expand Up @@ -891,17 +894,20 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
totalNodeGroupsPrice += totalEBSPrice + totalInstancePrice
}

operatorNodeGroupPrice := float64(len(infoResponse.OperatorNodeInfos)) * (operatorInstancePrice + operatorEBSPrice)
prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice

var natTotalPrice float64
if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
natTotalPrice = natUnitPrice
} else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway {
natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones))
}
totalPrice := eksPrice + totalNodeGroupsPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + nlbPrice*2 + natTotalPrice
totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + nlbPrice*2 + natTotalPrice
fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))

operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice
rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)})
rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"})

if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
Expand All @@ -919,11 +925,11 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
}

func printInfoNodes(infoResponse *schema.InfoResponse) {
numAPIInstances := len(infoResponse.NodeInfos)
numAPIInstances := len(infoResponse.WorkerNodeInfos)

var totalReplicas int
var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool
for _, nodeInfo := range infoResponse.NodeInfos {
for _, nodeInfo := range infoResponse.WorkerNodeInfos {
totalReplicas += nodeInfo.NumReplicas
if nodeInfo.ComputeUserCapacity.GPU > 0 {
doesClusterHaveGPUs = true
Expand All @@ -946,7 +952,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {

fmt.Printf(console.Bold("\nyour cluster has %d API %s running across %d %s%s\n"), totalReplicas, s.PluralS("replica", totalReplicas), numAPIInstances, s.PluralS("instance", numAPIInstances), pendingReplicasStr)

if len(infoResponse.NodeInfos) == 0 {
if len(infoResponse.WorkerNodeInfos) == 0 {
return
}

Expand All @@ -963,7 +969,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
}

var rows [][]interface{}
for _, nodeInfo := range infoResponse.NodeInfos {
for _, nodeInfo := range infoResponse.WorkerNodeInfos {
lifecycle := "on-demand"
if nodeInfo.IsSpot {
lifecycle = "spot"
Expand Down
10 changes: 7 additions & 3 deletions cli/cmd/lib_cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,9 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste
func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient *aws.Client, disallowPrompt bool) {
eksPrice := aws.EKSPrices[clusterConfig.Region]
operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price
Expand All @@ -184,7 +186,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)})

ngNameToSpotInstancesUsed := map[string]int{}
fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + 2*nlbPrice + natTotalPrice
fixedPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice
totalMinPrice := fixedPrice
totalMaxPrice := fixedPrice
for _, ng := range clusterConfig.NodeGroups {
Expand Down Expand Up @@ -223,8 +225,10 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
rows = append(rows, []interface{}{workerInstanceStr, workerPriceStr})
}

operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice
rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)})
operatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice
prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
rows = append(rows, []interface{}{"1 t3.medium instance (cortex system)", s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"})

if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
Expand Down
211 changes: 211 additions & 0 deletions dev/prometheus.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
# Metrics

## Updating metrics

When new metrics/labels/exporters are added to be scraped by prometheus, make sure the following list **is updated** as well to keep track of what metrics/labels are needed or not.

The following is a list of metrics that are currently in use.

#### Cortex metrics

1. cortex_in_flight_requests with the following labels:
1. api_name
1. cortex_async_request_count with the following labels:
1. api_name
1. api_kind
1. status_code
1. cortex_async_queue_length with the following labels:
1. api_name
1. api_kind
1. cortex_async_latency_bucket with the following labels:
1. api_name
1. api_kind
1. cortex_batch_succeeded with the following labels:
1. api_name
1. cortex_batch_failed with the following labels:
1. api_name
1. cortex_time_per_batch_sum with the following labels:
1. api_name
1. cortex_time_per_batch_count with the following labels:
1. api_name

#### Istio metrics

1. istio_requests_total with the following labels:
1. destination_service
1. response_code
1. istio_request_duration_milliseconds_bucket with the following labels:
1. destination_service
1. le
1. istio_request_duration_milliseconds_sum with the following labels:
1. destination_service
1. istio_request_duration_milliseconds_count with the following labels:
1. destination_service

#### Kubelet metrics
1. container_cpu_usage_seconds_total with the following labels:
1. pod
1. container
1. name
1. container_memory_working_set_bytes with the following labels:
1. pod
1. name
1. container

#### Kube-state-metrics metrics

1. kube_pod_container_resource_requests with the following labels:
1. exported_pod
1. resource
    1. exported_container (required to avoid dropping the values for each container of each pod)
1. kube_pod_info with the following labels:
1. exported_pod
1. kube_deployment_status_replicas_available with the following labels:
1. deployment
1. kube_job_status_active with the following labels:
1. job_name

#### DCGM metrics

1. DCGM_FI_DEV_GPU_UTIL with the following labels:
1. exported_pod
1. DCGM_FI_DEV_FB_USED with the following labels:
1. exported_pod
1. DCGM_FI_DEV_FB_FREE with the following labels:
1. exported_pod

#### Node metrics

1. node_cpu_seconds_total with the following labels:
1. job
1. mode
1. instance
1. cpu
1. node_load1 with the following labels:
1. job
1. instance
1. node_load5 with the following labels:
1. job
1. instance
1. node_load15 with the following labels:
1. job
1. instance
1. node_exporter_build_info with the following labels:
1. job
1. instance
1. node_memory_MemTotal_bytes with the following labels:
1. job
1. instance
1. node_memory_MemFree_bytes with the following labels:
1. job
1. instance
1. node_memory_Buffers_bytes with the following labels:
1. job
1. instance
1. node_memory_Cached_bytes with the following labels:
1. job
1. instance
1. node_memory_MemAvailable_bytes with the following labels:
1. job
1. instance
1. node_disk_read_bytes_total with the following labels:
1. job
1. instance
1. device
1. node_disk_written_bytes_total with the following labels:
1. job
1. instance
1. device
1. node_disk_io_time_seconds_total with the following labels:
1. job
1. instance
1. device
1. node_filesystem_size_bytes with the following labels:
1. job
1. instance
1. fstype
1. mountpoint
1. device
1. node_filesystem_avail_bytes with the following labels:
1. job
1. instance
1. fstype
1. device
1. node_network_receive_bytes_total with the following labels:
1. job
1. instance
1. device
1. node_network_transmit_bytes_total with the following labels:
1. job
1. instance
1. device

##### Prometheus rules for the node exporter

1. instance:node_cpu_utilisation:rate1m from the following metrics:
1. node_cpu_seconds_total with the following labels:
1. job
1. mode
1. instance:node_num_cpu:sum from the following metrics:
1. node_cpu_seconds_total with the following labels:
1. job
1. instance:node_load1_per_cpu:ratio from the following metrics:
1. node_load1 with the following labels:
1. job
1. instance:node_memory_utilisation:ratio from the following metrics:
1. node_memory_MemTotal_bytes with the following labels:
1. job
1. node_memory_MemAvailable_bytes with the following labels:
1. job
1. instance:node_vmstat_pgmajfault:rate1m with the following metrics:
1. node_vmstat_pgmajfault with the following labels:
1. job
1. instance_device:node_disk_io_time_seconds:rate1m with the following metrics:
1. node_disk_io_time_seconds_total with the following labels:
1. job
1. device
1. instance_device:node_disk_io_time_weighted_seconds:rate1m with the following metrics:
1. node_disk_io_time_weighted_seconds with the following labels:
1. job
1. device
1. instance:node_network_receive_bytes_excluding_lo:rate1m with the following metrics:
1. node_network_receive_bytes_total with the following labels:
1. job
1. device
1. instance:node_network_transmit_bytes_excluding_lo:rate1m with the following metrics:
1. node_network_transmit_bytes_total with the following labels:
1. job
1. device
1. instance:node_network_receive_drop_excluding_lo:rate1m with the following metrics:
1. node_network_receive_drop_total with the following labels:
1. job
1. device
1. instance:node_network_transmit_drop_excluding_lo:rate1m with the following metrics:
1. node_network_transmit_drop_total with the following labels:
1. job
1. device

## Re-introducing dropped metrics/labels

If you need to add some metrics/labels back for a particular use case, comment out every `metricRelabelings:` section (except the one from the `prometheus-operator.yaml` file), determine which metrics/labels you want to add back (e.g. by using the explorer in Grafana), and then re-edit the appropriate `metricRelabelings:` sections to account for the un-dropped metrics/labels.

## Prometheus Analysis

### Go Pprof

To analyse the memory allocations of prometheus, run `kubectl port-forward prometheus-prometheus-0 9090:9090`, and then run `go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap`. Once you are in the interactive pprof shell, you can run `top` or `dot` for a more detailed hierarchy of the memory usage.

### TSDB

To analyse the TSDB of prometheus, exec into the `prometheus-prometheus-0` pod, `cd` into `/tmp`, and run the following code-block:

```bash
wget https://github.com/prometheus/prometheus/releases/download/v1.7.3/prometheus-1.7.3.linux-amd64.tar.gz
tar -xzf prometheus-*
cd prometheus-*
./tsdb analyze /prometheus | less
```

*Useful link: https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality*

Or you can go to `localhost:9090` -> `Status` -> `TSDB Status`, but it's not as complete as running a binary analysis.
5 changes: 4 additions & 1 deletion docs/clusters/management/create.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,14 @@ tags: # <string>: <string> map of key/value pairs
# SSL certificate ARN (only necessary when using a custom domain)
ssl_certificate_arn:

# List of IAM policies to attach to your Cortex APIs
# list of IAM policies to attach to your Cortex APIs
iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"]

# primary CIDR block for the cluster's VPC
vpc_cidr: 192.168.0.0/16

# instance type for prometheus (use a larger instance for clusters exceeding 500 nodes)
prometheus_instance_type: "t3.medium"
```

The docker images used by the cluster can also be overridden. They can be configured by adding any of these keys to your cluster configuration file (default values are shown):
Expand Down
27 changes: 23 additions & 4 deletions manager/generate_eks.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def generate_eks(
cluster_config = yaml.safe_load(cluster_config_file)
region = cluster_config["region"]
name = cluster_config["cluster_name"]
prometheus_instance_type = cluster_config["prometheus_instance_type"]
ami_map = json.load(ami_json_file)[K8S_VERSION][region]

eks = {
Expand Down Expand Up @@ -309,16 +310,34 @@ def generate_eks(
"ami": get_ami(ami_map, "t3.medium"),
"name": "cx-operator",
"instanceType": "t3.medium",
"minSize": 2,
"maxSize": 2,
"desiredCapacity": 2,
"minSize": 1,
"maxSize": 25,
"desiredCapacity": 1,
"volumeType": "gp3",
"volumeSize": 20,
"volumeIOPS": 3000,
"volumeThroughput": 125,
"labels": {"operator": "true"},
}
operator_nodegroup = merge_override(operator_nodegroup, operator_settings)

prometheus_nodegroup = default_nodegroup(cluster_config)
prometheus_settings = {
"ami": get_ami(ami_map, prometheus_instance_type),
"name": "cx-prometheus",
"instanceType": prometheus_instance_type,
"minSize": 1,
"maxSize": 1,
"desiredCapacity": 1,
"volumeType": "gp3",
"volumeSize": 20,
"volumeIOPS": 3000,
"volumeThroughput": 125,
"labels": {"prometheus": "true"},
"taints": {"prometheus": "true:NoSchedule"},
}
prometheus_nodegroup = merge_override(prometheus_nodegroup, prometheus_settings)

worker_nodegroups = get_all_worker_nodegroups(ami_map, cluster_config)

nat_gateway = "Disable"
Expand All @@ -337,7 +356,7 @@ def generate_eks(
"tags": cluster_config["tags"],
},
"vpc": {"nat": {"gateway": nat_gateway}},
"nodeGroups": [operator_nodegroup] + worker_nodegroups,
"nodeGroups": [operator_nodegroup, prometheus_nodegroup] + worker_nodegroups,
"addons": [
{
"name": "vpc-cni",
Expand Down
Loading