diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 1616565b71..375f1ca0d8 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -770,11 +770,12 @@ func cmdInfo(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig, st infoInterface = infoResponse.ClusterConfig.Config } else { infoInterface = map[string]interface{}{ - "cluster_config": infoResponse.ClusterConfig.Config, - "cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata, - "node_infos": infoResponse.NodeInfos, - "endpoint_operator": operatorEndpoint, - "endpoint_api": apiEndpoint, + "cluster_config": infoResponse.ClusterConfig.Config, + "cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata, + "worker_node_infos": infoResponse.WorkerNodeInfos, + "operator_node_infos": infoResponse.OperatorNodeInfos, + "endpoint_operator": operatorEndpoint, + "endpoint_api": apiEndpoint, } } @@ -848,6 +849,8 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco eksPrice := aws.EKSPrices[clusterConfig.Region] operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 + prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price + prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24 nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price @@ -891,17 +894,20 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco totalNodeGroupsPrice += totalEBSPrice + totalInstancePrice } + operatorNodeGroupPrice := float64(len(infoResponse.OperatorNodeInfos)) * (operatorInstancePrice + operatorEBSPrice) + prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + var natTotalPrice float64 if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { natTotalPrice = natUnitPrice } else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway { natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones)) } - totalPrice := eksPrice + totalNodeGroupsPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + nlbPrice*2 + natTotalPrice + totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + nlbPrice*2 + natTotalPrice fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice)) - operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice - rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { @@ -919,11 +925,11 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco } func 
printInfoNodes(infoResponse *schema.InfoResponse) { - numAPIInstances := len(infoResponse.NodeInfos) + numAPIInstances := len(infoResponse.WorkerNodeInfos) var totalReplicas int var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool - for _, nodeInfo := range infoResponse.NodeInfos { + for _, nodeInfo := range infoResponse.WorkerNodeInfos { totalReplicas += nodeInfo.NumReplicas if nodeInfo.ComputeUserCapacity.GPU > 0 { doesClusterHaveGPUs = true @@ -946,7 +952,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { fmt.Printf(console.Bold("\nyour cluster has %d API %s running across %d %s%s\n"), totalReplicas, s.PluralS("replica", totalReplicas), numAPIInstances, s.PluralS("instance", numAPIInstances), pendingReplicasStr) - if len(infoResponse.NodeInfos) == 0 { + if len(infoResponse.WorkerNodeInfos) == 0 { return } @@ -963,7 +969,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { } var rows [][]interface{} - for _, nodeInfo := range infoResponse.NodeInfos { + for _, nodeInfo := range infoResponse.WorkerNodeInfos { lifecycle := "on-demand" if nodeInfo.IsSpot { lifecycle = "spot" diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go index edd6cd7732..1da6de2f7a 100644 --- a/cli/cmd/lib_cluster_config.go +++ b/cli/cmd/lib_cluster_config.go @@ -163,7 +163,9 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient *aws.Client, disallowPrompt bool) { eksPrice := aws.EKSPrices[clusterConfig.Region] operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price + prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 + prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24 nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price @@ -184,7 +186,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)}) ngNameToSpotInstancesUsed := map[string]int{} - fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + 2*nlbPrice + natTotalPrice + fixedPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice totalMinPrice := fixedPrice totalMaxPrice := fixedPrice for _, ng := range clusterConfig.NodeGroups { @@ -223,8 +225,10 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{workerInstanceStr, workerPriceStr}) } - operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice - rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)}) + operatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice + prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + rows = append(rows, []interface{}{"1 t3.medium instance (cortex system)", s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance 
(prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { diff --git a/dev/prometheus.md b/dev/prometheus.md new file mode 100644 index 0000000000..1cc801c37f --- /dev/null +++ b/dev/prometheus.md @@ -0,0 +1,211 @@ +# Metrics + +## Updating metrics + +When new metrics/labels/exporters are added to be scraped by prometheus, make sure the following list **is updated** as well to keep track of what metrics/labels are needed or not. + +The following is a list of metrics that are currently in use. + +#### Cortex metrics + +1. cortex_in_flight_requests with the following labels: + 1. api_name +1. cortex_async_request_count with the following labels: + 1. api_name + 1. api_kind + 1. status_code +1. cortex_async_queue_length with the following labels: + 1. api_name + 1. api_kind +1. cortex_async_latency_bucket with the following labels: + 1. api_name + 1. api_kind +1. cortex_batch_succeeded with the following labels: + 1. api_name +1. cortex_batch_failed with the following labels: + 1. api_name +1. cortex_time_per_batch_sum with the following labels: + 1. api_name +1. cortex_time_per_batch_count with the following labels: + 1. api_name + +#### Istio metrics + +1. istio_requests_total with the following labels: + 1. destination_service + 1. response_code +1. istio_request_duration_milliseconds_bucket with the following labels: + 1. destination_service + 1. le +1. istio_request_duration_milliseconds_sum with the following labels: + 1. destination_service +1. istio_request_duration_milliseconds_count with the following labels: + 1. destination_service + +#### Kubelet metrics +1. container_cpu_usage_seconds_total with the following labels: + 1. pod + 1. container + 1. name +1. container_memory_working_set_bytes with the following labels: + 1. pod + 1. name + 1. container + +#### Kube-state-metrics metrics + +1. kube_pod_container_resource_requests with the following labels: + 1. exported_pod + 1. resource + 1. exported_container (required for not dropping the values for each container of each pod) +1. kube_pod_info with the following labels: + 1. exported_pod +1. kube_deployment_status_replicas_available with the following labels: + 1. deployment +1. kube_job_status_active with the following labels: + 1. job_name + +#### DCGM metrics + +1. DCGM_FI_DEV_GPU_UTIL with the following labels: + 1. exported_pod +1. DCGM_FI_DEV_FB_USED with the following labels: + 1. exported_pod +1. DCGM_FI_DEV_FB_FREE with the following labels: + 1. exported_pod + +#### Node metrics + +1. node_cpu_seconds_total with the following labels: + 1. job + 1. mode + 1. instance + 1. cpu +1. node_load1 with the following labels: + 1. job + 1. instance +1. node_load5 with the following labels: + 1. job + 1. instance +1. node_load15 with the following labels: + 1. job + 1. instance +1. node_exporter_build_info with the following labels: + 1. job + 1. instance +1. node_memory_MemTotal_bytes with the following labels: + 1. job + 1. instance +1. node_memory_MemFree_bytes with the following labels: + 1. job + 1. instance +1. node_memory_Buffers_bytes with the following labels: + 1. job + 1. instance +1. node_memory_Cached_bytes with the following labels: + 1. job + 1. instance +1. node_memory_MemAvailable_bytes with the following labels: + 1. job + 1. instance +1. node_disk_read_bytes_total with the following labels: + 1. 
job + 1. instance + 1. device +1. node_disk_written_bytes_total with the following labels: + 1. job + 1. instance + 1. device +1. node_disk_io_time_seconds_total with the following labels: + 1. job + 1. instance + 1. device +1. node_filesystem_size_bytes with the following labels: + 1. job + 1. instance + 1. fstype + 1. mountpoint + 1. device +1. node_filesystem_avail_bytes with the following labels: + 1. job + 1. instance + 1. fstype + 1. device +1. node_network_receive_bytes_total with the following labels: + 1. job + 1. instance + 1. device +1. node_network_transmit_bytes_total with the following labels: + 1. job + 1. instance + 1. device + +##### Prometheus rules for the node exporter + +1. instance:node_cpu_utilisation:rate1m from the following metrics: + 1. node_cpu_seconds_total with the following labels: + 1. job + 1. mode +1. instance:node_num_cpu:sum from the following metrics: + 1. node_cpu_seconds_total with the following labels: + 1. job +1. instance:node_load1_per_cpu:ratio from the following metrics: + 1. node_load1 with the following labels: + 1. job +1. instance:node_memory_utilisation:ratio from the following metrics: + 1. node_memory_MemTotal_bytes with the following labels: + 1. job + 1. node_memory_MemAvailable_bytes with the following labels: + 1. job +1. instance:node_vmstat_pgmajfault:rate1m from the following metrics: + 1. node_vmstat_pgmajfault with the following labels: + 1. job +1. instance_device:node_disk_io_time_seconds:rate1m from the following metrics: + 1. node_disk_io_time_seconds_total with the following labels: + 1. job + 1. device +1. instance_device:node_disk_io_time_weighted_seconds:rate1m from the following metrics: + 1. node_disk_io_time_weighted_seconds_total with the following labels: + 1. job + 1. device +1. instance:node_network_receive_bytes_excluding_lo:rate1m from the following metrics: + 1. node_network_receive_bytes_total with the following labels: + 1. job + 1. device +1. instance:node_network_transmit_bytes_excluding_lo:rate1m from the following metrics: + 1. node_network_transmit_bytes_total with the following labels: + 1. job + 1. device +1. instance:node_network_receive_drop_excluding_lo:rate1m from the following metrics: + 1. node_network_receive_drop_total with the following labels: + 1. job + 1. device +1. instance:node_network_transmit_drop_excluding_lo:rate1m from the following metrics: + 1. node_network_transmit_drop_total with the following labels: + 1. job + 1. device + +## Re-introducing dropped metrics/labels + +If you need to add some metrics/labels back for a particular use case, comment out every `metricRelabelings:` section (except the one from the `prometheus-operator.yaml` file), determine which metrics/labels you want to add back (e.g. by using the explorer in Grafana), and then re-edit the appropriate `metricRelabelings:` sections so that they no longer drop the metrics/labels you need. + +## Prometheus Analysis + +### Go Pprof + +To analyse the memory allocations of prometheus, run `kubectl port-forward prometheus-prometheus-0 9090:9090`, and then run `go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap`. Once you get the interactive pprof prompt, you can run `top` or `dot` for a more detailed breakdown of the memory usage.
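+For example, the whole workflow as one copy-pastable snippet (a minimal sketch, assuming the default `prometheus-prometheus-0` pod name and that local port 9090 is free):
+
+```bash
+# forward the prometheus port in the background
+kubectl port-forward prometheus-prometheus-0 9090:9090 &
+
+# fetch the in-use heap profile and open the interactive pprof prompt
+go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap
+
+# inside the pprof prompt:
+#   top   # largest in-use allocations
+#   dot   # graphviz graph of the allocation hierarchy
+```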
+ +### TSDB + +To analyse the TSDB of prometheus, exec into the `prometheus-prometheus-0` pod, `cd` into `/tmp`, and run the following code-block: + +```bash +wget https://github.com/prometheus/prometheus/releases/download/v1.7.3/prometheus-1.7.3.linux-amd64.tar.gz +tar -xzf prometheus-* +cd prometheus-* +./tsdb analyze /prometheus | less +``` + +*Useful link: https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality* + +Or you can go to `localhost:9090` -> `Status` -> `TSDB Status`, but it's not as complete as running a binary analysis. diff --git a/docs/clusters/management/create.md b/docs/clusters/management/create.md index a3abecdd85..b65e2dfa84 100644 --- a/docs/clusters/management/create.md +++ b/docs/clusters/management/create.md @@ -86,11 +86,14 @@ tags: # : map of key/value pairs # SSL certificate ARN (only necessary when using a custom domain) ssl_certificate_arn: -# List of IAM policies to attach to your Cortex APIs +# list of IAM policies to attach to your Cortex APIs iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"] # primary CIDR block for the cluster's VPC vpc_cidr: 192.168.0.0/16 + +# instance type for prometheus (use a larger instance for clusters exceeding 500 nodes) +prometheus_instance_type: "t3.medium" ``` The docker images used by the cluster can also be overridden. They can be configured by adding any of these keys to your cluster configuration file (default values are shown): diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 38d513f168..aa22c9a9a9 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -273,6 +273,7 @@ def generate_eks( cluster_config = yaml.safe_load(cluster_config_file) region = cluster_config["region"] name = cluster_config["cluster_name"] + prometheus_instance_type = cluster_config["prometheus_instance_type"] ami_map = json.load(ami_json_file)[K8S_VERSION][region] eks = { @@ -309,16 +310,34 @@ def generate_eks( "ami": get_ami(ami_map, "t3.medium"), "name": "cx-operator", "instanceType": "t3.medium", - "minSize": 2, - "maxSize": 2, - "desiredCapacity": 2, + "minSize": 1, + "maxSize": 25, + "desiredCapacity": 1, "volumeType": "gp3", "volumeSize": 20, "volumeIOPS": 3000, "volumeThroughput": 125, + "labels": {"operator": "true"}, } operator_nodegroup = merge_override(operator_nodegroup, operator_settings) + prometheus_nodegroup = default_nodegroup(cluster_config) + prometheus_settings = { + "ami": get_ami(ami_map, prometheus_instance_type), + "name": "cx-prometheus", + "instanceType": prometheus_instance_type, + "minSize": 1, + "maxSize": 1, + "desiredCapacity": 1, + "volumeType": "gp3", + "volumeSize": 20, + "volumeIOPS": 3000, + "volumeThroughput": 125, + "labels": {"prometheus": "true"}, + "taints": {"prometheus": "true:NoSchedule"}, + } + prometheus_nodegroup = merge_override(prometheus_nodegroup, prometheus_settings) + worker_nodegroups = get_all_worker_nodegroups(ami_map, cluster_config) nat_gateway = "Disable" @@ -337,7 +356,7 @@ def generate_eks( "tags": cluster_config["tags"], }, "vpc": {"nat": {"gateway": nat_gateway}}, - "nodeGroups": [operator_nodegroup] + worker_nodegroups, + "nodeGroups": [operator_nodegroup, prometheus_nodegroup] + worker_nodegroups, "addons": [ { "name": "vpc-cni", diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2 index 3c975e3890..0fd3bb918e 100644 --- a/manager/manifests/cluster-autoscaler.yaml.j2 +++ b/manager/manifests/cluster-autoscaler.yaml.j2 @@ -181,11 +181,11 @@ spec: name: 
cluster-autoscaler resources: limits: - cpu: 100m - memory: 300Mi + cpu: 300m + memory: 1Gi requests: cpu: 100m - memory: 300Mi + memory: 200Mi command: - ./cluster-autoscaler - --v=4 diff --git a/manager/manifests/event-exporter.yaml b/manager/manifests/event-exporter.yaml index ae0957d9aa..ab4847c4a7 100644 --- a/manager/manifests/event-exporter.yaml +++ b/manager/manifests/event-exporter.yaml @@ -82,6 +82,10 @@ spec: volumeMounts: - mountPath: /data name: event-exporter-config + resources: + requests: + cpu: 20m + memory: 50Mi volumes: - name: event-exporter-config configMap: diff --git a/manager/manifests/fluent-bit.yaml.j2 b/manager/manifests/fluent-bit.yaml.j2 index a860659bc7..a2e1140f2c 100644 --- a/manager/manifests/fluent-bit.yaml.j2 +++ b/manager/manifests/fluent-bit.yaml.j2 @@ -249,3 +249,6 @@ spec: - key: workload operator: Exists effect: NoSchedule + - key: prometheus + operator: Exists + effect: NoSchedule diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml index 91f10b5a13..83a5b73d32 100644 --- a/manager/manifests/grafana/grafana.yaml +++ b/manager/manifests/grafana/grafana.yaml @@ -173,6 +173,12 @@ spec: - name: grafana-dashboard-nodes configMap: name: grafana-dashboard-nodes + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule affinity: podAffinity: preferredDuringSchedulingIgnoredDuringExecution: diff --git a/manager/manifests/inferentia.yaml b/manager/manifests/inferentia.yaml index eb6a7db974..51331f6715 100644 --- a/manager/manifests/inferentia.yaml +++ b/manager/manifests/inferentia.yaml @@ -140,7 +140,7 @@ spec: value: "12345" resources: requests: - cpu: 100m + cpu: 50m memory: 100Mi --- diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index 9df2d198a5..5b4316ca8e 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -25,8 +25,8 @@ spec: k8s: resources: requests: - cpu: 200m # default is 500m - memory: 1.75Gi # default is 2048Mi == 2Gi + cpu: 100m # default is 500m + memory: 200Mi # default is 2048Mi == 2Gi cni: enabled: false ingressGateways: @@ -72,7 +72,7 @@ spec: cpu: 100m memory: 128Mi limits: - cpu: 2000m + cpu: 1000m memory: 1024Mi replicaCount: 1 hpaSpec: @@ -128,20 +128,24 @@ spec: targetPort: 15443 resources: requests: - cpu: 200m + cpu: 512m memory: 128Mi limits: - cpu: 2000m + cpu: 1500m memory: 1024Mi replicaCount: 1 hpaSpec: minReplicas: 1 - maxReplicas: 1 # edit autoscaleEnabled in values if increasing this + maxReplicas: 100 # edit autoscaleEnabled in values if increasing this metrics: - type: Resource resource: name: cpu - targetAverageUtilization: 80 + targetAverageUtilization: 90 + - type: Resource + resource: + name: memory + targetAverageUtilization: 90 scaleTargetRef: apiVersion: apps/v1 kind: Deployment @@ -159,7 +163,7 @@ spec: gateways: istio-ingressgateway: runAsRoot: true - autoscaleEnabled: false + autoscaleEnabled: true secretVolumes: - name: customgateway-certs secretName: istio-customgateway-certs diff --git a/manager/manifests/metrics-server.yaml b/manager/manifests/metrics-server.yaml index bdaad365d9..6e4e0de8bb 100644 --- a/manager/manifests/metrics-server.yaml +++ b/manager/manifests/metrics-server.yaml @@ -174,8 +174,11 @@ spec: periodSeconds: 10 resources: requests: - cpu: 100m - memory: 200Mi + cpu: 50m + memory: 100Mi + limits: + cpu: 200m + memory: 500Mi securityContext: readOnlyRootFilesystem: true runAsNonRoot: true diff --git a/manager/manifests/operator.yaml.j2 
b/manager/manifests/operator.yaml.j2 index c5501cb1d8..3ccd8eea76 100644 --- a/manager/manifests/operator.yaml.j2 +++ b/manager/manifests/operator.yaml.j2 @@ -58,10 +58,10 @@ spec: imagePullPolicy: Always resources: requests: - cpu: 200m + cpu: 100m memory: 128Mi limits: - cpu: 2000m + cpu: 1500m memory: 1024Mi ports: - containerPort: 8888 diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index 98ad1c0006..8b37d969dd 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -122,6 +122,16 @@ spec: path: /metrics scheme: http interval: 15s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "DCGM_FI_DEV_(\ + GPU_UTIL|\ + FB_USED|\ + FB_FREE\ + )" + - action: labelkeep + regex: (__name__|exported_pod) namespaceSelector: any: true selector: diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index d98bfb13c9..ba5165ff6e 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -234,6 +234,12 @@ spec: port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule --- apiVersion: monitoring.coreos.com/v1 kind: PodMonitor @@ -250,6 +256,17 @@ spec: scheme: http path: /metrics interval: 30s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "kube_(\ + pod_container_resource_requests|\ + pod_info|\ + deployment_status_replicas_available|\ + job_status_active\ + )" + - action: labelkeep + regex: (__name__|exported_pod|exported_container|job_name|resource) namespaceSelector: any: true selector: diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml index 3a8ff44f3b..8982706c42 100644 --- a/manager/manifests/prometheus-kubelet-exporter.yaml +++ b/manager/manifests/prometheus-kubelet-exporter.yaml @@ -27,37 +27,7 @@ spec: interval: 30s metricRelabelings: - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: 
reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processo
r_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) - sourceLabels: - - __name__ + sourceLabels: [__name__] port: https-metrics relabelings: - sourceLabels: @@ -71,10 +41,14 @@ spec: honorTimestamps: false interval: 30s metricRelabelings: - - action: drop - regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) - sourceLabels: - - __name__ + - action: keep + sourceLabels: [__name__] + regex: "container_(\ + cpu_usage_seconds_total|\ + memory_working_set_bytes\ + )" + - action: labelkeep + regex: (__name__|pod|container|name) path: /metrics/cadvisor port: https-metrics relabelings: @@ -93,6 +67,9 @@ spec: - sourceLabels: - __metrics_path__ targetLabel: metrics_path + metricRelabelings: + - action: drop + sourceLabels: [__name__] scheme: https tlsConfig: insecureSkipVerify: true diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index fa9aefe277..b0d76fbc34 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -30,6 +30,12 @@ metadata: spec: image: {{ config['image_prometheus'] }} serviceAccountName: prometheus + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule podMonitorSelector: matchExpressions: - key: "monitoring.cortex.dev" @@ -160,6 +166,17 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "istio_(\ + requests_total|\ + request_duration_milliseconds_bucket|\ + request_duration_milliseconds_sum|\ + request_duration_milliseconds_count\ + )" + - action: labelkeep + regex: (__name__|destination_service|response_code|le) --- @@ -209,6 +226,10 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" --- @@ -259,6 +280,10 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" --- @@ -276,6 +301,10 @@ spec: scheme: http path: /metrics interval: 20s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" namespaceSelector: any: true selector: @@ -298,6 +327,10 @@ spec: scheme: http path: /metrics interval: 10s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" namespaceSelector: any: true selector: diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index d6f97170a1..9c3a483f81 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -115,7 +115,7 @@ spec: cpu: 250m 
memory: 180Mi requests: - cpu: 100m + cpu: 40m memory: 180Mi volumeMounts: - mountPath: /host/sys @@ -194,6 +194,34 @@ spec: sourceLabels: - __meta_kubernetes_pod_node_name targetLabel: instance + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "node_(\ + cpu_seconds_total|\ + load1|\ + load5|\ + load15|\ + exporter_build_info|\ + memory_MemTotal_bytes|\ + memory_MemFree_bytes|\ + memory_Buffers_bytes|\ + memory_Cached_bytes|\ + memory_MemAvailable_bytes|\ + disk_read_bytes_total|\ + disk_written_bytes_total|\ + disk_io_time_seconds_total|\ + disk_io_time_weighted_seconds_total|\ + filesystem_size_bytes|\ + filesystem_avail_bytes|\ + network_receive_bytes_total|\ + network_transmit_bytes_total|\ + network_receive_drop_total|\ + network_transmit_drop_total|\ + vmstat_pgmajfault\ + )" + - action: labelkeep + regex: (__name__|instance|job|device|fstype|mountpoint|mode) scheme: https tlsConfig: insecureSkipVerify: true diff --git a/manager/manifests/prometheus-operator.yaml b/manager/manifests/prometheus-operator.yaml index ddeff6e1c9..3b7b558318 100644 --- a/manager/manifests/prometheus-operator.yaml +++ b/manager/manifests/prometheus-operator.yaml @@ -14199,6 +14199,11 @@ spec: allowPrivilegeEscalation: false nodeSelector: kubernetes.io/os: linux + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manager/manifests/prometheus-statsd-exporter.yaml b/manager/manifests/prometheus-statsd-exporter.yaml index b96a700ea6..ea58db52d8 100644 --- a/manager/manifests/prometheus-statsd-exporter.yaml +++ b/manager/manifests/prometheus-statsd-exporter.yaml @@ -74,6 +74,12 @@ spec: volumeMounts: - name: statsd-mapping-config mountPath: /etc/prometheus-statsd-exporter + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule volumes: - name: statsd-mapping-config configMap: diff --git a/pkg/crds/config/manager/manager.yaml b/pkg/crds/config/manager/manager.yaml index c1a9c7d25c..48fb8ad0e1 100644 --- a/pkg/crds/config/manager/manager.yaml +++ b/pkg/crds/config/manager/manager.yaml @@ -46,11 +46,11 @@ spec: periodSeconds: 10 resources: limits: - cpu: 100m - memory: 30Mi + cpu: 300m + memory: 100Mi requests: cpu: 100m - memory: 20Mi + memory: 80Mi volumeMounts: - mountPath: /mnt/cluster.yaml name: cluster-config diff --git a/pkg/lib/aws/ec2.go b/pkg/lib/aws/ec2.go index 338a719c91..66b2f94fb6 100644 --- a/pkg/lib/aws/ec2.go +++ b/pkg/lib/aws/ec2.go @@ -137,6 +137,46 @@ func IsAMDGPUInstance(instanceType string) (bool, error) { return false, nil } +func IsNvidiaGPUInstance(instanceType string) (bool, error) { + parsedType, err := ParseInstanceType(instanceType) + if err != nil { + return false, err + } + + if !_gpuInstanceFamilies.Has(parsedType.Family) { + return false, nil + } + + if !parsedType.Capabilities.Has("a") { + return true, nil + } + + return false, nil +} + +func IsGPUInstance(instanceType string) (bool, error) { + isAMDGPU, err := IsAMDGPUInstance(instanceType) + if err != nil { + return false, err + } + + isNvidiaGPU, err := IsNvidiaGPUInstance(instanceType) + if err != nil { + return false, err + } + + return isAMDGPU || isNvidiaGPU, nil +} + +func IsInferentiaInstance(instanceType string) (bool, error) { + parsedType, err := ParseInstanceType(instanceType) + if err != nil { + return false, err + } + + return parsedType.Family == "inf", nil +} + func IsMacInstance(instanceType string) (bool, error) { 
parsedType, err := ParseInstanceType(instanceType) if err != nil { diff --git a/pkg/lib/aws/servicequotas.go b/pkg/lib/aws/servicequotas.go index ef99e1787b..d1bc45b867 100644 --- a/pkg/lib/aws/servicequotas.go +++ b/pkg/lib/aws/servicequotas.go @@ -328,9 +328,9 @@ func requiredRulesForNodeGroupSecurityGroup(numAZs, whitelistLength int) int { } func requiredRulesForControlPlaneSecurityGroup(numNodeGroups int) int { - // +1 for the operator node group + // +2 for the operator and prometheus node groups // this is the number of outbound rules (there are half as many inbound rules, so that is not the limiting factor) - return 2 * (numNodeGroups + 1) + return 2 * (numNodeGroups + 2) } func requiredSecurityGroups(numNodeGroups int, clusterAlreadyExists bool) int { diff --git a/pkg/operator/endpoints/info.go b/pkg/operator/endpoints/info.go index b83a6374e0..c210e74fe7 100644 --- a/pkg/operator/endpoints/info.go +++ b/pkg/operator/endpoints/info.go @@ -31,7 +31,13 @@ import ( ) func Info(w http.ResponseWriter, r *http.Request) { - nodeInfos, numPendingReplicas, err := getNodeInfos() + workerNodeInfos, numPendingReplicas, err := getWorkerNodeInfos() + if err != nil { + respondError(w, r, err) + return + } + + operatorNodeInfos, err := getOperatorNodeInfos() if err != nil { respondError(w, r, err) return @@ -44,13 +50,14 @@ func Info(w http.ResponseWriter, r *http.Request) { response := schema.InfoResponse{ ClusterConfig: fullClusterConfig, - NodeInfos: nodeInfos, + WorkerNodeInfos: workerNodeInfos, + OperatorNodeInfos: operatorNodeInfos, NumPendingReplicas: numPendingReplicas, } respondJSON(w, r, response) } -func getNodeInfos() ([]schema.NodeInfo, int, error) { +func getWorkerNodeInfos() ([]schema.WorkerNodeInfo, int, error) { pods, err := config.K8sAllNamspaces.ListPods(nil) if err != nil { return nil, 0, err @@ -61,8 +68,8 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { return nil, 0, err } - nodeInfoMap := make(map[string]*schema.NodeInfo, len(nodes)) // node name -> info - spotPriceCache := make(map[string]float64) // instance type -> spot price + nodeInfoMap := make(map[string]*schema.WorkerNodeInfo, len(nodes)) // node name -> info + spotPriceCache := make(map[string]float64) // instance type -> spot price for i := range nodes { node := nodes[i] @@ -86,12 +93,14 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { } } - nodeInfoMap[node.Name] = &schema.NodeInfo{ + nodeInfoMap[node.Name] = &schema.WorkerNodeInfo{ + NodeInfo: schema.NodeInfo{ + NodeGroupName: nodeGroupName, + InstanceType: instanceType, + IsSpot: isSpot, + Price: price, + }, Name: node.Name, - NodeGroupName: nodeGroupName, - InstanceType: instanceType, - IsSpot: isSpot, - Price: price, NumReplicas: 0, // will be added to below ComputeUserCapacity: nodeComputeAllocatable(&node), // will be subtracted from below ComputeAvailable: nodeComputeAllocatable(&node), // will be subtracted from below @@ -160,7 +169,7 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { sort.Strings(nodeNames) - nodeInfos := make([]schema.NodeInfo, len(nodeNames)) + nodeInfos := make([]schema.WorkerNodeInfo, len(nodeNames)) for i, nodeName := range nodeNames { nodeInfos[i] = *nodeInfoMap[nodeName] } @@ -179,3 +188,41 @@ func nodeComputeAllocatable(node *kcore.Node) userconfig.Compute { Inf: infQty.Value(), } } + +func getOperatorNodeInfos() ([]schema.NodeInfo, error) { + nodes, err := config.K8sAllNamspaces.ListNodesByLabel("operator", "true") + if err != nil { + return nil, err + } + + nodeInfoMap := make(map[string]*schema.NodeInfo, 
len(nodes)) // node name -> info + + for i := range nodes { + node := nodes[i] + + instanceType := node.Labels["beta.kubernetes.io/instance-type"] + nodeGroupName := node.Labels["alpha.eksctl.io/nodegroup-name"] + + price := aws.InstanceMetadatas[config.ClusterConfig.Region][instanceType].Price + + nodeInfoMap[node.Name] = &schema.NodeInfo{ + NodeGroupName: nodeGroupName, + InstanceType: instanceType, + Price: price, + } + } + + nodeNames := make([]string, 0, len(nodeInfoMap)) + for nodeName := range nodeInfoMap { + nodeNames = append(nodeNames, nodeName) + } + + sort.Strings(nodeNames) + + nodeInfos := make([]schema.NodeInfo, len(nodeNames)) + for i, nodeName := range nodeNames { + nodeInfos[i] = *nodeInfoMap[nodeName] + } + + return nodeInfos, nil +} diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 18988fa277..d86073e4e7 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -26,16 +26,14 @@ import ( type InfoResponse struct { ClusterConfig clusterconfig.InternalConfig `json:"cluster_config" yaml:"cluster_config"` - NodeInfos []NodeInfo `json:"node_infos" yaml:"node_infos"` + WorkerNodeInfos []WorkerNodeInfo `json:"worker_node_infos" yaml:"worker_node_infos"` + OperatorNodeInfos []NodeInfo `json:"operator_node_infos" yaml:"operator_node_infos"` NumPendingReplicas int `json:"num_pending_replicas" yaml:"num_pending_replicas"` } -type NodeInfo struct { +type WorkerNodeInfo struct { + NodeInfo Name string `json:"name" yaml:"name"` - NodeGroupName string `json:"nodegroup_name" yaml:"nodegroup_name"` - InstanceType string `json:"instance_type" yaml:"instance_type"` - IsSpot bool `json:"is_spot" yaml:"is_spot"` - Price float64 `json:"price" yaml:"price"` NumReplicas int `json:"num_replicas" yaml:"num_replicas"` NumAsyncGatewayReplicas int `json:"num_async_gateway_replicas" yaml:"num_async_gateway_replicas"` NumEnqueuerReplicas int `json:"num_enqueuer_replicas" yaml:"num_enqueuer_replicas"` @@ -44,6 +42,13 @@ type NodeInfo struct { ComputeUserRequested userconfig.Compute `json:"compute_user_requested" yaml:"compute_user_requested"` // total resources requested by user on a node } +type NodeInfo struct { + NodeGroupName string `json:"nodegroup_name" yaml:"nodegroup_name"` + InstanceType string `json:"instance_type" yaml:"instance_type"` + IsSpot bool `json:"is_spot" yaml:"is_spot"` + Price float64 `json:"price" yaml:"price"` +} + type DeployResult struct { API *APIResponse `json:"api"` Message string `json:"message"` @@ -97,9 +102,9 @@ type APIVersion struct { type VerifyCortexResponse struct{} -func (ir InfoResponse) GetNodesWithNodeGroupName(ngName string) []NodeInfo { - nodesInfo := []NodeInfo{} - for _, nodeInfo := range ir.NodeInfos { +func (ir InfoResponse) GetNodesWithNodeGroupName(ngName string) []WorkerNodeInfo { + nodesInfo := []WorkerNodeInfo{} + for _, nodeInfo := range ir.WorkerNodeInfos { if nodeInfo.NodeGroupName == ngName { nodesInfo = append(nodesInfo, nodeInfo) } diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go index 6c56ffd485..c8d0d13124 100644 --- a/pkg/types/clusterconfig/cluster_config.go +++ b/pkg/types/clusterconfig/cluster_config.go @@ -56,6 +56,8 @@ const ( ) var ( + _operatorNodeGroupInstanceType = "t3.medium" + _maxNodeGroupLengthWithPrefix = 32 _maxNodeGroupLength = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws- _maxInstancePools = 20 @@ -84,8 +86,9 @@ type CoreConfig struct { IstioNamespace string `json:"istio_namespace" yaml:"istio_namespace"` // 
User-specifiable fields - ClusterName string `json:"cluster_name" yaml:"cluster_name"` - Region string `json:"region" yaml:"region"` + ClusterName string `json:"cluster_name" yaml:"cluster_name"` + Region string `json:"region" yaml:"region"` + PrometheusInstanceType string `json:"prometheus_instance_type" yaml:"prometheus_instance_type"` // User-specifiable fields ImageOperator string `json:"image_operator" yaml:"image_operator"` @@ -333,6 +336,14 @@ var CoreConfigStructFieldValidations = []*cr.StructFieldValidation{ Validator: RegionValidator, }, }, + { + StructField: "PrometheusInstanceType", + StringValidation: &cr.StringValidation{ + MinLength: 1, + Default: "t3.medium", + Validator: validatePrometheusInstanceType, + }, + }, { StructField: "Telemetry", BoolValidation: &cr.BoolValidation{ @@ -906,7 +917,16 @@ func (cc *Config) validate(awsClient *aws.Client) error { } ngNames := []string{} - instances := []aws.InstanceTypeRequests{} + instances := []aws.InstanceTypeRequests{ + { + InstanceType: _operatorNodeGroupInstanceType, + RequiredOnDemandInstances: 1, + }, + { + InstanceType: cc.PrometheusInstanceType, + RequiredOnDemandInstances: 1, + }, + } for _, nodeGroup := range cc.NodeGroups { if !slices.HasString(ngNames, nodeGroup.Name) { ngNames = append(ngNames, nodeGroup.Name) @@ -1494,6 +1514,31 @@ func validateInstanceType(instanceType string) (string, error) { return instanceType, nil } +func validatePrometheusInstanceType(instanceType string) (string, error) { + _, err := validateInstanceType(instanceType) + if err != nil { + return "", err + } + + isGPU, err := aws.IsGPUInstance(instanceType) + if err != nil { + return "", err + } + if isGPU { + return "", ErrorGPUInstancesNotSupported(instanceType) + } + + isInf, err := aws.IsInferentiaInstance(instanceType) + if err != nil { + return "", err + } + if isInf { + return "", ErrorInferentiaInstancesNotSupported(instanceType) + } + + return instanceType, nil +} + func validateInstanceDistribution(instances []string) ([]string, error) { for _, instance := range instances { _, err := validateInstanceType(instance) @@ -1635,6 +1680,7 @@ func (cc *CoreConfig) TelemetryEvent() map[string]interface{} { } event["region"] = cc.Region + event["prometheus_instance_type"] = cc.PrometheusInstanceType if !strings.HasPrefix(cc.ImageOperator, "quay.io/cortexlabs/") { event["image_operator._is_custom"] = true diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go index bbb7e6dc15..d277e8af17 100644 --- a/pkg/types/clusterconfig/config_key.go +++ b/pkg/types/clusterconfig/config_key.go @@ -22,6 +22,7 @@ const ( ClusterNameKey = "cluster_name" RegionKey = "region" + PrometheusInstanceTypeKey = "prometheus_instance_type" NodeGroupsKey = "node_groups" InstanceTypeKey = "instance_type" AcceleratorTypeKey = "accelerator_type" diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index 5c00328654..56d1be5115 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -46,6 +46,8 @@ const ( ErrSpotPriceGreaterThanMaxPrice = "clusterconfig.spot_price_greater_than_max_price" ErrInstanceTypeNotSupportedByCortex = "clusterconfig.instance_type_not_supported_by_cortex" ErrAMDGPUInstancesNotSupported = "clusterconfig.amd_gpu_instances_not_supported" + ErrGPUInstancesNotSupported = "clusterconfig.gpu_instance_not_supported" + ErrInferentiaInstancesNotSupported = "clusterconfig.inferentia_instances_not_supported" ErrMacInstancesNotSupported = 
"clusterconfig.mac_instances_not_supported" ErrAtLeastOneInstanceDistribution = "clusterconfig.at_least_one_instance_distribution" ErrNoCompatibleSpotInstanceFound = "clusterconfig.no_compatible_spot_instance_found" @@ -202,6 +204,20 @@ func ErrorAMDGPUInstancesNotSupported(instanceType string) error { }) } +func ErrorGPUInstancesNotSupported(instanceType string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrGPUInstancesNotSupported, + Message: fmt.Sprintf("GPU instances (including %s) are not supported", instanceType), + }) +} + +func ErrorInferentiaInstancesNotSupported(instanceType string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrInferentiaInstancesNotSupported, + Message: fmt.Sprintf("Inferentia instances (including %s) are not supported", instanceType), + }) +} + func ErrorMacInstancesNotSupported(instanceType string) error { return errors.WithStack(&errors.Error{ Kind: ErrMacInstancesNotSupported,