From cc1842f7d386ee34958f7d9a8c721d949deeb998 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 28 Jun 2021 23:46:36 +0300 Subject: [PATCH 01/25] Experiment with dropping metrics/labels --- .../manifests/prometheus-node-exporter.yaml | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index c1a6b0b4a0..cc32089078 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -191,6 +191,59 @@ spec: sourceLabels: - __meta_kubernetes_pod_node_name targetLabel: instance + metricRelabelings: + - action: keeplabel + sourceLabels: [__name__, instance, job] + regex: "node_(\ + cpu_seconds_total|\ + load1|\ + load5|\ + load15|\ + memory_MemTotal_bytes|\ + memory_MemFree_bytes|\ + memory_Buffers_bytes|\ + memory_Cached_bytes|\ + memory_MemAvailable_bytes|\ + disk_read_bytes_total|\ + disk_written_bytes_total|\ + disk_io_time_seconds_total|\ + disk_io_time_weighted_seconds|\ + filesystem_size_bytes|\ + filesystem_avail_bytes|\ + network_receive_bytes_total|\ + network_transmit_bytes_total|\ + network_receive_drop_total|\ + network_transmit_drop_total\ + );(.+)" + - action: keep + sourceLabels: [__name__, device] + regex: "node_(\ + disk_read_bytes_total|\ + disk_written_bytes_total|\ + disk_io_time_seconds_total|\ + disk_io_time_weighted_seconds|\ + filesystem_size_bytes|\ + filesystem_avail_bytes|\ + network_receive_bytes_total|\ + network_transmit_bytes_total|\ + network_receive_drop_total|\ + network_transmit_drop_total\ + );(.+)" + - action: keep + sourceLabels: [__name__, fstype] + regex: "node_(\ + node_filesystem_size_bytes|\ + node_filesystem_avail_bytes\ + );(.+)" + - action: keep + sourceLabels: [__name__, mountpoint] + regex: (node_filesystem_size_bytes);(.+) + - action: keep + sourceLabels: [__name__, mode] + regex: (node_cpu_seconds_total);(.+) + - action: keep + sourceLabels: [__name__] + regex: node_vmstat_pgmajfault scheme: https tlsConfig: insecureSkipVerify: true From ee7c31a77ab8e8341ad147ccde7d7b978ee8b2a9 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 28 Jun 2021 23:46:52 +0300 Subject: [PATCH 02/25] Fix the dropping of metrics/labels for the node exporter --- .../manifests/prometheus-node-exporter.yaml | 38 +++---------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index cc32089078..4a5ec20cbc 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -192,8 +192,8 @@ spec: - __meta_kubernetes_pod_node_name targetLabel: instance metricRelabelings: - - action: keeplabel - sourceLabels: [__name__, instance, job] + - action: keep + sourceLabels: [__name__] regex: "node_(\ cpu_seconds_total|\ load1|\ @@ -214,36 +214,10 @@ spec: network_transmit_bytes_total|\ network_receive_drop_total|\ network_transmit_drop_total\ - );(.+)" - - action: keep - sourceLabels: [__name__, device] - regex: "node_(\ - disk_read_bytes_total|\ - disk_written_bytes_total|\ - disk_io_time_seconds_total|\ - disk_io_time_weighted_seconds|\ - filesystem_size_bytes|\ - filesystem_avail_bytes|\ - network_receive_bytes_total|\ - network_transmit_bytes_total|\ - network_receive_drop_total|\ - network_transmit_drop_total\ - );(.+)" - - action: keep - sourceLabels: [__name__, fstype] - regex: "node_(\ - node_filesystem_size_bytes|\ - 
node_filesystem_avail_bytes\ - );(.+)" - - action: keep - sourceLabels: [__name__, mountpoint] - regex: (node_filesystem_size_bytes);(.+) - - action: keep - sourceLabels: [__name__, mode] - regex: (node_cpu_seconds_total);(.+) - - action: keep - sourceLabels: [__name__] - regex: node_vmstat_pgmajfault + node_vmstat_pgmajfault\ + )" + - action: labelkeep + regex: (__name__|instance|job|device|fstype|mountpoint|mode) scheme: https tlsConfig: insecureSkipVerify: true From 4ed459ed09fa0014a222a2066eefa5eae5a22773 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 29 Jun 2021 00:41:10 +0300 Subject: [PATCH 03/25] Drop unnecessary metrics/labels from kubelet --- .../prometheus-kubelet-exporter.yaml | 47 +++++-------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml index 8677500ef9..8982706c42 100644 --- a/manager/manifests/prometheus-kubelet-exporter.yaml +++ b/manager/manifests/prometheus-kubelet-exporter.yaml @@ -27,37 +27,7 @@ spec: interval: 30s metricRelabelings: - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: 
(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_late
ncy_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ + sourceLabels: [__name__] port: https-metrics relabelings: - sourceLabels: @@ -71,10 +41,14 @@ spec: honorTimestamps: false interval: 30s metricRelabelings: - - action: drop - regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) - sourceLabels: - - __name__ + - action: keep + sourceLabels: [__name__] + regex: "container_(\ + cpu_usage_seconds_total|\ + memory_working_set_bytes\ + )" + - action: labelkeep + regex: (__name__|pod|container|name) path: /metrics/cadvisor port: https-metrics relabelings: @@ -93,6 +67,9 @@ spec: - sourceLabels: - __metrics_path__ targetLabel: metrics_path + metricRelabelings: + - action: drop + sourceLabels: [__name__] scheme: https tlsConfig: insecureSkipVerify: true From 146c4e858b75601cbaedff3172e194d8355e754a Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 29 Jun 2021 00:52:21 +0300 Subject: [PATCH 04/25] Drop unnecessary kube-state-metrics metrics/labels --- manager/manifests/prometheus-kube-state-metrics.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index 135e5ffdf8..ffa7f92b31 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -270,6 +270,18 @@ spec: scheme: http path: /metrics interval: 30s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "kube_(\ + pod_container_resource_requests_cpu_cores|\ + pod_container_resource_requests_memory_bytes|\ + pod_info|\ + deployment_status_replicas_available|\ + job_status_active\ + )" + - action: labelkeep + regex: (__name__|exported_pod|job_name) namespaceSelector: any: true selector: From 2f2649743932ed6333599f5721b527252ee7a2a6 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 29 Jun 2021 00:55:30 +0300 Subject: [PATCH 05/25] Drop unnecessary metrics/labels from DCGM exporter --- manager/manifests/prometheus-dcgm-exporter.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index 3db31338ce..a025b1bc55 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -131,6 +131,16 @@ spec: path: /metrics scheme: http interval: 15s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "DCGM_FI_DEV_(\ + GPU_UTIL|\ + FB_USED|\ + FB_FREE\ + )" + - action: labelkeep + regex: (__name__|exported_pod) namespaceSelector: any: true selector: From 39a639ad17a525f8bd5e950bcbcf3e664c1cabba Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 29 Jun 2021 15:33:44 +0300 Subject: [PATCH 06/25] Remove unnecessary metrics/labels from istio --- manager/manifests/prometheus-monitoring.yaml.j2 | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index fa9aefe277..37e1b57da9 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -160,6 +160,18 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "istio_(\ + requests_total|\ + 
request_duration_milliseconds_bucket|\
+        request_duration_milliseconds_sum|\
+        request_duration_milliseconds_count\
+        )"
+      - action: labeldrop
+        sourceLabels: [__name__]
+        regex: (__name__|destination_service_name|response_code)
 
 ---
 

From d4739a08dad59a0e7e401ac669d447ddfeaa1422 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Tue, 29 Jun 2021 16:03:53 +0300
Subject: [PATCH 07/25] Change labeldrop to labelkeep

---
 manager/manifests/prometheus-monitoring.yaml.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2
index 37e1b57da9..3fc137b897 100644
--- a/manager/manifests/prometheus-monitoring.yaml.j2
+++ b/manager/manifests/prometheus-monitoring.yaml.j2
@@ -169,7 +169,7 @@ spec:
         request_duration_milliseconds_sum|\
         request_duration_milliseconds_count\
         )"
-      - action: labeldrop
+      - action: labelkeep
         sourceLabels: [__name__]
         regex: (__name__|destination_service_name|response_code)

From 488af33ded3d19d1caecea101f4869839e8a430e Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Tue, 29 Jun 2021 16:38:52 +0300
Subject: [PATCH 08/25] Add development docs

---
 dev/prometheus.md | 207 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 207 insertions(+)
 create mode 100644 dev/prometheus.md

diff --git a/dev/prometheus.md b/dev/prometheus.md
new file mode 100644
index 0000000000..7be8e0834e
--- /dev/null
+++ b/dev/prometheus.md
@@ -0,0 +1,207 @@
+# Metrics
+
+## Updating metrics
+
+When new metrics, labels, or exporters are added for Prometheus to scrape, make sure the following list **is updated** as well, to keep track of which metrics/labels are needed and which are not.
+
+The following is a list of metrics that are currently in use.
+
+#### Cortex metrics
+
+1. cortex_in_flight_requests with the following labels:
+    1. api_name
+1. cortex_async_request_count with the following labels:
+    1. api_name
+    1. api_kind
+    1. status_code
+1. cortex_async_queue_length with the following labels:
+    1. api_name
+    1. api_kind
+1. cortex_async_latency_bucket with the following labels:
+    1. api_name
+    1. api_kind
+1. cortex_batch_succeeded with the following labels:
+    1. api_name
+1. cortex_batch_failed with the following labels:
+    1. api_name
+1. cortex_time_per_batch_sum with the following labels:
+    1. api_name
+1. cortex_time_per_batch_count with the following labels:
+    1. api_name
+
+#### Istio metrics
+
+1. istio_requests_total with the following labels:
+    1. destination_service_name
+    1. response_code
+1. istio_request_duration_milliseconds_bucket with the following labels:
+    1. destination_service_name
+1. istio_request_duration_milliseconds_sum with the following labels:
+    1. destination_service_name
+1. istio_request_duration_milliseconds_count with the following labels:
+    1. destination_service_name
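+
+As an example of how these metrics and labels get consumed, a typical latency panel is driven by a histogram query along the lines of the sketch below (`my-api` is a hypothetical API name; note that such queries also depend on the histogram's `le` label):
+
+```
+histogram_quantile(
+  0.9,
+  sum by (le) (
+    rate(istio_request_duration_milliseconds_bucket{destination_service_name="my-api"}[1m])
+  )
+)
+```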
+
+#### Kubelet metrics
+
+1. container_cpu_usage_seconds_total with the following labels:
+    1. pod
+    1. container
+    1. name
+1. container_memory_working_set_bytes with the following labels:
+    1. pod
+    1. name
+    1. container
+
+#### Kube-state-metrics metrics
+
+1. kube_pod_container_resource_requests_cpu_cores with the following labels:
+    1. exported_pod
+1. kube_pod_container_resource_requests_memory_bytes with the following labels:
+    1. exported_pod
+1. kube_pod_info with the following labels:
+    1. exported_pod
+1. kube_deployment_status_replicas_available with the following labels:
+    1. deployment
+1. kube_job_status_active with the following labels:
+    1. job_name
+
+#### DCGM metrics
+
+1. DCGM_FI_DEV_GPU_UTIL with the following labels:
+    1. exported_pod
+1. DCGM_FI_DEV_FB_USED with the following labels:
+    1. exported_pod
+1. DCGM_FI_DEV_FB_FREE with the following labels:
+    1. exported_pod
+
+#### Node metrics
+
+1. node_cpu_seconds_total with the following labels:
+    1. job
+    1. mode
+    1. instance
+    1. cpu
+1. node_load1 with the following labels:
+    1. job
+    1. instance
+1. node_load5 with the following labels:
+    1. job
+    1. instance
+1. node_load15 with the following labels:
+    1. job
+    1. instance
+1. node_memory_MemTotal_bytes with the following labels:
+    1. job
+    1. instance
+1. node_memory_MemFree_bytes with the following labels:
+    1. job
+    1. instance
+1. node_memory_Buffers_bytes with the following labels:
+    1. job
+    1. instance
+1. node_memory_Cached_bytes with the following labels:
+    1. job
+    1. instance
+1. node_memory_MemAvailable_bytes with the following labels:
+    1. job
+    1. instance
+1. node_disk_read_bytes_total with the following labels:
+    1. job
+    1. instance
+    1. device
+1. node_disk_written_bytes_total with the following labels:
+    1. job
+    1. instance
+    1. device
+1. node_disk_io_time_seconds_total with the following labels:
+    1. job
+    1. instance
+    1. device
+1. node_filesystem_size_bytes with the following labels:
+    1. job
+    1. instance
+    1. fstype
+    1. mountpoint
+    1. device
+1. node_filesystem_avail_bytes with the following labels:
+    1. job
+    1. instance
+    1. fstype
+    1. device
+1. node_network_receive_bytes_total with the following labels:
+    1. job
+    1. instance
+    1. device
+1. node_network_transmit_bytes_total with the following labels:
+    1. job
+    1. instance
+    1. device
+
+##### Prometheus rules for the node exporter
+
+1. instance:node_cpu_utilisation:rate1m from the following metrics:
+    1. node_cpu_seconds_total with the following labels:
+        1. job
+        1. mode
+1. instance:node_num_cpu:sum from the following metrics:
+    1. node_cpu_seconds_total with the following labels:
+        1. job
+1. instance:node_load1_per_cpu:ratio from the following metrics:
+    1. node_load1 with the following labels:
+        1. job
+1. instance:node_memory_utilisation:ratio from the following metrics:
+    1. node_memory_MemTotal_bytes with the following labels:
+        1. job
+    1. node_memory_MemAvailable_bytes with the following labels:
+        1. job
+1. instance:node_vmstat_pgmajfault:rate1m from the following metrics:
+    1. node_vmstat_pgmajfault with the following labels:
+        1. job
+1. instance_device:node_disk_io_time_seconds:rate1m from the following metrics:
+    1. node_disk_io_time_seconds_total with the following labels:
+        1. job
+        1. device
+1. instance_device:node_disk_io_time_weighted_seconds:rate1m from the following metrics:
+    1. node_disk_io_time_weighted_seconds with the following labels:
+        1. job
+        1. device
+1. instance:node_network_receive_bytes_excluding_lo:rate1m from the following metrics:
+    1. node_network_receive_bytes_total with the following labels:
+        1. job
+        1. device
+1. instance:node_network_transmit_bytes_excluding_lo:rate1m from the following metrics:
+    1. node_network_transmit_bytes_total with the following labels:
+        1. job
+        1. device
+1. instance:node_network_receive_drop_excluding_lo:rate1m from the following metrics:
+    1. node_network_receive_drop_total with the following labels:
+        1. job
+        1. device
+1. instance:node_network_transmit_drop_excluding_lo:rate1m from the following metrics:
+    1. node_network_transmit_drop_total with the following labels:
+        1. job
+        1. device
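+
+To make the mapping between raw metrics and these rules concrete, the first rule above typically looks something like the sketch below in the node-exporter mixin (the exact expression and the `job` selector may differ in the deployed rule file):
+
+```yaml
+groups:
+  - name: node-exporter.rules
+    rules:
+      # fraction of time each instance's CPUs spent in non-idle modes over the last minute
+      - record: instance:node_cpu_utilisation:rate1m
+        expr: |
+          1 - avg without (cpu, mode) (
+            rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
+          )
+```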
+
+## Re-introducing dropped metrics/labels
+
+If you need to add some metrics/labels back for a particular use case, comment out every `metricRelabelings:` section (except the one from the `prometheus-operator.yaml` file), determine which metrics/labels you want to add back (i.e. by using the explorer from Grafana), and then re-edit the appropriate `metricRelabelings:` sections so that the metrics/labels in question are no longer dropped, as sketched below.
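+
+For instance, a minimal sketch of such an edit (`node_network_transmit_errs_total` is just a hypothetical metric to re-introduce, and the regex is shortened for brevity):
+
+```yaml
+metricRelabelings:
+  # keep only the whitelisted metric names; every other series is dropped
+  - action: keep
+    sourceLabels: [__name__]
+    regex: "node_(\
+      cpu_seconds_total|\
+      network_transmit_errs_total\
+      )"
+  # then drop all labels except the whitelisted ones
+  - action: labelkeep
+    regex: (__name__|instance|job|device|fstype|mountpoint|mode)
+```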
+
+## Prometheus Analysis
+
+### Go Pprof
+
+To analyse the memory allocations of prometheus, run `kubectl port-forward prometheus-prometheus-0 9090:9090`, and then run `go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap`. Once you get the interactive interpreter, you can run `top` or `dot` for a more detailed hierarchy of the memory usage.
+
+### TSDB
+
+To analyse the TSDB of prometheus, exec into the `prometheus-prometheus-0` pod, `cd` into `/tmp`, and run the following code-block:
+
+```bash
+wget https://github.com/prometheus/prometheus/releases/download/v2.28.0/prometheus-2.28.0.linux-amd64.tar.gz
+tar -xzf prometheus-*
+cd prometheus-*
+./promtool tsdb analyze /prometheus | less
+```
+
+*Useful link: https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality*
+
+Or you can go to `localhost:9090` -> `Status` -> `TSDB Status`, but it's not as complete as running the binary analysis.

From ccc80ce693f825d9a0511b73ac96ed8e6f3b00e4 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Tue, 29 Jun 2021 19:31:19 +0300
Subject: [PATCH 09/25] Fixes for node-exporter & prom monitoring

---
 manager/manifests/prometheus-monitoring.yaml.j2 | 1 -
 manager/manifests/prometheus-node-exporter.yaml | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2
index 3fc137b897..6ceff425cc 100644
--- a/manager/manifests/prometheus-monitoring.yaml.j2
+++ b/manager/manifests/prometheus-monitoring.yaml.j2
@@ -170,7 +170,6 @@ spec:
         request_duration_milliseconds_count\
         )"
       - action: labelkeep
-        sourceLabels: [__name__]
         regex: (__name__|destination_service_name|response_code)
 
 ---
diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml
index 4a5ec20cbc..6506572163 100644
--- a/manager/manifests/prometheus-node-exporter.yaml
+++ b/manager/manifests/prometheus-node-exporter.yaml
@@ -217,7 +217,7 @@ spec:
         node_vmstat_pgmajfault\
         )"
       - action: labelkeep
-         regex: (__name__|instance|job|device|fstype|mountpoint|mode)
+        regex: (__name__|instance|job|device|fstype|mountpoint|mode)
       scheme: https
       tlsConfig:
         insecureSkipVerify: true

From b382a1f6903e6bbe6e2c71a88bf77e8a5fcd5b9c Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Tue, 29 Jun 2021 20:01:30 +0300
Subject: [PATCH 10/25] Fixes to the nodes dashboard

---
 dev/prometheus.md                               | 3 +++
 manager/manifests/prometheus-node-exporter.yaml | 7 ++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/dev/prometheus.md b/dev/prometheus.md
index 7be8e0834e..a96ad04723 100644
--- a/dev/prometheus.md
+++ b/dev/prometheus.md
@@ -89,6 +89,9 @@
 1. node_load15 with the following labels:
     1. job
     1. instance
+1. node_exporter_build_info with the following labels:
+    1. job
+    1. instance
 1. node_memory_MemTotal_bytes with the following labels:
     1. job
     1. instance
diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml
index 6506572163..27fc75c3c8 100644
--- a/manager/manifests/prometheus-node-exporter.yaml
+++ b/manager/manifests/prometheus-node-exporter.yaml
@@ -199,6 +199,7 @@ spec:
         load1|\
         load5|\
         load15|\
+        exporter_build_info|\
         memory_MemTotal_bytes|\
         memory_MemFree_bytes|\
         memory_Buffers_bytes|\
@@ -207,14 +208,14 @@ spec:
         disk_read_bytes_total|\
         disk_written_bytes_total|\
         disk_io_time_seconds_total|\
-        disk_io_time_weighted_seconds|\
+        disk_io_time_weighted_seconds_total|\
         filesystem_size_bytes|\
         filesystem_avail_bytes|\
         network_receive_bytes_total|\
         network_transmit_bytes_total|\
         network_receive_drop_total|\
-        network_transmit_drop_total\
-        node_vmstat_pgmajfault\
+        network_transmit_drop_total|\
+        vmstat_pgmajfault\
         )"
       - action: labelkeep
         regex: (__name__|instance|job|device|fstype|mountpoint|mode)

From 6527b4093f3f494cfedba0bf9dd0c3b408a49140 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Wed, 30 Jun 2021 03:08:26 +0300
Subject: [PATCH 11/25] Add missing `le` label for `istio_request_duration_milliseconds_bucket` metric

---
 dev/prometheus.md                               | 1 +
 manager/manifests/prometheus-monitoring.yaml.j2 | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/prometheus.md b/dev/prometheus.md
index a96ad04723..33eb67937e 100644
--- a/dev/prometheus.md
+++ b/dev/prometheus.md
@@ -36,6 +36,7 @@
     1. response_code
 1. istio_request_duration_milliseconds_bucket with the following labels:
     1. destination_service_name
+    1. le
 1. istio_request_duration_milliseconds_sum with the following labels:
     1. destination_service_name
 1. istio_request_duration_milliseconds_count with the following labels:
     1. destination_service_name
diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2
index 6ceff425cc..6cf1b7c19b 100644
--- a/manager/manifests/prometheus-monitoring.yaml.j2
+++ b/manager/manifests/prometheus-monitoring.yaml.j2
@@ -170,7 +170,7 @@ spec:
         request_duration_milliseconds_count\
         )"
       - action: labelkeep
-        regex: (__name__|destination_service_name|response_code)
+        regex: (__name__|destination_service_name|response_code|le)
 
 ---

From 35c5bb7d3024bffb79d118f52ff3e47a8d2d4dd5 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Wed, 30 Jun 2021 16:26:31 +0300
Subject: [PATCH 12/25] Add required label for kube-state-metrics exporter

---
 dev/prometheus.md                                    | 2 ++
 manager/manifests/prometheus-kube-state-metrics.yaml | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/dev/prometheus.md b/dev/prometheus.md
index 33eb67937e..9330591c4b 100644
--- a/dev/prometheus.md
+++ b/dev/prometheus.md
@@ -56,8 +56,10 @@
 
 1. kube_pod_container_resource_requests_cpu_cores with the following labels:
     1. exported_pod
+    1. exported_container (required for not dropping the values for each container of each pod)
 1. kube_pod_container_resource_requests_memory_bytes with the following labels:
     1. exported_pod
+    1. exported_container (required for not dropping the values for each container of each pod)
 1. kube_pod_info with the following labels:
     1. exported_pod
 1. 
kube_deployment_status_replicas_available with the following labels: diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index ffa7f92b31..36e542ae22 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -281,7 +281,7 @@ spec: job_status_active\ )" - action: labelkeep - regex: (__name__|exported_pod|job_name) + regex: (__name__|exported_pod|exported_container|job_name) namespaceSelector: any: true selector: From 6ba99b9b644d3c505a13f9f0026361eee71fed19 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 30 Jun 2021 22:03:49 +0300 Subject: [PATCH 13/25] Fix batch grafana dashboard --- manager/manifests/grafana/grafana-dashboard-batch.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml index 565513d0c4..b13ad328af 100644 --- a/manager/manifests/grafana/grafana-dashboard-batch.yaml +++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml @@ -36,7 +36,8 @@ data: "editable": true, "gnetId": null, "graphTooltip": 0, - "iteration": 1617119656448, + "id": 4, + "iteration": 1625068140971, "links": [], "panels": [ { @@ -427,13 +428,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "count(kube_job_status_active{job_name=~\"$api_name.+\"})", + "expr": "count(kube_job_status_active{job_name=~\"$api_name.+\"} != 0)", "interval": "", "legendFormat": "Active Jobs", "refId": "Active Batches" }, { - "expr": "sum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_job_status_active{job_name=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Active Workers", From 416bed95f86b4a2979ed3ab2df0b6befcc4bac5f Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 16:35:10 +0300 Subject: [PATCH 14/25] Keep cortex_* metrics --- manager/manifests/prometheus-monitoring.yaml.j2 | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index 7b05b1cd2e..c25c084f1a 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -220,6 +220,10 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" --- @@ -270,6 +274,10 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" --- @@ -287,6 +295,10 @@ spec: scheme: http path: /metrics interval: 20s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" namespaceSelector: any: true selector: @@ -309,6 +321,10 @@ spec: scheme: http path: /metrics interval: 10s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" namespaceSelector: any: true selector: From f465bc111a66ee3661b3696d1c9754ff3c83c79c Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 17:14:25 +0300 Subject: [PATCH 15/25] Separate prometheus and operator workloads --- manager/generate_eks.py | 26 ++++++++++++++++--- manager/manifests/cluster-autoscaler.yaml.j2 | 6 ++--- manager/manifests/fluent-bit.yaml.j2 | 3 +++ manager/manifests/grafana/grafana.yaml | 6 +++++ 
manager/manifests/istio.yaml.j2 | 17 +++++++----- manager/manifests/operator.yaml.j2 | 4 +-- .../prometheus-kube-state-metrics.yaml | 6 +++++ .../manifests/prometheus-monitoring.yaml.j2 | 6 +++++ .../manifests/prometheus-node-exporter.yaml | 5 +++- manager/manifests/prometheus-operator.yaml | 5 ++++ .../manifests/prometheus-statsd-exporter.yaml | 6 +++++ pkg/crds/config/manager/manager.yaml | 8 +++--- 12 files changed, 77 insertions(+), 21 deletions(-) diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 38d513f168..1ab0384c1e 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -305,13 +305,14 @@ def generate_eks( return operator_nodegroup = default_nodegroup(cluster_config) + # TODO validate requests when clustering up operator_settings = { "ami": get_ami(ami_map, "t3.medium"), "name": "cx-operator", "instanceType": "t3.medium", - "minSize": 2, - "maxSize": 2, - "desiredCapacity": 2, + "minSize": 1, + "maxSize": 25, + "desiredCapacity": 1, "volumeType": "gp3", "volumeSize": 20, "volumeIOPS": 3000, @@ -319,6 +320,23 @@ def generate_eks( } operator_nodegroup = merge_override(operator_nodegroup, operator_settings) + prometheus_nodegroup = default_nodegroup(cluster_config) + prometheus_settings = { + "ami": get_ami(ami_map, "t3.xlarge"), + "name": "cx-prometheus", + "instanceType": "t3.xlarge", + "minSize": 1, + "maxSize": 1, + "desiredCapacity": 1, + "volumeType": "gp3", + "volumeSize": 20, + "volumeIOPS": 3000, + "volumeThroughput": 125, + "labels": {"prometheus": "true"}, + "taints": {"prometheus": "true:NoSchedule"}, + } + prometheus_nodegroup = merge_override(prometheus_nodegroup, prometheus_settings) + worker_nodegroups = get_all_worker_nodegroups(ami_map, cluster_config) nat_gateway = "Disable" @@ -337,7 +355,7 @@ def generate_eks( "tags": cluster_config["tags"], }, "vpc": {"nat": {"gateway": nat_gateway}}, - "nodeGroups": [operator_nodegroup] + worker_nodegroups, + "nodeGroups": [operator_nodegroup, prometheus_nodegroup] + worker_nodegroups, "addons": [ { "name": "vpc-cni", diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2 index e529e36918..6c9ad79205 100644 --- a/manager/manifests/cluster-autoscaler.yaml.j2 +++ b/manager/manifests/cluster-autoscaler.yaml.j2 @@ -169,11 +169,11 @@ spec: name: cluster-autoscaler resources: limits: - cpu: 100m - memory: 300Mi + cpu: 300m + memory: 1Gi requests: cpu: 100m - memory: 300Mi + memory: 200Mi command: - ./cluster-autoscaler - --v=4 diff --git a/manager/manifests/fluent-bit.yaml.j2 b/manager/manifests/fluent-bit.yaml.j2 index a860659bc7..a2e1140f2c 100644 --- a/manager/manifests/fluent-bit.yaml.j2 +++ b/manager/manifests/fluent-bit.yaml.j2 @@ -249,3 +249,6 @@ spec: - key: workload operator: Exists effect: NoSchedule + - key: prometheus + operator: Exists + effect: NoSchedule diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml index 91f10b5a13..83a5b73d32 100644 --- a/manager/manifests/grafana/grafana.yaml +++ b/manager/manifests/grafana/grafana.yaml @@ -173,6 +173,12 @@ spec: - name: grafana-dashboard-nodes configMap: name: grafana-dashboard-nodes + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule affinity: podAffinity: preferredDuringSchedulingIgnoredDuringExecution: diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index bf0a60af41..811bfbf1d2 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ 
-25,8 +25,8 @@ spec: k8s: resources: requests: - cpu: 200m # default is 500m - memory: 1.75Gi # default is 2048Mi == 2Gi + cpu: 100m # default is 500m + memory: 200Mi # default is 2048Mi == 2Gi cni: enabled: false ingressGateways: @@ -74,7 +74,7 @@ spec: cpu: 100m memory: 128Mi limits: - cpu: 2000m + cpu: 1000m memory: 1024Mi replicaCount: 1 hpaSpec: @@ -132,20 +132,23 @@ spec: targetPort: 15443 resources: requests: - cpu: 200m + cpu: 300m memory: 128Mi limits: - cpu: 2000m + cpu: 1500m memory: 1024Mi replicaCount: 1 hpaSpec: minReplicas: 1 - maxReplicas: 1 # edit autoscaleEnabled in values if increasing this + maxReplicas: 100 # edit autoscaleEnabled in values if increasing this metrics: - type: Resource resource: name: cpu - targetAverageUtilization: 80 + targetAverageUtilization: 90 + resource: + name: mem + targetAverageUtilization: 90 scaleTargetRef: apiVersion: apps/v1 kind: Deployment diff --git a/manager/manifests/operator.yaml.j2 b/manager/manifests/operator.yaml.j2 index c5501cb1d8..3ccd8eea76 100644 --- a/manager/manifests/operator.yaml.j2 +++ b/manager/manifests/operator.yaml.j2 @@ -58,10 +58,10 @@ spec: imagePullPolicy: Always resources: requests: - cpu: 200m + cpu: 100m memory: 128Mi limits: - cpu: 2000m + cpu: 1500m memory: 1024Mi ports: - containerPort: 8888 diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index 95c995ef48..ba5165ff6e 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -234,6 +234,12 @@ spec: port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule --- apiVersion: monitoring.coreos.com/v1 kind: PodMonitor diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index c25c084f1a..b0d76fbc34 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -30,6 +30,12 @@ metadata: spec: image: {{ config['image_prometheus'] }} serviceAccountName: prometheus + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule podMonitorSelector: matchExpressions: - key: "monitoring.cortex.dev" diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index b58e92a5d9..2ea1881eb7 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -153,12 +153,15 @@ spec: hostPID: true nodeSelector: kubernetes.io/os: linux + prometheus: "true" securityContext: runAsNonRoot: true runAsUser: 65534 serviceAccountName: node-exporter tolerations: - - operator: Exists + - key: prometheus + operator: Exists + effect: NoSchedule volumes: - hostPath: path: /sys diff --git a/manager/manifests/prometheus-operator.yaml b/manager/manifests/prometheus-operator.yaml index ddeff6e1c9..3b7b558318 100644 --- a/manager/manifests/prometheus-operator.yaml +++ b/manager/manifests/prometheus-operator.yaml @@ -14199,6 +14199,11 @@ spec: allowPrivilegeEscalation: false nodeSelector: kubernetes.io/os: linux + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manager/manifests/prometheus-statsd-exporter.yaml b/manager/manifests/prometheus-statsd-exporter.yaml index 
b96a700ea6..ea58db52d8 100644 --- a/manager/manifests/prometheus-statsd-exporter.yaml +++ b/manager/manifests/prometheus-statsd-exporter.yaml @@ -74,6 +74,12 @@ spec: volumeMounts: - name: statsd-mapping-config mountPath: /etc/prometheus-statsd-exporter + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule volumes: - name: statsd-mapping-config configMap: diff --git a/pkg/crds/config/manager/manager.yaml b/pkg/crds/config/manager/manager.yaml index c1a9c7d25c..75f03b40d3 100644 --- a/pkg/crds/config/manager/manager.yaml +++ b/pkg/crds/config/manager/manager.yaml @@ -46,11 +46,11 @@ spec: periodSeconds: 10 resources: limits: - cpu: 100m - memory: 30Mi + cpu: 200m + memory: 100Mi requests: - cpu: 100m - memory: 20Mi + cpu: 200m + memory: 80Mi volumeMounts: - mountPath: /mnt/cluster.yaml name: cluster-config From 4289fc6f5e660101c36877c72f90c38d8cd6da57 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 17:26:47 +0300 Subject: [PATCH 16/25] Validate operator/prometheus node group quotas --- manager/generate_eks.py | 1 - pkg/lib/aws/servicequotas.go | 4 ++-- pkg/types/clusterconfig/cluster_config.go | 16 +++++++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 1ab0384c1e..261a54d92b 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -305,7 +305,6 @@ def generate_eks( return operator_nodegroup = default_nodegroup(cluster_config) - # TODO validate requests when clustering up operator_settings = { "ami": get_ami(ami_map, "t3.medium"), "name": "cx-operator", diff --git a/pkg/lib/aws/servicequotas.go b/pkg/lib/aws/servicequotas.go index ef99e1787b..d1bc45b867 100644 --- a/pkg/lib/aws/servicequotas.go +++ b/pkg/lib/aws/servicequotas.go @@ -328,9 +328,9 @@ func requiredRulesForNodeGroupSecurityGroup(numAZs, whitelistLength int) int { } func requiredRulesForControlPlaneSecurityGroup(numNodeGroups int) int { - // +1 for the operator node group + // +2 for the operator and prometheus node groups // this is the number of outbound rules (there are half as many inbound rules, so that is not the limiting factor) - return 2 * (numNodeGroups + 1) + return 2 * (numNodeGroups + 2) } func requiredSecurityGroups(numNodeGroups int, clusterAlreadyExists bool) int { diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go index 26e5bfc90f..0a530fed0b 100644 --- a/pkg/types/clusterconfig/cluster_config.go +++ b/pkg/types/clusterconfig/cluster_config.go @@ -55,6 +55,11 @@ const ( ) var ( + _operatorNodeGroupInstanceType = "t3.medium" + _operatorNodeGroupRequiredOnDemand = int64(25) + _prometheusNodeGroupInstanceType = "t3.xlarge" + _prometheusNodeGroupRequiredOnDemand = int64(1) + _maxNodeGroupLengthWithPrefix = 32 _maxNodeGroupLength = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws- _maxInstancePools = 20 @@ -904,7 +909,16 @@ func (cc *Config) validate(awsClient *aws.Client) error { } ngNames := []string{} - instances := []aws.InstanceTypeRequests{} + instances := []aws.InstanceTypeRequests{ + { + InstanceType: _operatorNodeGroupInstanceType, + RequiredOnDemandInstances: int64(_operatorNodeGroupRequiredOnDemand), + }, + { + InstanceType: _prometheusNodeGroupInstanceType, + RequiredOnDemandInstances: int64(_prometheusNodeGroupRequiredOnDemand), + }, + } for _, nodeGroup := range cc.NodeGroups { // setting max_instances to 0 during cluster creation is not permitted (but scaling 
max_instances to 0 afterwards is allowed) if nodeGroup.MaxInstances == 0 { From a5be71fb08ed8e1c1423483d044af09c916559f8 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 18:57:48 +0300 Subject: [PATCH 17/25] Address cluster info pricing --- cli/cmd/cluster.go | 30 +++++++++------ cli/cmd/lib_cluster_config.go | 16 +++++--- manager/generate_eks.py | 1 + pkg/operator/endpoints/info.go | 69 ++++++++++++++++++++++++++++------ pkg/operator/schema/schema.go | 23 +++++++----- 5 files changed, 102 insertions(+), 37 deletions(-) diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 1616565b71..cd30b8a75a 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -770,11 +770,12 @@ func cmdInfo(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig, st infoInterface = infoResponse.ClusterConfig.Config } else { infoInterface = map[string]interface{}{ - "cluster_config": infoResponse.ClusterConfig.Config, - "cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata, - "node_infos": infoResponse.NodeInfos, - "endpoint_operator": operatorEndpoint, - "endpoint_api": apiEndpoint, + "cluster_config": infoResponse.ClusterConfig.Config, + "cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata, + "worker_node_infos": infoResponse.WorkerNodeInfos, + "operator_node_infos": infoResponse.OperatorNodeInfos, + "endpoint_operator": operatorEndpoint, + "endpoint_api": apiEndpoint, } } @@ -848,6 +849,8 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco eksPrice := aws.EKSPrices[clusterConfig.Region] operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 + prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.xlarge"].Price + prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24 nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price @@ -891,17 +894,20 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco totalNodeGroupsPrice += totalEBSPrice + totalInstancePrice } + operatorNodeGroupPrice := float64(len(infoResponse.OperatorNodeInfos)) * (operatorInstancePrice + operatorEBSPrice) + prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + var natTotalPrice float64 if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { natTotalPrice = natUnitPrice } else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway { natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones)) } - totalPrice := eksPrice + totalNodeGroupsPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + nlbPrice*2 + natTotalPrice + totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + nlbPrice*2 + natTotalPrice fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice)) - operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice - rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", 
len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)}) + rows = append(rows, []interface{}{"1 t3.xlarge instance (cortex system)", s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { @@ -919,11 +925,11 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco } func printInfoNodes(infoResponse *schema.InfoResponse) { - numAPIInstances := len(infoResponse.NodeInfos) + numAPIInstances := len(infoResponse.WorkerNodeInfos) var totalReplicas int var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool - for _, nodeInfo := range infoResponse.NodeInfos { + for _, nodeInfo := range infoResponse.WorkerNodeInfos { totalReplicas += nodeInfo.NumReplicas if nodeInfo.ComputeUserCapacity.GPU > 0 { doesClusterHaveGPUs = true @@ -946,7 +952,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { fmt.Printf(console.Bold("\nyour cluster has %d API %s running across %d %s%s\n"), totalReplicas, s.PluralS("replica", totalReplicas), numAPIInstances, s.PluralS("instance", numAPIInstances), pendingReplicasStr) - if len(infoResponse.NodeInfos) == 0 { + if len(infoResponse.WorkerNodeInfos) == 0 { return } @@ -963,7 +969,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { } var rows [][]interface{} - for _, nodeInfo := range infoResponse.NodeInfos { + for _, nodeInfo := range infoResponse.WorkerNodeInfos { lifecycle := "on-demand" if nodeInfo.IsSpot { lifecycle = "spot" diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go index 97a9b86058..39a00a642a 100644 --- a/cli/cmd/lib_cluster_config.go +++ b/cli/cmd/lib_cluster_config.go @@ -163,7 +163,9 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient *aws.Client, disallowPrompt bool) { eksPrice := aws.EKSPrices[clusterConfig.Region] operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price + prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.xlarge"].Price operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 + prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24 nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price @@ -184,9 +186,10 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)}) ngNameToSpotInstancesUsed := map[string]int{} - fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + 2*nlbPrice + natTotalPrice - totalMinPrice := fixedPrice - totalMaxPrice := fixedPrice + baseMinPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice + baseMaxPrice := eksPrice + 25*(operatorInstancePrice+operatorEBSPrice) + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice + totalMinPrice := baseMinPrice + totalMaxPrice := baseMaxPrice for _, ng := range clusterConfig.NodeGroups { 
apiInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][ng.InstanceType].Price apiEBSPrice := aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceGB * float64(ng.InstanceVolumeSize) / 30 / 24 @@ -223,8 +226,11 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{workerInstanceStr, workerPriceStr}) } - operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice - rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)}) + minOperatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice + maxOperatorNodeGroupPrice := 25 * (operatorInstancePrice + operatorEBSPrice) + prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + rows = append(rows, []interface{}{"1-25 t3.medium instances (cortex system)", fmt.Sprintf("%s - %s (depending on load)", s.DollarsAndTenthsOfCents(minOperatorNodeGroupPrice), s.DollarsAndTenthsOfCents(maxOperatorNodeGroupPrice))}) + rows = append(rows, []interface{}{"1 t3.xlarge instance (cortex system)", s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 261a54d92b..89f81b7dce 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -316,6 +316,7 @@ def generate_eks( "volumeSize": 20, "volumeIOPS": 3000, "volumeThroughput": 125, + "labels": {"operator": "true"}, } operator_nodegroup = merge_override(operator_nodegroup, operator_settings) diff --git a/pkg/operator/endpoints/info.go b/pkg/operator/endpoints/info.go index b83a6374e0..c210e74fe7 100644 --- a/pkg/operator/endpoints/info.go +++ b/pkg/operator/endpoints/info.go @@ -31,7 +31,13 @@ import ( ) func Info(w http.ResponseWriter, r *http.Request) { - nodeInfos, numPendingReplicas, err := getNodeInfos() + workerNodeInfos, numPendingReplicas, err := getWorkerNodeInfos() + if err != nil { + respondError(w, r, err) + return + } + + operatorNodeInfos, err := getOperatorNodeInfos() if err != nil { respondError(w, r, err) return @@ -44,13 +50,14 @@ func Info(w http.ResponseWriter, r *http.Request) { response := schema.InfoResponse{ ClusterConfig: fullClusterConfig, - NodeInfos: nodeInfos, + WorkerNodeInfos: workerNodeInfos, + OperatorNodeInfos: operatorNodeInfos, NumPendingReplicas: numPendingReplicas, } respondJSON(w, r, response) } -func getNodeInfos() ([]schema.NodeInfo, int, error) { +func getWorkerNodeInfos() ([]schema.WorkerNodeInfo, int, error) { pods, err := config.K8sAllNamspaces.ListPods(nil) if err != nil { return nil, 0, err @@ -61,8 +68,8 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { return nil, 0, err } - nodeInfoMap := make(map[string]*schema.NodeInfo, len(nodes)) // node name -> info - spotPriceCache := make(map[string]float64) // instance type -> spot price + nodeInfoMap := make(map[string]*schema.WorkerNodeInfo, len(nodes)) // node name -> info + spotPriceCache := make(map[string]float64) // instance type -> spot price for i := range nodes { node := nodes[i] @@ -86,12 +93,14 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { } } - nodeInfoMap[node.Name] = &schema.NodeInfo{ + nodeInfoMap[node.Name] = &schema.WorkerNodeInfo{ + NodeInfo: schema.NodeInfo{ + NodeGroupName: nodeGroupName, + InstanceType: instanceType, + IsSpot: isSpot, + 
Price: price, + }, Name: node.Name, - NodeGroupName: nodeGroupName, - InstanceType: instanceType, - IsSpot: isSpot, - Price: price, NumReplicas: 0, // will be added to below ComputeUserCapacity: nodeComputeAllocatable(&node), // will be subtracted from below ComputeAvailable: nodeComputeAllocatable(&node), // will be subtracted from below @@ -160,7 +169,7 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { sort.Strings(nodeNames) - nodeInfos := make([]schema.NodeInfo, len(nodeNames)) + nodeInfos := make([]schema.WorkerNodeInfo, len(nodeNames)) for i, nodeName := range nodeNames { nodeInfos[i] = *nodeInfoMap[nodeName] } @@ -179,3 +188,41 @@ func nodeComputeAllocatable(node *kcore.Node) userconfig.Compute { Inf: infQty.Value(), } } + +func getOperatorNodeInfos() ([]schema.NodeInfo, error) { + nodes, err := config.K8sAllNamspaces.ListNodesByLabel("operator", "true") + if err != nil { + return nil, err + } + + nodeInfoMap := make(map[string]*schema.NodeInfo, len(nodes)) // node name -> info + + for i := range nodes { + node := nodes[i] + + instanceType := node.Labels["beta.kubernetes.io/instance-type"] + nodeGroupName := node.Labels["alpha.eksctl.io/nodegroup-name"] + + price := aws.InstanceMetadatas[config.ClusterConfig.Region][instanceType].Price + + nodeInfoMap[node.Name] = &schema.NodeInfo{ + NodeGroupName: nodeGroupName, + InstanceType: instanceType, + Price: price, + } + } + + nodeNames := make([]string, 0, len(nodeInfoMap)) + for nodeName := range nodeInfoMap { + nodeNames = append(nodeNames, nodeName) + } + + sort.Strings(nodeNames) + + nodeInfos := make([]schema.NodeInfo, len(nodeNames)) + for i, nodeName := range nodeNames { + nodeInfos[i] = *nodeInfoMap[nodeName] + } + + return nodeInfos, nil +} diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 18988fa277..d86073e4e7 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -26,16 +26,14 @@ import ( type InfoResponse struct { ClusterConfig clusterconfig.InternalConfig `json:"cluster_config" yaml:"cluster_config"` - NodeInfos []NodeInfo `json:"node_infos" yaml:"node_infos"` + WorkerNodeInfos []WorkerNodeInfo `json:"worker_node_infos" yaml:"worker_node_infos"` + OperatorNodeInfos []NodeInfo `json:"operator_node_infos" yaml:"operator_node_infos"` NumPendingReplicas int `json:"num_pending_replicas" yaml:"num_pending_replicas"` } -type NodeInfo struct { +type WorkerNodeInfo struct { + NodeInfo Name string `json:"name" yaml:"name"` - NodeGroupName string `json:"nodegroup_name" yaml:"nodegroup_name"` - InstanceType string `json:"instance_type" yaml:"instance_type"` - IsSpot bool `json:"is_spot" yaml:"is_spot"` - Price float64 `json:"price" yaml:"price"` NumReplicas int `json:"num_replicas" yaml:"num_replicas"` NumAsyncGatewayReplicas int `json:"num_async_gateway_replicas" yaml:"num_async_gateway_replicas"` NumEnqueuerReplicas int `json:"num_enqueuer_replicas" yaml:"num_enqueuer_replicas"` @@ -44,6 +42,13 @@ type NodeInfo struct { ComputeUserRequested userconfig.Compute `json:"compute_user_requested" yaml:"compute_user_requested"` // total resources requested by user on a node } +type NodeInfo struct { + NodeGroupName string `json:"nodegroup_name" yaml:"nodegroup_name"` + InstanceType string `json:"instance_type" yaml:"instance_type"` + IsSpot bool `json:"is_spot" yaml:"is_spot"` + Price float64 `json:"price" yaml:"price"` +} + type DeployResult struct { API *APIResponse `json:"api"` Message string `json:"message"` @@ -97,9 +102,9 @@ type APIVersion struct { type 
VerifyCortexResponse struct{} -func (ir InfoResponse) GetNodesWithNodeGroupName(ngName string) []NodeInfo { - nodesInfo := []NodeInfo{} - for _, nodeInfo := range ir.NodeInfos { +func (ir InfoResponse) GetNodesWithNodeGroupName(ngName string) []WorkerNodeInfo { + nodesInfo := []WorkerNodeInfo{} + for _, nodeInfo := range ir.WorkerNodeInfos { if nodeInfo.NodeGroupName == ngName { nodesInfo = append(nodesInfo, nodeInfo) } From d0b22fc9c4b2c6790f98161c9a2accd8eb20cf0f Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 19:07:12 +0300 Subject: [PATCH 18/25] Let the node exporter run on every node --- manager/manifests/prometheus-node-exporter.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index 2ea1881eb7..b58e92a5d9 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -153,15 +153,12 @@ spec: hostPID: true nodeSelector: kubernetes.io/os: linux - prometheus: "true" securityContext: runAsNonRoot: true runAsUser: 65534 serviceAccountName: node-exporter tolerations: - - key: prometheus - operator: Exists - effect: NoSchedule + - operator: Exists volumes: - hostPath: path: /sys From dc79cbfd1860f96416540bcc7ec1a18a1e701c17 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 19:30:28 +0300 Subject: [PATCH 19/25] Fix istio hpa --- manager/manifests/istio.yaml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index 811bfbf1d2..629f49bb12 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -146,6 +146,7 @@ spec: resource: name: cpu targetAverageUtilization: 90 + - type: Resource resource: name: mem targetAverageUtilization: 90 From d2c26e59ec67a869b82e812fec0993d13e58ec63 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 21:07:16 +0300 Subject: [PATCH 20/25] Change resource requests/limits --- manager/manifests/inferentia.yaml | 2 +- manager/manifests/prometheus-node-exporter.yaml | 2 +- pkg/crds/config/manager/manager.yaml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/manager/manifests/inferentia.yaml b/manager/manifests/inferentia.yaml index eb6a7db974..51331f6715 100644 --- a/manager/manifests/inferentia.yaml +++ b/manager/manifests/inferentia.yaml @@ -140,7 +140,7 @@ spec: value: "12345" resources: requests: - cpu: 100m + cpu: 50m memory: 100Mi --- diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index b58e92a5d9..9c3a483f81 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -115,7 +115,7 @@ spec: cpu: 250m memory: 180Mi requests: - cpu: 100m + cpu: 40m memory: 180Mi volumeMounts: - mountPath: /host/sys diff --git a/pkg/crds/config/manager/manager.yaml b/pkg/crds/config/manager/manager.yaml index 75f03b40d3..48fb8ad0e1 100644 --- a/pkg/crds/config/manager/manager.yaml +++ b/pkg/crds/config/manager/manager.yaml @@ -46,10 +46,10 @@ spec: periodSeconds: 10 resources: limits: - cpu: 200m + cpu: 300m memory: 100Mi requests: - cpu: 200m + cpu: 100m memory: 80Mi volumeMounts: - mountPath: /mnt/cluster.yaml From 25f51668dcdf532a007d270585367c54aa648c63 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 22:52:36 +0300 Subject: [PATCH 21/25] Have the prometheus instance type 
 configurable

---
 cli/cmd/cluster.go                        | 4 +-
 cli/cmd/lib_cluster_config.go             | 4 +-
 docs/clusters/management/create.md        | 6 ++-
 manager/generate_eks.py                   | 5 ++-
 pkg/lib/aws/ec2.go                        | 40 ++++++++++++++++++
 pkg/types/clusterconfig/cluster_config.go | 49 +++++++++++++++++++----
 pkg/types/clusterconfig/config_key.go     | 1 +
 pkg/types/clusterconfig/errors.go         | 16 ++++++++
 8 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index cd30b8a75a..7d87bf7d3f 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -849,7 +849,7 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
 	eksPrice := aws.EKSPrices[clusterConfig.Region]
 	operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
 	operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
-	prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.xlarge"].Price
+	prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
 	prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
 	metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
 	nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
@@ -907,7 +907,7 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
 	fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))
 
 	rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
-	rows = append(rows, []interface{}{"1 t3.xlarge instance (cortex system)", s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
+	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (cortex system)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
 	rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"})
 
 	if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go
index 39a00a642a..5579e78b35 100644
--- a/cli/cmd/lib_cluster_config.go
+++ b/cli/cmd/lib_cluster_config.go
@@ -163,7 +163,7 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste
 func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient *aws.Client, disallowPrompt bool) {
 	eksPrice := aws.EKSPrices[clusterConfig.Region]
 	operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
-	prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.xlarge"].Price
+	prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
 	operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
 	prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
 	metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
@@ -230,7 +230,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
 	maxOperatorNodeGroupPrice := 25 * (operatorInstancePrice + operatorEBSPrice)
 	prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
[]interface{}{"1-25 t3.medium instances (cortex system)", fmt.Sprintf("%s - %s (depending on load)", s.DollarsAndTenthsOfCents(minOperatorNodeGroupPrice), s.DollarsAndTenthsOfCents(maxOperatorNodeGroupPrice))}) - rows = append(rows, []interface{}{"1 t3.xlarge instance (cortex system)", s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (cortex system)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { diff --git a/docs/clusters/management/create.md b/docs/clusters/management/create.md index a3abecdd85..8bed03e645 100644 --- a/docs/clusters/management/create.md +++ b/docs/clusters/management/create.md @@ -86,11 +86,15 @@ tags: # : map of key/value pairs # SSL certificate ARN (only necessary when using a custom domain) ssl_certificate_arn: -# List of IAM policies to attach to your Cortex APIs +# list of IAM policies to attach to your Cortex APIs iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"] # primary CIDR block for the cluster's VPC vpc_cidr: 192.168.0.0/16 + +# instance type for prometheus +# use a bigger instance if the cluster is expected to grow past the thousand node (or api replica) threshold +prometheus_instance_type: "t3.medium" ``` The docker images used by the cluster can also be overridden. They can be configured by adding any of these keys to your cluster configuration file (default values are shown): diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 89f81b7dce..aa22c9a9a9 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -273,6 +273,7 @@ def generate_eks( cluster_config = yaml.safe_load(cluster_config_file) region = cluster_config["region"] name = cluster_config["cluster_name"] + prometheus_instance_type = cluster_config["prometheus_instance_type"] ami_map = json.load(ami_json_file)[K8S_VERSION][region] eks = { @@ -322,9 +323,9 @@ def generate_eks( prometheus_nodegroup = default_nodegroup(cluster_config) prometheus_settings = { - "ami": get_ami(ami_map, "t3.xlarge"), + "ami": get_ami(ami_map, prometheus_instance_type), "name": "cx-prometheus", - "instanceType": "t3.xlarge", + "instanceType": prometheus_instance_type, "minSize": 1, "maxSize": 1, "desiredCapacity": 1, diff --git a/pkg/lib/aws/ec2.go b/pkg/lib/aws/ec2.go index 338a719c91..66b2f94fb6 100644 --- a/pkg/lib/aws/ec2.go +++ b/pkg/lib/aws/ec2.go @@ -137,6 +137,46 @@ func IsAMDGPUInstance(instanceType string) (bool, error) { return false, nil } +func IsNvidiaGPUInstance(instanceType string) (bool, error) { + parsedType, err := ParseInstanceType(instanceType) + if err != nil { + return false, err + } + + if !_gpuInstanceFamilies.Has(parsedType.Family) { + return false, nil + } + + if !parsedType.Capabilities.Has("a") { + return true, nil + } + + return false, nil +} + +func IsGPUInstance(instanceType string) (bool, error) { + isAMDGPU, err := IsAMDGPUInstance(instanceType) + if err != nil { + return false, err + } + + isNvidiaGPU, err := IsNvidiaGPUInstance(instanceType) + if err != nil { + return false, err + } + + return isAMDGPU || isNvidiaGPU, nil +} + +func IsInferentiaInstance(instanceType string) (bool, error) { + parsedType, err := ParseInstanceType(instanceType) + if err != nil { + return false, err + } + + return parsedType.Family == "inf", nil +} + func IsMacInstance(instanceType 
 func IsMacInstance(instanceType string) (bool, error) {
 	parsedType, err := ParseInstanceType(instanceType)
 	if err != nil {
diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go
index 0a530fed0b..a0b5d646c3 100644
--- a/pkg/types/clusterconfig/cluster_config.go
+++ b/pkg/types/clusterconfig/cluster_config.go
@@ -55,10 +55,8 @@ const (
 )
 
 var (
-	_operatorNodeGroupInstanceType       = "t3.medium"
-	_operatorNodeGroupRequiredOnDemand   = int64(25)
-	_prometheusNodeGroupInstanceType     = "t3.xlarge"
-	_prometheusNodeGroupRequiredOnDemand = int64(1)
+	_operatorNodeGroupInstanceType     = "t3.medium"
+	_operatorNodeGroupRequiredOnDemand = int64(25)
 
 	_maxNodeGroupLengthWithPrefix = 32
 	_maxNodeGroupLength           = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws-
@@ -88,8 +86,9 @@ type CoreConfig struct {
 	IstioNamespace string `json:"istio_namespace" yaml:"istio_namespace"`
 
 	// User-specifiable fields
-	ClusterName string `json:"cluster_name" yaml:"cluster_name"`
-	Region      string `json:"region" yaml:"region"`
+	ClusterName            string `json:"cluster_name" yaml:"cluster_name"`
+	Region                 string `json:"region" yaml:"region"`
+	PrometheusInstanceType string `json:"prometheus_instance_type" yaml:"prometheus_instance_type"`
 
 	// User-specifiable fields
 	ImageOperator string `json:"image_operator" yaml:"image_operator"`
@@ -336,6 +335,14 @@ var CoreConfigStructFieldValidations = []*cr.StructFieldValidation{
 			Validator: RegionValidator,
 		},
 	},
+	{
+		StructField: "PrometheusInstanceType",
+		StringValidation: &cr.StringValidation{
+			MinLength: 1,
+			Default:   "t3.medium",
+			Validator: validatePrometheusInstanceType,
+		},
+	},
 	{
 		StructField: "Telemetry",
 		BoolValidation: &cr.BoolValidation{
@@ -915,8 +922,8 @@ func (cc *Config) validate(awsClient *aws.Client) error {
 			RequiredOnDemandInstances: int64(_operatorNodeGroupRequiredOnDemand),
 		},
 		{
-			InstanceType:              _prometheusNodeGroupInstanceType,
-			RequiredOnDemandInstances: int64(_prometheusNodeGroupRequiredOnDemand),
+			InstanceType:              cc.PrometheusInstanceType,
+			RequiredOnDemandInstances: 1,
 		},
 	}
 	for _, nodeGroup := range cc.NodeGroups {
@@ -1486,6 +1493,31 @@ func validateInstanceType(instanceType string) (string, error) {
 	return instanceType, nil
 }
 
+func validatePrometheusInstanceType(instanceType string) (string, error) {
+	_, err := validateInstanceType(instanceType)
+	if err != nil {
+		return "", err
+	}
+
+	isNvidiaGPU, err := aws.IsNvidiaGPUInstance(instanceType)
+	if err != nil {
+		return "", err
+	}
+	if isNvidiaGPU {
+		return "", ErrorNvidiaGPUInstancesNotSupported(instanceType)
+	}
+
+	isInf, err := aws.IsInferentiaInstance(instanceType)
+	if err != nil {
+		return "", err
+	}
+	if isInf {
+		return "", ErrorInferentiaInstancesNotSupported(instanceType)
+	}
+
+	return instanceType, nil
+}
+
 func validateInstanceDistribution(instances []string) ([]string, error) {
 	for _, instance := range instances {
 		_, err := validateInstanceType(instance)
@@ -1627,6 +1659,7 @@ func (cc *CoreConfig) TelemetryEvent() map[string]interface{} {
 	}
 
 	event["region"] = cc.Region
+	event["prometheus_instance_type"] = cc.PrometheusInstanceType
 
 	if !strings.HasPrefix(cc.ImageOperator, "cortexlabs/") {
 		event["image_operator._is_custom"] = true
diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go
index 19df34d3f2..2495cbdba1 100644
--- a/pkg/types/clusterconfig/config_key.go
+++ b/pkg/types/clusterconfig/config_key.go
@@ -22,6 +22,7 @@ const (
 	ClusterNameKey            = "cluster_name"
 	RegionKey                 = "region"
+	PrometheusInstanceTypeKey = "prometheus_instance_type"
"node_groups" InstanceTypeKey = "instance_type" AcceleratorTypeKey = "accelerator_type" diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index b5126d94bc..45cb0d7632 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -46,6 +46,8 @@ const ( ErrSpotPriceGreaterThanMaxPrice = "clusterconfig.spot_price_greater_than_max_price" ErrInstanceTypeNotSupportedByCortex = "clusterconfig.instance_type_not_supported_by_cortex" ErrAMDGPUInstancesNotSupported = "clusterconfig.amd_gpu_instances_not_supported" + ErrNvidiaGPUInstancesNotSupported = "clusterconfig.nvidia_gpu_instance_not_supported" + ErrInferentiaInstancesNotSupported = "clusterconfig.inferentia_instances_not_supported" ErrMacInstancesNotSupported = "clusterconfig.mac_instances_not_supported" ErrAtLeastOneInstanceDistribution = "clusterconfig.at_least_one_instance_distribution" ErrNoCompatibleSpotInstanceFound = "clusterconfig.no_compatible_spot_instance_found" @@ -202,6 +204,20 @@ func ErrorAMDGPUInstancesNotSupported(instanceType string) error { }) } +func ErrorNvidiaGPUInstancesNotSupported(instanceType string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrNvidiaGPUInstancesNotSupported, + Message: fmt.Sprintf("Nvidia GPU instances (including %s) are not supported by cortex", instanceType), + }) +} + +func ErrorInferentiaInstancesNotSupported(instanceType string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrInferentiaInstancesNotSupported, + Message: fmt.Sprintf("Inferentia instances (including %s) are not supported by cortex", instanceType), + }) +} + func ErrorMacInstancesNotSupported(instanceType string) error { return errors.WithStack(&errors.Error{ Kind: ErrMacInstancesNotSupported, From 21a5d9ccf0279b8dd2bce1eaab140690dba4a608 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 23:00:20 +0300 Subject: [PATCH 22/25] Nits --- docs/clusters/management/create.md | 2 +- pkg/types/clusterconfig/cluster_config.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/clusters/management/create.md b/docs/clusters/management/create.md index 8bed03e645..c63332b302 100644 --- a/docs/clusters/management/create.md +++ b/docs/clusters/management/create.md @@ -93,7 +93,7 @@ iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"] vpc_cidr: 192.168.0.0/16 # instance type for prometheus -# use a bigger instance if the cluster is expected to grow past the thousand node (or api replica) threshold +# use a bigger instance if the cluster is expected to grow past the thousand node (or api replica) mark prometheus_instance_type: "t3.medium" ``` diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go index a0b5d646c3..c4647e8559 100644 --- a/pkg/types/clusterconfig/cluster_config.go +++ b/pkg/types/clusterconfig/cluster_config.go @@ -55,8 +55,8 @@ const ( ) var ( - _operatorNodeGroupInstanceType = "t3.medium" - _operatorNodeGroupRequiredOnDemand = int64(25) + _operatorNodeGroupInstanceType = "t3.medium" + _operatorNodeGroupMaxRequiredOnDemand = int64(25) _maxNodeGroupLengthWithPrefix = 32 _maxNodeGroupLength = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws- @@ -919,7 +919,7 @@ func (cc *Config) validate(awsClient *aws.Client) error { instances := []aws.InstanceTypeRequests{ { InstanceType: _operatorNodeGroupInstanceType, - RequiredOnDemandInstances: int64(_operatorNodeGroupRequiredOnDemand), + RequiredOnDemandInstances: 
+			RequiredOnDemandInstances: int64(_operatorNodeGroupMaxRequiredOnDemand),
 		},
 		{
 			InstanceType: cc.PrometheusInstanceType,

From 65781cb386bb37b43a8743ba7c622af74d44cbf2 Mon Sep 17 00:00:00 2001
From: David Eliahu
Date: Fri, 2 Jul 2021 13:30:56 -0700
Subject: [PATCH 23/25] Update create.md

---
 docs/clusters/management/create.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/clusters/management/create.md b/docs/clusters/management/create.md
index c63332b302..b65e2dfa84 100644
--- a/docs/clusters/management/create.md
+++ b/docs/clusters/management/create.md
@@ -92,8 +92,7 @@ iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"]
 # primary CIDR block for the cluster's VPC
 vpc_cidr: 192.168.0.0/16
 
-# instance type for prometheus
-# use a bigger instance if the cluster is expected to grow past the thousand node (or api replica) mark
+# instance type for prometheus (use a larger instance for clusters exceeding 500 nodes)
 prometheus_instance_type: "t3.medium"
 ```

From dcff049fb0c6ecb8131f7f62dca245f220d6a294 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Sat, 3 Jul 2021 00:31:43 +0300
Subject: [PATCH 24/25] Address some PR comments

---
 cli/cmd/cluster.go                    | 2 +-
 cli/cmd/lib_cluster_config.go         | 2 +-
 manager/manifests/event-exporter.yaml | 4 ++++
 manager/manifests/istio.yaml.j2       | 2 +-
 manager/manifests/metrics-server.yaml | 7 +++++--
 5 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index 7d87bf7d3f..375f1ca0d8 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -907,7 +907,7 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
 	fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))
 
 	rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
-	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (cortex system)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
+	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
 	rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"})
 
 	if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go
index f2e47e71e6..cba8007dd2 100644
--- a/cli/cmd/lib_cluster_config.go
+++ b/cli/cmd/lib_cluster_config.go
@@ -230,7 +230,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
 	maxOperatorNodeGroupPrice := 25 * (operatorInstancePrice + operatorEBSPrice)
 	prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
 	rows = append(rows, []interface{}{"1-25 t3.medium instances (cortex system)", fmt.Sprintf("%s - %s (depending on load)", s.DollarsAndTenthsOfCents(minOperatorNodeGroupPrice), s.DollarsAndTenthsOfCents(maxOperatorNodeGroupPrice))})
-	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (cortex system)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
+	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
[]interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { diff --git a/manager/manifests/event-exporter.yaml b/manager/manifests/event-exporter.yaml index ae0957d9aa..ab4847c4a7 100644 --- a/manager/manifests/event-exporter.yaml +++ b/manager/manifests/event-exporter.yaml @@ -82,6 +82,10 @@ spec: volumeMounts: - mountPath: /data name: event-exporter-config + resources: + requests: + cpu: 20m + memory: 50Mi volumes: - name: event-exporter-config configMap: diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index 7f01fb2529..0355896a89 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -128,7 +128,7 @@ spec: targetPort: 15443 resources: requests: - cpu: 300m + cpu: 512m memory: 128Mi limits: cpu: 1500m diff --git a/manager/manifests/metrics-server.yaml b/manager/manifests/metrics-server.yaml index bdaad365d9..6e4e0de8bb 100644 --- a/manager/manifests/metrics-server.yaml +++ b/manager/manifests/metrics-server.yaml @@ -174,8 +174,11 @@ spec: periodSeconds: 10 resources: requests: - cpu: 100m - memory: 200Mi + cpu: 50m + memory: 100Mi + limits: + cpu: 200m + memory: 500Mi securityContext: readOnlyRootFilesystem: true runAsNonRoot: true From f64d0facf2f67a8ed52f1f4b4414facdc22f74ce Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 3 Jul 2021 01:24:12 +0300 Subject: [PATCH 25/25] Address PR comments --- cli/cmd/lib_cluster_config.go | 12 +++++------- manager/manifests/istio.yaml.j2 | 2 +- pkg/types/clusterconfig/cluster_config.go | 11 +++++------ pkg/types/clusterconfig/errors.go | 10 +++++----- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go index cba8007dd2..1da6de2f7a 100644 --- a/cli/cmd/lib_cluster_config.go +++ b/cli/cmd/lib_cluster_config.go @@ -186,10 +186,9 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)}) ngNameToSpotInstancesUsed := map[string]int{} - baseMinPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice - baseMaxPrice := eksPrice + 25*(operatorInstancePrice+operatorEBSPrice) + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice - totalMinPrice := baseMinPrice - totalMaxPrice := baseMaxPrice + fixedPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice + totalMinPrice := fixedPrice + totalMaxPrice := fixedPrice for _, ng := range clusterConfig.NodeGroups { apiInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][ng.InstanceType].Price apiEBSPrice := aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceGB * float64(ng.InstanceVolumeSize) / 30 / 24 @@ -226,10 +225,9 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{workerInstanceStr, workerPriceStr}) } - minOperatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice - maxOperatorNodeGroupPrice := 25 * (operatorInstancePrice + operatorEBSPrice) + operatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice - rows = append(rows, 
[]interface{}{"1-25 t3.medium instances (cortex system)", fmt.Sprintf("%s - %s (depending on load)", s.DollarsAndTenthsOfCents(minOperatorNodeGroupPrice), s.DollarsAndTenthsOfCents(maxOperatorNodeGroupPrice))}) + rows = append(rows, []interface{}{"1 t3.medium instance (cortex system)", s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)}) rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index 0355896a89..5b4316ca8e 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -163,7 +163,7 @@ spec: gateways: istio-ingressgateway: runAsRoot: true - autoscaleEnabled: false + autoscaleEnabled: true secretVolumes: - name: customgateway-certs secretName: istio-customgateway-certs diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go index ca850ed6f7..93beb9a32c 100644 --- a/pkg/types/clusterconfig/cluster_config.go +++ b/pkg/types/clusterconfig/cluster_config.go @@ -56,8 +56,7 @@ const ( ) var ( - _operatorNodeGroupInstanceType = "t3.medium" - _operatorNodeGroupMaxRequiredOnDemand = int64(25) + _operatorNodeGroupInstanceType = "t3.medium" _maxNodeGroupLengthWithPrefix = 32 _maxNodeGroupLength = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws- @@ -921,7 +920,7 @@ func (cc *Config) validate(awsClient *aws.Client) error { instances := []aws.InstanceTypeRequests{ { InstanceType: _operatorNodeGroupInstanceType, - RequiredOnDemandInstances: int64(_operatorNodeGroupMaxRequiredOnDemand), + RequiredOnDemandInstances: 1, }, { InstanceType: cc.PrometheusInstanceType, @@ -1518,12 +1517,12 @@ func validatePrometheusInstanceType(instanceType string) (string, error) { return "", err } - isNvidiaGPU, err := aws.IsNvidiaGPUInstance(instanceType) + isGPU, err := aws.IsGPUInstance(instanceType) if err != nil { return "", err } - if isNvidiaGPU { - return "", ErrorNvidiaGPUInstancesNotSupported(instanceType) + if isGPU { + return "", ErrorGPUInstancesNotSupported(instanceType) } isInf, err := aws.IsInferentiaInstance(instanceType) diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index cbae233b8f..56d1be5115 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -46,7 +46,7 @@ const ( ErrSpotPriceGreaterThanMaxPrice = "clusterconfig.spot_price_greater_than_max_price" ErrInstanceTypeNotSupportedByCortex = "clusterconfig.instance_type_not_supported_by_cortex" ErrAMDGPUInstancesNotSupported = "clusterconfig.amd_gpu_instances_not_supported" - ErrNvidiaGPUInstancesNotSupported = "clusterconfig.nvidia_gpu_instance_not_supported" + ErrGPUInstancesNotSupported = "clusterconfig.gpu_instance_not_supported" ErrInferentiaInstancesNotSupported = "clusterconfig.inferentia_instances_not_supported" ErrMacInstancesNotSupported = "clusterconfig.mac_instances_not_supported" ErrAtLeastOneInstanceDistribution = "clusterconfig.at_least_one_instance_distribution" @@ -204,17 +204,17 @@ func ErrorAMDGPUInstancesNotSupported(instanceType string) error { }) } -func ErrorNvidiaGPUInstancesNotSupported(instanceType string) error { +func ErrorGPUInstancesNotSupported(instanceType string) error { return errors.WithStack(&errors.Error{ - Kind: ErrNvidiaGPUInstancesNotSupported, - Message: 
fmt.Sprintf("Nvidia GPU instances (including %s) are not supported by cortex", instanceType), + Kind: ErrGPUInstancesNotSupported, + Message: fmt.Sprintf("GPU instances (including %s) are not supported", instanceType), }) } func ErrorInferentiaInstancesNotSupported(instanceType string) error { return errors.WithStack(&errors.Error{ Kind: ErrInferentiaInstancesNotSupported, - Message: fmt.Sprintf("Inferentia instances (including %s) are not supported by cortex", instanceType), + Message: fmt.Sprintf("Inferentia instances (including %s) are not supported", instanceType), }) }