From cc1842f7d386ee34958f7d9a8c721d949deeb998 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 28 Jun 2021 23:46:36 +0300 Subject: [PATCH 01/25] Experiment with dropping metrics/labels --- .../manifests/prometheus-node-exporter.yaml | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index c1a6b0b4a0..cc32089078 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -191,6 +191,59 @@ spec: sourceLabels: - __meta_kubernetes_pod_node_name targetLabel: instance + metricRelabelings: + - action: keeplabel + sourceLabels: [__name__, instance, job] + regex: "node_(\ + cpu_seconds_total|\ + load1|\ + load5|\ + load15|\ + memory_MemTotal_bytes|\ + memory_MemFree_bytes|\ + memory_Buffers_bytes|\ + memory_Cached_bytes|\ + memory_MemAvailable_bytes|\ + disk_read_bytes_total|\ + disk_written_bytes_total|\ + disk_io_time_seconds_total|\ + disk_io_time_weighted_seconds|\ + filesystem_size_bytes|\ + filesystem_avail_bytes|\ + network_receive_bytes_total|\ + network_transmit_bytes_total|\ + network_receive_drop_total|\ + network_transmit_drop_total\ + );(.+)" + - action: keep + sourceLabels: [__name__, device] + regex: "node_(\ + disk_read_bytes_total|\ + disk_written_bytes_total|\ + disk_io_time_seconds_total|\ + disk_io_time_weighted_seconds|\ + filesystem_size_bytes|\ + filesystem_avail_bytes|\ + network_receive_bytes_total|\ + network_transmit_bytes_total|\ + network_receive_drop_total|\ + network_transmit_drop_total\ + );(.+)" + - action: keep + sourceLabels: [__name__, fstype] + regex: "node_(\ + node_filesystem_size_bytes|\ + node_filesystem_avail_bytes\ + );(.+)" + - action: keep + sourceLabels: [__name__, mountpoint] + regex: (node_filesystem_size_bytes);(.+) + - action: keep + sourceLabels: [__name__, mode] + regex: (node_cpu_seconds_total);(.+) + - action: keep + sourceLabels: [__name__] + regex: node_vmstat_pgmajfault scheme: https tlsConfig: insecureSkipVerify: true From ee7c31a77ab8e8341ad147ccde7d7b978ee8b2a9 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 28 Jun 2021 23:46:52 +0300 Subject: [PATCH 02/25] Fix the dropping of metrics/labels for the node exporter --- .../manifests/prometheus-node-exporter.yaml | 38 +++---------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index cc32089078..4a5ec20cbc 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -192,8 +192,8 @@ spec: - __meta_kubernetes_pod_node_name targetLabel: instance metricRelabelings: - - action: keeplabel - sourceLabels: [__name__, instance, job] + - action: keep + sourceLabels: [__name__] regex: "node_(\ cpu_seconds_total|\ load1|\ @@ -214,36 +214,10 @@ spec: network_transmit_bytes_total|\ network_receive_drop_total|\ network_transmit_drop_total\ - );(.+)" - - action: keep - sourceLabels: [__name__, device] - regex: "node_(\ - disk_read_bytes_total|\ - disk_written_bytes_total|\ - disk_io_time_seconds_total|\ - disk_io_time_weighted_seconds|\ - filesystem_size_bytes|\ - filesystem_avail_bytes|\ - network_receive_bytes_total|\ - network_transmit_bytes_total|\ - network_receive_drop_total|\ - network_transmit_drop_total\ - );(.+)" - - action: keep - sourceLabels: [__name__, fstype] - regex: "node_(\ - node_filesystem_size_bytes|\ - 
node_filesystem_avail_bytes\ - );(.+)" - - action: keep - sourceLabels: [__name__, mountpoint] - regex: (node_filesystem_size_bytes);(.+) - - action: keep - sourceLabels: [__name__, mode] - regex: (node_cpu_seconds_total);(.+) - - action: keep - sourceLabels: [__name__] - regex: node_vmstat_pgmajfault + node_vmstat_pgmajfault\ + )" + - action: labelkeep + regex: (__name__|instance|job|device|fstype|mountpoint|mode) scheme: https tlsConfig: insecureSkipVerify: true From 4ed459ed09fa0014a222a2066eefa5eae5a22773 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 29 Jun 2021 00:41:10 +0300 Subject: [PATCH 03/25] Drop unnecessary metrics/labels from kubelet --- .../prometheus-kubelet-exporter.yaml | 47 +++++-------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml index 8677500ef9..8982706c42 100644 --- a/manager/manifests/prometheus-kubelet-exporter.yaml +++ b/manager/manifests/prometheus-kubelet-exporter.yaml @@ -27,37 +27,7 @@ spec: interval: 30s metricRelabelings: - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: 
(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_late
ncy_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ + sourceLabels: [__name__] port: https-metrics relabelings: - sourceLabels: @@ -71,10 +41,14 @@ spec: honorTimestamps: false interval: 30s metricRelabelings: - - action: drop - regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) - sourceLabels: - - __name__ + - action: keep + sourceLabels: [__name__] + regex: "container_(\ + cpu_usage_seconds_total|\ + memory_working_set_bytes\ + )" + - action: labelkeep + regex: (__name__|pod|container|name) path: /metrics/cadvisor port: https-metrics relabelings: @@ -93,6 +67,9 @@ spec: - sourceLabels: - __metrics_path__ targetLabel: metrics_path + metricRelabelings: + - action: drop + sourceLabels: [__name__] scheme: https tlsConfig: insecureSkipVerify: true From 146c4e858b75601cbaedff3172e194d8355e754a Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 29 Jun 2021 00:52:21 +0300 Subject: [PATCH 04/25] Drop unnecessary kube-state-metrics metrics/labels --- manager/manifests/prometheus-kube-state-metrics.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index 135e5ffdf8..ffa7f92b31 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -270,6 +270,18 @@ spec: scheme: http path: /metrics interval: 30s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "kube_(\ + pod_container_resource_requests_cpu_cores|\ + pod_container_resource_requests_memory_bytes|\ + pod_info|\ + deployment_status_replicas_available|\ + job_status_active\ + )" + - action: labelkeep + regex: (__name__|exported_pod|job_name) namespaceSelector: any: true selector: From 2f2649743932ed6333599f5721b527252ee7a2a6 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 29 Jun 2021 00:55:30 +0300 Subject: [PATCH 05/25] Drop unnecessary metrics/labels from DCGM exporter --- manager/manifests/prometheus-dcgm-exporter.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index 3db31338ce..a025b1bc55 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -131,6 +131,16 @@ spec: path: /metrics scheme: http interval: 15s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "DCGM_FI_DEV_(\ + GPU_UTIL|\ + FB_USED|\ + FB_FREE\ + )" + - action: labelkeep + regex: (__name__|exported_pod) namespaceSelector: any: true selector: From 39a639ad17a525f8bd5e950bcbcf3e664c1cabba Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 29 Jun 2021 15:33:44 +0300 Subject: [PATCH 06/25] Remove unnecessary metrics/labels from istio --- manager/manifests/prometheus-monitoring.yaml.j2 | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index fa9aefe277..37e1b57da9 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -160,6 +160,18 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "istio_(\ + requests_total|\ + 
request_duration_milliseconds_bucket|\
+        request_duration_milliseconds_sum|\
+        request_duration_milliseconds_count\
+        )"
+      - action: labeldrop
+        sourceLabels: [__name__]
+        regex: (__name__|destination_service_name|response_code)
 
 ---
 

From d4739a08dad59a0e7e401ac669d447ddfeaa1422 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Tue, 29 Jun 2021 16:03:53 +0300
Subject: [PATCH 07/25] Change labeldrop to labelkeep

---
 manager/manifests/prometheus-monitoring.yaml.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2
index 37e1b57da9..3fc137b897 100644
--- a/manager/manifests/prometheus-monitoring.yaml.j2
+++ b/manager/manifests/prometheus-monitoring.yaml.j2
@@ -169,7 +169,7 @@ spec:
         request_duration_milliseconds_sum|\
         request_duration_milliseconds_count\
         )"
-      - action: labeldrop
+      - action: labelkeep
         sourceLabels: [__name__]
         regex: (__name__|destination_service_name|response_code)

From 488af33ded3d19d1caecea101f4869839e8a430e Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Tue, 29 Jun 2021 16:38:52 +0300
Subject: [PATCH 08/25] Add development docs

---
 dev/prometheus.md | 207 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 207 insertions(+)
 create mode 100644 dev/prometheus.md

diff --git a/dev/prometheus.md b/dev/prometheus.md
new file mode 100644
index 0000000000..7be8e0834e
--- /dev/null
+++ b/dev/prometheus.md
@@ -0,0 +1,207 @@
+# Metrics
+
+## Updating metrics
+
+When new metrics, labels, or exporters are added for Prometheus to scrape, make sure the following list **is updated** as well, to keep track of which metrics/labels are needed and which are not.
+
+The following is a list of metrics that are currently in use.
+
+#### Cortex metrics
+
+1. cortex_in_flight_requests with the following labels:
+    1. api_name
+1. cortex_async_request_count with the following labels:
+    1. api_name
+    1. api_kind
+    1. status_code
+1. cortex_async_queue_length with the following labels:
+    1. api_name
+    1. api_kind
+1. cortex_async_latency_bucket with the following labels:
+    1. api_name
+    1. api_kind
+1. cortex_batch_succeeded with the following labels:
+    1. api_name
+1. cortex_batch_failed with the following labels:
+    1. api_name
+1. cortex_time_per_batch_sum with the following labels:
+    1. api_name
+1. cortex_time_per_batch_count with the following labels:
+    1. api_name
+
+#### Istio metrics
+
+1. istio_requests_total with the following labels:
+    1. destination_service_name
+    1. response_code
+1. istio_request_duration_milliseconds_bucket with the following labels:
+    1. destination_service_name
+1. istio_request_duration_milliseconds_sum with the following labels:
+    1. destination_service_name
+1. istio_request_duration_milliseconds_count with the following labels:
+    1. destination_service_name
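+
+As an example of how these metrics and labels get consumed, a typical latency panel is driven by a histogram query along the lines of the sketch below (`my-api` is a hypothetical API name; note that such queries also depend on the histogram's `le` label):
+
+```
+histogram_quantile(
+  0.9,
+  sum by (le) (
+    rate(istio_request_duration_milliseconds_bucket{destination_service_name="my-api"}[1m])
+  )
+)
+```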
+
+#### Kubelet metrics
+
+1. container_cpu_usage_seconds_total with the following labels:
+    1. pod
+    1. container
+    1. name
+1. container_memory_working_set_bytes with the following labels:
+    1. pod
+    1. name
+    1. container
+
+#### Kube-state-metrics metrics
+
+1. kube_pod_container_resource_requests_cpu_cores with the following labels:
+    1. exported_pod
+1. kube_pod_container_resource_requests_memory_bytes with the following labels:
+    1. exported_pod
+1. kube_pod_info with the following labels:
+    1. exported_pod
+1. kube_deployment_status_replicas_available with the following labels:
+    1. deployment
+1. kube_job_status_active with the following labels:
+    1. job_name
+
+#### DCGM metrics
+
+1. DCGM_FI_DEV_GPU_UTIL with the following labels:
+    1. exported_pod
+1. DCGM_FI_DEV_FB_USED with the following labels:
+    1. exported_pod
+1. DCGM_FI_DEV_FB_FREE with the following labels:
+    1. exported_pod
+
+#### Node metrics
+
+1. node_cpu_seconds_total with the following labels:
+    1. job
+    1. mode
+    1. instance
+    1. cpu
+1. node_load1 with the following labels:
+    1. job
+    1. instance
+1. node_load5 with the following labels:
+    1. job
+    1. instance
+1. node_load15 with the following labels:
+    1. job
+    1. instance
+1. node_memory_MemTotal_bytes with the following labels:
+    1. job
+    1. instance
+1. node_memory_MemFree_bytes with the following labels:
+    1. job
+    1. instance
+1. node_memory_Buffers_bytes with the following labels:
+    1. job
+    1. instance
+1. node_memory_Cached_bytes with the following labels:
+    1. job
+    1. instance
+1. node_memory_MemAvailable_bytes with the following labels:
+    1. job
+    1. instance
+1. node_disk_read_bytes_total with the following labels:
+    1. job
+    1. instance
+    1. device
+1. node_disk_written_bytes_total with the following labels:
+    1. job
+    1. instance
+    1. device
+1. node_disk_io_time_seconds_total with the following labels:
+    1. job
+    1. instance
+    1. device
+1. node_filesystem_size_bytes with the following labels:
+    1. job
+    1. instance
+    1. fstype
+    1. mountpoint
+    1. device
+1. node_filesystem_avail_bytes with the following labels:
+    1. job
+    1. instance
+    1. fstype
+    1. device
+1. node_network_receive_bytes_total with the following labels:
+    1. job
+    1. instance
+    1. device
+1. node_network_transmit_bytes_total with the following labels:
+    1. job
+    1. instance
+    1. device
+
+##### Prometheus rules for the node exporter
+
+1. instance:node_cpu_utilisation:rate1m from the following metrics:
+    1. node_cpu_seconds_total with the following labels:
+        1. job
+        1. mode
+1. instance:node_num_cpu:sum from the following metrics:
+    1. node_cpu_seconds_total with the following labels:
+        1. job
+1. instance:node_load1_per_cpu:ratio from the following metrics:
+    1. node_load1 with the following labels:
+        1. job
+1. instance:node_memory_utilisation:ratio from the following metrics:
+    1. node_memory_MemTotal_bytes with the following labels:
+        1. job
+    1. node_memory_MemAvailable_bytes with the following labels:
+        1. job
+1. instance:node_vmstat_pgmajfault:rate1m from the following metrics:
+    1. node_vmstat_pgmajfault with the following labels:
+        1. job
+1. instance_device:node_disk_io_time_seconds:rate1m from the following metrics:
+    1. node_disk_io_time_seconds_total with the following labels:
+        1. job
+        1. device
+1. instance_device:node_disk_io_time_weighted_seconds:rate1m from the following metrics:
+    1. node_disk_io_time_weighted_seconds with the following labels:
+        1. job
+        1. device
+1. instance:node_network_receive_bytes_excluding_lo:rate1m from the following metrics:
+    1. node_network_receive_bytes_total with the following labels:
+        1. job
+        1. device
+1. instance:node_network_transmit_bytes_excluding_lo:rate1m from the following metrics:
+    1. node_network_transmit_bytes_total with the following labels:
+        1. job
+        1. device
+1. instance:node_network_receive_drop_excluding_lo:rate1m from the following metrics:
+    1. node_network_receive_drop_total with the following labels:
+        1. job
+        1. device
+1. instance:node_network_transmit_drop_excluding_lo:rate1m from the following metrics:
+    1. node_network_transmit_drop_total with the following labels:
+        1. job
+        1. device
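+
+To make the mapping between raw metrics and these rules concrete, the first rule above typically looks something like the sketch below in the node-exporter mixin (the exact expression and the `job` selector may differ in the deployed rule file):
+
+```yaml
+groups:
+  - name: node-exporter.rules
+    rules:
+      # fraction of time each instance's CPUs spent in non-idle modes over the last minute
+      - record: instance:node_cpu_utilisation:rate1m
+        expr: |
+          1 - avg without (cpu, mode) (
+            rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
+          )
+```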
+
+## Re-introducing dropped metrics/labels
+
+If you need to add some metrics/labels back for a particular use case, comment out every `metricRelabelings:` section (except the one from the `prometheus-operator.yaml` file), determine which metrics/labels you want to add back (i.e. by using the explorer from Grafana), and then re-edit the appropriate `metricRelabelings:` sections so that the metrics/labels in question are no longer dropped, as sketched below.
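+
+For instance, a minimal sketch of such an edit (`node_network_transmit_errs_total` is just a hypothetical metric to re-introduce, and the regex is shortened for brevity):
+
+```yaml
+metricRelabelings:
+  # keep only the whitelisted metric names; every other series is dropped
+  - action: keep
+    sourceLabels: [__name__]
+    regex: "node_(\
+      cpu_seconds_total|\
+      network_transmit_errs_total\
+      )"
+  # then drop all labels except the whitelisted ones
+  - action: labelkeep
+    regex: (__name__|instance|job|device|fstype|mountpoint|mode)
+```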
+
+## Prometheus Analysis
+
+### Go Pprof
+
+To analyse the memory allocations of prometheus, run `kubectl port-forward prometheus-prometheus-0 9090:9090`, and then run `go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap`. Once you get the interactive interpreter, you can run `top` or `dot` for a more detailed hierarchy of the memory usage.
+
+### TSDB
+
+To analyse the TSDB of prometheus, exec into the `prometheus-prometheus-0` pod, `cd` into `/tmp`, and run the following code-block:
+
+```bash
+wget https://github.com/prometheus/prometheus/releases/download/v2.28.0/prometheus-2.28.0.linux-amd64.tar.gz
+tar -xzf prometheus-*
+cd prometheus-*
+./promtool tsdb analyze /prometheus | less
+```
+
+*Useful link: https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality*
+
+Or you can go to `localhost:9090` -> `Status` -> `TSDB Status`, but it's not as complete as running the binary analysis.

From ccc80ce693f825d9a0511b73ac96ed8e6f3b00e4 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Tue, 29 Jun 2021 19:31:19 +0300
Subject: [PATCH 09/25] Fixes for node-exporter & prom monitoring

---
 manager/manifests/prometheus-monitoring.yaml.j2 | 1 -
 manager/manifests/prometheus-node-exporter.yaml | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2
index 3fc137b897..6ceff425cc 100644
--- a/manager/manifests/prometheus-monitoring.yaml.j2
+++ b/manager/manifests/prometheus-monitoring.yaml.j2
@@ -170,7 +170,6 @@ spec:
         request_duration_milliseconds_count\
         )"
       - action: labelkeep
-        sourceLabels: [__name__]
         regex: (__name__|destination_service_name|response_code)
 
 ---
diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml
index 4a5ec20cbc..6506572163 100644
--- a/manager/manifests/prometheus-node-exporter.yaml
+++ b/manager/manifests/prometheus-node-exporter.yaml
@@ -217,7 +217,7 @@ spec:
         node_vmstat_pgmajfault\
         )"
       - action: labelkeep
-         regex: (__name__|instance|job|device|fstype|mountpoint|mode)
+        regex: (__name__|instance|job|device|fstype|mountpoint|mode)
       scheme: https
       tlsConfig:
         insecureSkipVerify: true

From b382a1f6903e6bbe6e2c71a88bf77e8a5fcd5b9c Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Tue, 29 Jun 2021 20:01:30 +0300
Subject: [PATCH 10/25] Fixes to the nodes dashboard

---
 dev/prometheus.md                               | 3 +++
 manager/manifests/prometheus-node-exporter.yaml | 7 ++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/dev/prometheus.md b/dev/prometheus.md
index 7be8e0834e..a96ad04723 100644
--- a/dev/prometheus.md
+++ b/dev/prometheus.md
@@ -89,6 +89,9 @@
 1. node_load15 with the following labels:
     1. job
     1. instance
+1. node_exporter_build_info with the following labels:
+    1. job
+    1. instance
 1. node_memory_MemTotal_bytes with the following labels:
     1. job
     1. instance
diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml
index 6506572163..27fc75c3c8 100644
--- a/manager/manifests/prometheus-node-exporter.yaml
+++ b/manager/manifests/prometheus-node-exporter.yaml
@@ -199,6 +199,7 @@ spec:
         load1|\
         load5|\
         load15|\
+        exporter_build_info|\
         memory_MemTotal_bytes|\
         memory_MemFree_bytes|\
         memory_Buffers_bytes|\
@@ -207,14 +208,14 @@ spec:
         disk_read_bytes_total|\
         disk_written_bytes_total|\
         disk_io_time_seconds_total|\
-        disk_io_time_weighted_seconds|\
+        disk_io_time_weighted_seconds_total|\
         filesystem_size_bytes|\
         filesystem_avail_bytes|\
         network_receive_bytes_total|\
         network_transmit_bytes_total|\
         network_receive_drop_total|\
-        network_transmit_drop_total\
-        node_vmstat_pgmajfault\
+        network_transmit_drop_total|\
+        vmstat_pgmajfault\
         )"
       - action: labelkeep
         regex: (__name__|instance|job|device|fstype|mountpoint|mode)

From 6527b4093f3f494cfedba0bf9dd0c3b408a49140 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Wed, 30 Jun 2021 03:08:26 +0300
Subject: [PATCH 11/25] Add missing `le` label for `istio_request_duration_milliseconds_bucket` metric

---
 dev/prometheus.md                               | 1 +
 manager/manifests/prometheus-monitoring.yaml.j2 | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/prometheus.md b/dev/prometheus.md
index a96ad04723..33eb67937e 100644
--- a/dev/prometheus.md
+++ b/dev/prometheus.md
@@ -36,6 +36,7 @@
     1. response_code
 1. istio_request_duration_milliseconds_bucket with the following labels:
     1. destination_service_name
+    1. le
 1. istio_request_duration_milliseconds_sum with the following labels:
     1. destination_service_name
 1. istio_request_duration_milliseconds_count with the following labels:
     1. destination_service_name
diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2
index 6ceff425cc..6cf1b7c19b 100644
--- a/manager/manifests/prometheus-monitoring.yaml.j2
+++ b/manager/manifests/prometheus-monitoring.yaml.j2
@@ -170,7 +170,7 @@ spec:
         request_duration_milliseconds_count\
         )"
       - action: labelkeep
-        regex: (__name__|destination_service_name|response_code)
+        regex: (__name__|destination_service_name|response_code|le)
 
 ---

From 35c5bb7d3024bffb79d118f52ff3e47a8d2d4dd5 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Wed, 30 Jun 2021 16:26:31 +0300
Subject: [PATCH 12/25] Add required label for kube-state-metrics exporter

---
 dev/prometheus.md                                    | 2 ++
 manager/manifests/prometheus-kube-state-metrics.yaml | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/dev/prometheus.md b/dev/prometheus.md
index 33eb67937e..9330591c4b 100644
--- a/dev/prometheus.md
+++ b/dev/prometheus.md
@@ -56,8 +56,10 @@
 
 1. kube_pod_container_resource_requests_cpu_cores with the following labels:
     1. exported_pod
+    1. exported_container (required for not dropping the values for each container of each pod)
 1. kube_pod_container_resource_requests_memory_bytes with the following labels:
     1. exported_pod
+    1. exported_container (required for not dropping the values for each container of each pod)
 1. kube_pod_info with the following labels:
     1. exported_pod
 1. 
kube_deployment_status_replicas_available with the following labels: diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index ffa7f92b31..36e542ae22 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -281,7 +281,7 @@ spec: job_status_active\ )" - action: labelkeep - regex: (__name__|exported_pod|job_name) + regex: (__name__|exported_pod|exported_container|job_name) namespaceSelector: any: true selector: From 6ba99b9b644d3c505a13f9f0026361eee71fed19 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Wed, 30 Jun 2021 22:03:49 +0300 Subject: [PATCH 13/25] Fix batch grafana dashboard --- manager/manifests/grafana/grafana-dashboard-batch.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml index 565513d0c4..b13ad328af 100644 --- a/manager/manifests/grafana/grafana-dashboard-batch.yaml +++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml @@ -36,7 +36,8 @@ data: "editable": true, "gnetId": null, "graphTooltip": 0, - "iteration": 1617119656448, + "id": 4, + "iteration": 1625068140971, "links": [], "panels": [ { @@ -427,13 +428,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "count(kube_job_status_active{job_name=~\"$api_name.+\"})", + "expr": "count(kube_job_status_active{job_name=~\"$api_name.+\"} != 0)", "interval": "", "legendFormat": "Active Jobs", "refId": "Active Batches" }, { - "expr": "sum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "expr": "sum(kube_job_status_active{job_name=~\"$api_name.+\"})", "hide": false, "interval": "", "legendFormat": "Active Workers", From 416bed95f86b4a2979ed3ab2df0b6befcc4bac5f Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 16:35:10 +0300 Subject: [PATCH 14/25] Keep cortex_* metrics --- manager/manifests/prometheus-monitoring.yaml.j2 | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index 7b05b1cd2e..c25c084f1a 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -220,6 +220,10 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" --- @@ -270,6 +274,10 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" --- @@ -287,6 +295,10 @@ spec: scheme: http path: /metrics interval: 20s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" namespaceSelector: any: true selector: @@ -309,6 +321,10 @@ spec: scheme: http path: /metrics interval: 10s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" namespaceSelector: any: true selector: From f465bc111a66ee3661b3696d1c9754ff3c83c79c Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 17:14:25 +0300 Subject: [PATCH 15/25] Separate prometheus and operator workloads --- manager/generate_eks.py | 26 ++++++++++++++++--- manager/manifests/cluster-autoscaler.yaml.j2 | 6 ++--- manager/manifests/fluent-bit.yaml.j2 | 3 +++ manager/manifests/grafana/grafana.yaml | 6 +++++ 
manager/manifests/istio.yaml.j2 | 17 +++++++----- manager/manifests/operator.yaml.j2 | 4 +-- .../prometheus-kube-state-metrics.yaml | 6 +++++ .../manifests/prometheus-monitoring.yaml.j2 | 6 +++++ .../manifests/prometheus-node-exporter.yaml | 5 +++- manager/manifests/prometheus-operator.yaml | 5 ++++ .../manifests/prometheus-statsd-exporter.yaml | 6 +++++ pkg/crds/config/manager/manager.yaml | 8 +++--- 12 files changed, 77 insertions(+), 21 deletions(-) diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 38d513f168..1ab0384c1e 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -305,13 +305,14 @@ def generate_eks( return operator_nodegroup = default_nodegroup(cluster_config) + # TODO validate requests when clustering up operator_settings = { "ami": get_ami(ami_map, "t3.medium"), "name": "cx-operator", "instanceType": "t3.medium", - "minSize": 2, - "maxSize": 2, - "desiredCapacity": 2, + "minSize": 1, + "maxSize": 25, + "desiredCapacity": 1, "volumeType": "gp3", "volumeSize": 20, "volumeIOPS": 3000, @@ -319,6 +320,23 @@ def generate_eks( } operator_nodegroup = merge_override(operator_nodegroup, operator_settings) + prometheus_nodegroup = default_nodegroup(cluster_config) + prometheus_settings = { + "ami": get_ami(ami_map, "t3.xlarge"), + "name": "cx-prometheus", + "instanceType": "t3.xlarge", + "minSize": 1, + "maxSize": 1, + "desiredCapacity": 1, + "volumeType": "gp3", + "volumeSize": 20, + "volumeIOPS": 3000, + "volumeThroughput": 125, + "labels": {"prometheus": "true"}, + "taints": {"prometheus": "true:NoSchedule"}, + } + prometheus_nodegroup = merge_override(prometheus_nodegroup, prometheus_settings) + worker_nodegroups = get_all_worker_nodegroups(ami_map, cluster_config) nat_gateway = "Disable" @@ -337,7 +355,7 @@ def generate_eks( "tags": cluster_config["tags"], }, "vpc": {"nat": {"gateway": nat_gateway}}, - "nodeGroups": [operator_nodegroup] + worker_nodegroups, + "nodeGroups": [operator_nodegroup, prometheus_nodegroup] + worker_nodegroups, "addons": [ { "name": "vpc-cni", diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2 index e529e36918..6c9ad79205 100644 --- a/manager/manifests/cluster-autoscaler.yaml.j2 +++ b/manager/manifests/cluster-autoscaler.yaml.j2 @@ -169,11 +169,11 @@ spec: name: cluster-autoscaler resources: limits: - cpu: 100m - memory: 300Mi + cpu: 300m + memory: 1Gi requests: cpu: 100m - memory: 300Mi + memory: 200Mi command: - ./cluster-autoscaler - --v=4 diff --git a/manager/manifests/fluent-bit.yaml.j2 b/manager/manifests/fluent-bit.yaml.j2 index a860659bc7..a2e1140f2c 100644 --- a/manager/manifests/fluent-bit.yaml.j2 +++ b/manager/manifests/fluent-bit.yaml.j2 @@ -249,3 +249,6 @@ spec: - key: workload operator: Exists effect: NoSchedule + - key: prometheus + operator: Exists + effect: NoSchedule diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml index 91f10b5a13..83a5b73d32 100644 --- a/manager/manifests/grafana/grafana.yaml +++ b/manager/manifests/grafana/grafana.yaml @@ -173,6 +173,12 @@ spec: - name: grafana-dashboard-nodes configMap: name: grafana-dashboard-nodes + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule affinity: podAffinity: preferredDuringSchedulingIgnoredDuringExecution: diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index bf0a60af41..811bfbf1d2 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ 
-25,8 +25,8 @@ spec: k8s: resources: requests: - cpu: 200m # default is 500m - memory: 1.75Gi # default is 2048Mi == 2Gi + cpu: 100m # default is 500m + memory: 200Mi # default is 2048Mi == 2Gi cni: enabled: false ingressGateways: @@ -74,7 +74,7 @@ spec: cpu: 100m memory: 128Mi limits: - cpu: 2000m + cpu: 1000m memory: 1024Mi replicaCount: 1 hpaSpec: @@ -132,20 +132,23 @@ spec: targetPort: 15443 resources: requests: - cpu: 200m + cpu: 300m memory: 128Mi limits: - cpu: 2000m + cpu: 1500m memory: 1024Mi replicaCount: 1 hpaSpec: minReplicas: 1 - maxReplicas: 1 # edit autoscaleEnabled in values if increasing this + maxReplicas: 100 # edit autoscaleEnabled in values if increasing this metrics: - type: Resource resource: name: cpu - targetAverageUtilization: 80 + targetAverageUtilization: 90 + resource: + name: mem + targetAverageUtilization: 90 scaleTargetRef: apiVersion: apps/v1 kind: Deployment diff --git a/manager/manifests/operator.yaml.j2 b/manager/manifests/operator.yaml.j2 index c5501cb1d8..3ccd8eea76 100644 --- a/manager/manifests/operator.yaml.j2 +++ b/manager/manifests/operator.yaml.j2 @@ -58,10 +58,10 @@ spec: imagePullPolicy: Always resources: requests: - cpu: 200m + cpu: 100m memory: 128Mi limits: - cpu: 2000m + cpu: 1500m memory: 1024Mi ports: - containerPort: 8888 diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index 95c995ef48..ba5165ff6e 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -234,6 +234,12 @@ spec: port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule --- apiVersion: monitoring.coreos.com/v1 kind: PodMonitor diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index c25c084f1a..b0d76fbc34 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -30,6 +30,12 @@ metadata: spec: image: {{ config['image_prometheus'] }} serviceAccountName: prometheus + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule podMonitorSelector: matchExpressions: - key: "monitoring.cortex.dev" diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index b58e92a5d9..2ea1881eb7 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -153,12 +153,15 @@ spec: hostPID: true nodeSelector: kubernetes.io/os: linux + prometheus: "true" securityContext: runAsNonRoot: true runAsUser: 65534 serviceAccountName: node-exporter tolerations: - - operator: Exists + - key: prometheus + operator: Exists + effect: NoSchedule volumes: - hostPath: path: /sys diff --git a/manager/manifests/prometheus-operator.yaml b/manager/manifests/prometheus-operator.yaml index ddeff6e1c9..3b7b558318 100644 --- a/manager/manifests/prometheus-operator.yaml +++ b/manager/manifests/prometheus-operator.yaml @@ -14199,6 +14199,11 @@ spec: allowPrivilegeEscalation: false nodeSelector: kubernetes.io/os: linux + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manager/manifests/prometheus-statsd-exporter.yaml b/manager/manifests/prometheus-statsd-exporter.yaml index 
b96a700ea6..ea58db52d8 100644 --- a/manager/manifests/prometheus-statsd-exporter.yaml +++ b/manager/manifests/prometheus-statsd-exporter.yaml @@ -74,6 +74,12 @@ spec: volumeMounts: - name: statsd-mapping-config mountPath: /etc/prometheus-statsd-exporter + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule volumes: - name: statsd-mapping-config configMap: diff --git a/pkg/crds/config/manager/manager.yaml b/pkg/crds/config/manager/manager.yaml index c1a9c7d25c..75f03b40d3 100644 --- a/pkg/crds/config/manager/manager.yaml +++ b/pkg/crds/config/manager/manager.yaml @@ -46,11 +46,11 @@ spec: periodSeconds: 10 resources: limits: - cpu: 100m - memory: 30Mi + cpu: 200m + memory: 100Mi requests: - cpu: 100m - memory: 20Mi + cpu: 200m + memory: 80Mi volumeMounts: - mountPath: /mnt/cluster.yaml name: cluster-config From 4289fc6f5e660101c36877c72f90c38d8cd6da57 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 17:26:47 +0300 Subject: [PATCH 16/25] Validate operator/prometheus node group quotas --- manager/generate_eks.py | 1 - pkg/lib/aws/servicequotas.go | 4 ++-- pkg/types/clusterconfig/cluster_config.go | 16 +++++++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 1ab0384c1e..261a54d92b 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -305,7 +305,6 @@ def generate_eks( return operator_nodegroup = default_nodegroup(cluster_config) - # TODO validate requests when clustering up operator_settings = { "ami": get_ami(ami_map, "t3.medium"), "name": "cx-operator", diff --git a/pkg/lib/aws/servicequotas.go b/pkg/lib/aws/servicequotas.go index ef99e1787b..d1bc45b867 100644 --- a/pkg/lib/aws/servicequotas.go +++ b/pkg/lib/aws/servicequotas.go @@ -328,9 +328,9 @@ func requiredRulesForNodeGroupSecurityGroup(numAZs, whitelistLength int) int { } func requiredRulesForControlPlaneSecurityGroup(numNodeGroups int) int { - // +1 for the operator node group + // +2 for the operator and prometheus node groups // this is the number of outbound rules (there are half as many inbound rules, so that is not the limiting factor) - return 2 * (numNodeGroups + 1) + return 2 * (numNodeGroups + 2) } func requiredSecurityGroups(numNodeGroups int, clusterAlreadyExists bool) int { diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go index 26e5bfc90f..0a530fed0b 100644 --- a/pkg/types/clusterconfig/cluster_config.go +++ b/pkg/types/clusterconfig/cluster_config.go @@ -55,6 +55,11 @@ const ( ) var ( + _operatorNodeGroupInstanceType = "t3.medium" + _operatorNodeGroupRequiredOnDemand = int64(25) + _prometheusNodeGroupInstanceType = "t3.xlarge" + _prometheusNodeGroupRequiredOnDemand = int64(1) + _maxNodeGroupLengthWithPrefix = 32 _maxNodeGroupLength = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws- _maxInstancePools = 20 @@ -904,7 +909,16 @@ func (cc *Config) validate(awsClient *aws.Client) error { } ngNames := []string{} - instances := []aws.InstanceTypeRequests{} + instances := []aws.InstanceTypeRequests{ + { + InstanceType: _operatorNodeGroupInstanceType, + RequiredOnDemandInstances: int64(_operatorNodeGroupRequiredOnDemand), + }, + { + InstanceType: _prometheusNodeGroupInstanceType, + RequiredOnDemandInstances: int64(_prometheusNodeGroupRequiredOnDemand), + }, + } for _, nodeGroup := range cc.NodeGroups { // setting max_instances to 0 during cluster creation is not permitted (but scaling 
max_instances to 0 afterwards is allowed) if nodeGroup.MaxInstances == 0 { From a5be71fb08ed8e1c1423483d044af09c916559f8 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 18:57:48 +0300 Subject: [PATCH 17/25] Address cluster info pricing --- cli/cmd/cluster.go | 30 +++++++++------ cli/cmd/lib_cluster_config.go | 16 +++++--- manager/generate_eks.py | 1 + pkg/operator/endpoints/info.go | 69 ++++++++++++++++++++++++++++------ pkg/operator/schema/schema.go | 23 +++++++----- 5 files changed, 102 insertions(+), 37 deletions(-) diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 1616565b71..cd30b8a75a 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -770,11 +770,12 @@ func cmdInfo(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig, st infoInterface = infoResponse.ClusterConfig.Config } else { infoInterface = map[string]interface{}{ - "cluster_config": infoResponse.ClusterConfig.Config, - "cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata, - "node_infos": infoResponse.NodeInfos, - "endpoint_operator": operatorEndpoint, - "endpoint_api": apiEndpoint, + "cluster_config": infoResponse.ClusterConfig.Config, + "cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata, + "worker_node_infos": infoResponse.WorkerNodeInfos, + "operator_node_infos": infoResponse.OperatorNodeInfos, + "endpoint_operator": operatorEndpoint, + "endpoint_api": apiEndpoint, } } @@ -848,6 +849,8 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco eksPrice := aws.EKSPrices[clusterConfig.Region] operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 + prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.xlarge"].Price + prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24 nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price @@ -891,17 +894,20 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco totalNodeGroupsPrice += totalEBSPrice + totalInstancePrice } + operatorNodeGroupPrice := float64(len(infoResponse.OperatorNodeInfos)) * (operatorInstancePrice + operatorEBSPrice) + prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + var natTotalPrice float64 if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { natTotalPrice = natUnitPrice } else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway { natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones)) } - totalPrice := eksPrice + totalNodeGroupsPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + nlbPrice*2 + natTotalPrice + totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + nlbPrice*2 + natTotalPrice fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice)) - operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice - rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", 
len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)}) + rows = append(rows, []interface{}{"1 t3.xlarge instance (cortex system)", s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { @@ -919,11 +925,11 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco } func printInfoNodes(infoResponse *schema.InfoResponse) { - numAPIInstances := len(infoResponse.NodeInfos) + numAPIInstances := len(infoResponse.WorkerNodeInfos) var totalReplicas int var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool - for _, nodeInfo := range infoResponse.NodeInfos { + for _, nodeInfo := range infoResponse.WorkerNodeInfos { totalReplicas += nodeInfo.NumReplicas if nodeInfo.ComputeUserCapacity.GPU > 0 { doesClusterHaveGPUs = true @@ -946,7 +952,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { fmt.Printf(console.Bold("\nyour cluster has %d API %s running across %d %s%s\n"), totalReplicas, s.PluralS("replica", totalReplicas), numAPIInstances, s.PluralS("instance", numAPIInstances), pendingReplicasStr) - if len(infoResponse.NodeInfos) == 0 { + if len(infoResponse.WorkerNodeInfos) == 0 { return } @@ -963,7 +969,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { } var rows [][]interface{} - for _, nodeInfo := range infoResponse.NodeInfos { + for _, nodeInfo := range infoResponse.WorkerNodeInfos { lifecycle := "on-demand" if nodeInfo.IsSpot { lifecycle = "spot" diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go index 97a9b86058..39a00a642a 100644 --- a/cli/cmd/lib_cluster_config.go +++ b/cli/cmd/lib_cluster_config.go @@ -163,7 +163,9 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient *aws.Client, disallowPrompt bool) { eksPrice := aws.EKSPrices[clusterConfig.Region] operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price + prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.xlarge"].Price operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 + prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24 nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price @@ -184,9 +186,10 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)}) ngNameToSpotInstancesUsed := map[string]int{} - fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + 2*nlbPrice + natTotalPrice - totalMinPrice := fixedPrice - totalMaxPrice := fixedPrice + baseMinPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice + baseMaxPrice := eksPrice + 25*(operatorInstancePrice+operatorEBSPrice) + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice + totalMinPrice := baseMinPrice + totalMaxPrice := baseMaxPrice for _, ng := range clusterConfig.NodeGroups { 
apiInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][ng.InstanceType].Price apiEBSPrice := aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceGB * float64(ng.InstanceVolumeSize) / 30 / 24 @@ -223,8 +226,11 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{workerInstanceStr, workerPriceStr}) } - operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice - rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)}) + minOperatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice + maxOperatorNodeGroupPrice := 25 * (operatorInstancePrice + operatorEBSPrice) + prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + rows = append(rows, []interface{}{"1-25 t3.medium instances (cortex system)", fmt.Sprintf("%s - %s (depending on load)", s.DollarsAndTenthsOfCents(minOperatorNodeGroupPrice), s.DollarsAndTenthsOfCents(maxOperatorNodeGroupPrice))}) + rows = append(rows, []interface{}{"1 t3.xlarge instance (cortex system)", s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 261a54d92b..89f81b7dce 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -316,6 +316,7 @@ def generate_eks( "volumeSize": 20, "volumeIOPS": 3000, "volumeThroughput": 125, + "labels": {"operator": "true"}, } operator_nodegroup = merge_override(operator_nodegroup, operator_settings) diff --git a/pkg/operator/endpoints/info.go b/pkg/operator/endpoints/info.go index b83a6374e0..c210e74fe7 100644 --- a/pkg/operator/endpoints/info.go +++ b/pkg/operator/endpoints/info.go @@ -31,7 +31,13 @@ import ( ) func Info(w http.ResponseWriter, r *http.Request) { - nodeInfos, numPendingReplicas, err := getNodeInfos() + workerNodeInfos, numPendingReplicas, err := getWorkerNodeInfos() + if err != nil { + respondError(w, r, err) + return + } + + operatorNodeInfos, err := getOperatorNodeInfos() if err != nil { respondError(w, r, err) return @@ -44,13 +50,14 @@ func Info(w http.ResponseWriter, r *http.Request) { response := schema.InfoResponse{ ClusterConfig: fullClusterConfig, - NodeInfos: nodeInfos, + WorkerNodeInfos: workerNodeInfos, + OperatorNodeInfos: operatorNodeInfos, NumPendingReplicas: numPendingReplicas, } respondJSON(w, r, response) } -func getNodeInfos() ([]schema.NodeInfo, int, error) { +func getWorkerNodeInfos() ([]schema.WorkerNodeInfo, int, error) { pods, err := config.K8sAllNamspaces.ListPods(nil) if err != nil { return nil, 0, err @@ -61,8 +68,8 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { return nil, 0, err } - nodeInfoMap := make(map[string]*schema.NodeInfo, len(nodes)) // node name -> info - spotPriceCache := make(map[string]float64) // instance type -> spot price + nodeInfoMap := make(map[string]*schema.WorkerNodeInfo, len(nodes)) // node name -> info + spotPriceCache := make(map[string]float64) // instance type -> spot price for i := range nodes { node := nodes[i] @@ -86,12 +93,14 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { } } - nodeInfoMap[node.Name] = &schema.NodeInfo{ + nodeInfoMap[node.Name] = &schema.WorkerNodeInfo{ + NodeInfo: schema.NodeInfo{ + NodeGroupName: nodeGroupName, + InstanceType: instanceType, + IsSpot: isSpot, + 
Price: price, + }, Name: node.Name, - NodeGroupName: nodeGroupName, - InstanceType: instanceType, - IsSpot: isSpot, - Price: price, NumReplicas: 0, // will be added to below ComputeUserCapacity: nodeComputeAllocatable(&node), // will be subtracted from below ComputeAvailable: nodeComputeAllocatable(&node), // will be subtracted from below @@ -160,7 +169,7 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { sort.Strings(nodeNames) - nodeInfos := make([]schema.NodeInfo, len(nodeNames)) + nodeInfos := make([]schema.WorkerNodeInfo, len(nodeNames)) for i, nodeName := range nodeNames { nodeInfos[i] = *nodeInfoMap[nodeName] } @@ -179,3 +188,41 @@ func nodeComputeAllocatable(node *kcore.Node) userconfig.Compute { Inf: infQty.Value(), } } + +func getOperatorNodeInfos() ([]schema.NodeInfo, error) { + nodes, err := config.K8sAllNamspaces.ListNodesByLabel("operator", "true") + if err != nil { + return nil, err + } + + nodeInfoMap := make(map[string]*schema.NodeInfo, len(nodes)) // node name -> info + + for i := range nodes { + node := nodes[i] + + instanceType := node.Labels["beta.kubernetes.io/instance-type"] + nodeGroupName := node.Labels["alpha.eksctl.io/nodegroup-name"] + + price := aws.InstanceMetadatas[config.ClusterConfig.Region][instanceType].Price + + nodeInfoMap[node.Name] = &schema.NodeInfo{ + NodeGroupName: nodeGroupName, + InstanceType: instanceType, + Price: price, + } + } + + nodeNames := make([]string, 0, len(nodeInfoMap)) + for nodeName := range nodeInfoMap { + nodeNames = append(nodeNames, nodeName) + } + + sort.Strings(nodeNames) + + nodeInfos := make([]schema.NodeInfo, len(nodeNames)) + for i, nodeName := range nodeNames { + nodeInfos[i] = *nodeInfoMap[nodeName] + } + + return nodeInfos, nil +} diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 18988fa277..d86073e4e7 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -26,16 +26,14 @@ import ( type InfoResponse struct { ClusterConfig clusterconfig.InternalConfig `json:"cluster_config" yaml:"cluster_config"` - NodeInfos []NodeInfo `json:"node_infos" yaml:"node_infos"` + WorkerNodeInfos []WorkerNodeInfo `json:"worker_node_infos" yaml:"worker_node_infos"` + OperatorNodeInfos []NodeInfo `json:"operator_node_infos" yaml:"operator_node_infos"` NumPendingReplicas int `json:"num_pending_replicas" yaml:"num_pending_replicas"` } -type NodeInfo struct { +type WorkerNodeInfo struct { + NodeInfo Name string `json:"name" yaml:"name"` - NodeGroupName string `json:"nodegroup_name" yaml:"nodegroup_name"` - InstanceType string `json:"instance_type" yaml:"instance_type"` - IsSpot bool `json:"is_spot" yaml:"is_spot"` - Price float64 `json:"price" yaml:"price"` NumReplicas int `json:"num_replicas" yaml:"num_replicas"` NumAsyncGatewayReplicas int `json:"num_async_gateway_replicas" yaml:"num_async_gateway_replicas"` NumEnqueuerReplicas int `json:"num_enqueuer_replicas" yaml:"num_enqueuer_replicas"` @@ -44,6 +42,13 @@ type NodeInfo struct { ComputeUserRequested userconfig.Compute `json:"compute_user_requested" yaml:"compute_user_requested"` // total resources requested by user on a node } +type NodeInfo struct { + NodeGroupName string `json:"nodegroup_name" yaml:"nodegroup_name"` + InstanceType string `json:"instance_type" yaml:"instance_type"` + IsSpot bool `json:"is_spot" yaml:"is_spot"` + Price float64 `json:"price" yaml:"price"` +} + type DeployResult struct { API *APIResponse `json:"api"` Message string `json:"message"` @@ -97,9 +102,9 @@ type APIVersion struct { type 
VerifyCortexResponse struct{} -func (ir InfoResponse) GetNodesWithNodeGroupName(ngName string) []NodeInfo { - nodesInfo := []NodeInfo{} - for _, nodeInfo := range ir.NodeInfos { +func (ir InfoResponse) GetNodesWithNodeGroupName(ngName string) []WorkerNodeInfo { + nodesInfo := []WorkerNodeInfo{} + for _, nodeInfo := range ir.WorkerNodeInfos { if nodeInfo.NodeGroupName == ngName { nodesInfo = append(nodesInfo, nodeInfo) } From d0b22fc9c4b2c6790f98161c9a2accd8eb20cf0f Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 19:07:12 +0300 Subject: [PATCH 18/25] Let the node exporter run on every node --- manager/manifests/prometheus-node-exporter.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index 2ea1881eb7..b58e92a5d9 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -153,15 +153,12 @@ spec: hostPID: true nodeSelector: kubernetes.io/os: linux - prometheus: "true" securityContext: runAsNonRoot: true runAsUser: 65534 serviceAccountName: node-exporter tolerations: - - key: prometheus - operator: Exists - effect: NoSchedule + - operator: Exists volumes: - hostPath: path: /sys From dc79cbfd1860f96416540bcc7ec1a18a1e701c17 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 19:30:28 +0300 Subject: [PATCH 19/25] Fix istio hpa --- manager/manifests/istio.yaml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index 811bfbf1d2..629f49bb12 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -146,6 +146,7 @@ spec: resource: name: cpu targetAverageUtilization: 90 + - type: Resource resource: name: mem targetAverageUtilization: 90 From d2c26e59ec67a869b82e812fec0993d13e58ec63 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 21:07:16 +0300 Subject: [PATCH 20/25] Change resource requests/limits --- manager/manifests/inferentia.yaml | 2 +- manager/manifests/prometheus-node-exporter.yaml | 2 +- pkg/crds/config/manager/manager.yaml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/manager/manifests/inferentia.yaml b/manager/manifests/inferentia.yaml index eb6a7db974..51331f6715 100644 --- a/manager/manifests/inferentia.yaml +++ b/manager/manifests/inferentia.yaml @@ -140,7 +140,7 @@ spec: value: "12345" resources: requests: - cpu: 100m + cpu: 50m memory: 100Mi --- diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index b58e92a5d9..9c3a483f81 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -115,7 +115,7 @@ spec: cpu: 250m memory: 180Mi requests: - cpu: 100m + cpu: 40m memory: 180Mi volumeMounts: - mountPath: /host/sys diff --git a/pkg/crds/config/manager/manager.yaml b/pkg/crds/config/manager/manager.yaml index 75f03b40d3..48fb8ad0e1 100644 --- a/pkg/crds/config/manager/manager.yaml +++ b/pkg/crds/config/manager/manager.yaml @@ -46,10 +46,10 @@ spec: periodSeconds: 10 resources: limits: - cpu: 200m + cpu: 300m memory: 100Mi requests: - cpu: 200m + cpu: 100m memory: 80Mi volumeMounts: - mountPath: /mnt/cluster.yaml From 25f51668dcdf532a007d270585367c54aa648c63 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 22:52:36 +0300 Subject: [PATCH 21/25] Have the prometheus instance type 
 configurable

---
 cli/cmd/cluster.go                        | 4 +-
 cli/cmd/lib_cluster_config.go             | 4 +-
 docs/clusters/management/create.md        | 6 ++-
 manager/generate_eks.py                   | 5 ++-
 pkg/lib/aws/ec2.go                        | 40 ++++++++++++++++++
 pkg/types/clusterconfig/cluster_config.go | 49 +++++++++++++++++++----
 pkg/types/clusterconfig/config_key.go     | 1 +
 pkg/types/clusterconfig/errors.go         | 16 ++++++++
 8 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index cd30b8a75a..7d87bf7d3f 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -849,7 +849,7 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
 	eksPrice := aws.EKSPrices[clusterConfig.Region]
 	operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
 	operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
-	prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.xlarge"].Price
+	prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
 	prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
 	metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
 	nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
@@ -907,7 +907,7 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
 	fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))
 
 	rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
-	rows = append(rows, []interface{}{"1 t3.xlarge instance (cortex system)", s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
+	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (cortex system)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
 	rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"})
 
 	if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go
index 39a00a642a..5579e78b35 100644
--- a/cli/cmd/lib_cluster_config.go
+++ b/cli/cmd/lib_cluster_config.go
@@ -163,7 +163,7 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste
 func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient *aws.Client, disallowPrompt bool) {
 	eksPrice := aws.EKSPrices[clusterConfig.Region]
 	operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
-	prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.xlarge"].Price
+	prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price
 	operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
 	prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24
 	metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24
@@ -230,7 +230,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
 	maxOperatorNodeGroupPrice := 25 * (operatorInstancePrice + operatorEBSPrice)
 	prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
[]interface{}{"1-25 t3.medium instances (cortex system)", fmt.Sprintf("%s - %s (depending on load)", s.DollarsAndTenthsOfCents(minOperatorNodeGroupPrice), s.DollarsAndTenthsOfCents(maxOperatorNodeGroupPrice))}) - rows = append(rows, []interface{}{"1 t3.xlarge instance (cortex system)", s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (cortex system)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { diff --git a/docs/clusters/management/create.md b/docs/clusters/management/create.md index a3abecdd85..8bed03e645 100644 --- a/docs/clusters/management/create.md +++ b/docs/clusters/management/create.md @@ -86,11 +86,15 @@ tags: # : map of key/value pairs # SSL certificate ARN (only necessary when using a custom domain) ssl_certificate_arn: -# List of IAM policies to attach to your Cortex APIs +# list of IAM policies to attach to your Cortex APIs iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"] # primary CIDR block for the cluster's VPC vpc_cidr: 192.168.0.0/16 + +# instance type for prometheus +# use a bigger instance if the cluster is expected to grow past the thousand node (or api replica) threshold +prometheus_instance_type: "t3.medium" ``` The docker images used by the cluster can also be overridden. They can be configured by adding any of these keys to your cluster configuration file (default values are shown): diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 89f81b7dce..aa22c9a9a9 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -273,6 +273,7 @@ def generate_eks( cluster_config = yaml.safe_load(cluster_config_file) region = cluster_config["region"] name = cluster_config["cluster_name"] + prometheus_instance_type = cluster_config["prometheus_instance_type"] ami_map = json.load(ami_json_file)[K8S_VERSION][region] eks = { @@ -322,9 +323,9 @@ def generate_eks( prometheus_nodegroup = default_nodegroup(cluster_config) prometheus_settings = { - "ami": get_ami(ami_map, "t3.xlarge"), + "ami": get_ami(ami_map, prometheus_instance_type), "name": "cx-prometheus", - "instanceType": "t3.xlarge", + "instanceType": prometheus_instance_type, "minSize": 1, "maxSize": 1, "desiredCapacity": 1, diff --git a/pkg/lib/aws/ec2.go b/pkg/lib/aws/ec2.go index 338a719c91..66b2f94fb6 100644 --- a/pkg/lib/aws/ec2.go +++ b/pkg/lib/aws/ec2.go @@ -137,6 +137,46 @@ func IsAMDGPUInstance(instanceType string) (bool, error) { return false, nil } +func IsNvidiaGPUInstance(instanceType string) (bool, error) { + parsedType, err := ParseInstanceType(instanceType) + if err != nil { + return false, err + } + + if !_gpuInstanceFamilies.Has(parsedType.Family) { + return false, nil + } + + if !parsedType.Capabilities.Has("a") { + return true, nil + } + + return false, nil +} + +func IsGPUInstance(instanceType string) (bool, error) { + isAMDGPU, err := IsAMDGPUInstance(instanceType) + if err != nil { + return false, err + } + + isNvidiaGPU, err := IsNvidiaGPUInstance(instanceType) + if err != nil { + return false, err + } + + return isAMDGPU || isNvidiaGPU, nil +} + +func IsInferentiaInstance(instanceType string) (bool, error) { + parsedType, err := ParseInstanceType(instanceType) + if err != nil { + return false, err + } + + return parsedType.Family == "inf", nil +} + func IsMacInstance(instanceType 
 func IsMacInstance(instanceType string) (bool, error) {
 	parsedType, err := ParseInstanceType(instanceType)
 	if err != nil {
diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go
index 0a530fed0b..a0b5d646c3 100644
--- a/pkg/types/clusterconfig/cluster_config.go
+++ b/pkg/types/clusterconfig/cluster_config.go
@@ -55,10 +55,8 @@ const (
 )
 
 var (
-	_operatorNodeGroupInstanceType       = "t3.medium"
-	_operatorNodeGroupRequiredOnDemand   = int64(25)
-	_prometheusNodeGroupInstanceType     = "t3.xlarge"
-	_prometheusNodeGroupRequiredOnDemand = int64(1)
+	_operatorNodeGroupInstanceType     = "t3.medium"
+	_operatorNodeGroupRequiredOnDemand = int64(25)
 
 	_maxNodeGroupLengthWithPrefix = 32
 	_maxNodeGroupLength           = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws-
@@ -88,8 +86,9 @@ type CoreConfig struct {
 	IstioNamespace string `json:"istio_namespace" yaml:"istio_namespace"`
 
 	// User-specifiable fields
-	ClusterName string `json:"cluster_name" yaml:"cluster_name"`
-	Region      string `json:"region" yaml:"region"`
+	ClusterName            string `json:"cluster_name" yaml:"cluster_name"`
+	Region                 string `json:"region" yaml:"region"`
+	PrometheusInstanceType string `json:"prometheus_instance_type" yaml:"prometheus_instance_type"`
 
 	// User-specifiable fields
 	ImageOperator string `json:"image_operator" yaml:"image_operator"`
@@ -336,6 +335,14 @@ var CoreConfigStructFieldValidations = []*cr.StructFieldValidation{
 			Validator: RegionValidator,
 		},
 	},
+	{
+		StructField: "PrometheusInstanceType",
+		StringValidation: &cr.StringValidation{
+			MinLength: 1,
+			Default:   "t3.medium",
+			Validator: validatePrometheusInstanceType,
+		},
+	},
 	{
 		StructField: "Telemetry",
 		BoolValidation: &cr.BoolValidation{
@@ -915,8 +922,8 @@ func (cc *Config) validate(awsClient *aws.Client) error {
 			RequiredOnDemandInstances: int64(_operatorNodeGroupRequiredOnDemand),
 		},
 		{
-			InstanceType:              _prometheusNodeGroupInstanceType,
-			RequiredOnDemandInstances: int64(_prometheusNodeGroupRequiredOnDemand),
+			InstanceType:              cc.PrometheusInstanceType,
+			RequiredOnDemandInstances: 1,
 		},
 	}
 	for _, nodeGroup := range cc.NodeGroups {
@@ -1486,6 +1493,31 @@ func validateInstanceType(instanceType string) (string, error) {
 	return instanceType, nil
 }
 
+func validatePrometheusInstanceType(instanceType string) (string, error) {
+	_, err := validateInstanceType(instanceType)
+	if err != nil {
+		return "", err
+	}
+
+	isNvidiaGPU, err := aws.IsNvidiaGPUInstance(instanceType)
+	if err != nil {
+		return "", err
+	}
+	if isNvidiaGPU {
+		return "", ErrorNvidiaGPUInstancesNotSupported(instanceType)
+	}
+
+	isInf, err := aws.IsInferentiaInstance(instanceType)
+	if err != nil {
+		return "", err
+	}
+	if isInf {
+		return "", ErrorInferentiaInstancesNotSupported(instanceType)
+	}
+
+	return instanceType, nil
+}
+
 func validateInstanceDistribution(instances []string) ([]string, error) {
 	for _, instance := range instances {
 		_, err := validateInstanceType(instance)
@@ -1627,6 +1659,7 @@ func (cc *CoreConfig) TelemetryEvent() map[string]interface{} {
 	}
 
 	event["region"] = cc.Region
+	event["prometheus_instance_type"] = cc.PrometheusInstanceType
 
 	if !strings.HasPrefix(cc.ImageOperator, "cortexlabs/") {
 		event["image_operator._is_custom"] = true
diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go
index 19df34d3f2..2495cbdba1 100644
--- a/pkg/types/clusterconfig/config_key.go
+++ b/pkg/types/clusterconfig/config_key.go
@@ -22,6 +22,7 @@ const (
 	ClusterNameKey            = "cluster_name"
 	RegionKey                 = "region"
+	PrometheusInstanceTypeKey = "prometheus_instance_type"
"node_groups" InstanceTypeKey = "instance_type" AcceleratorTypeKey = "accelerator_type" diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index b5126d94bc..45cb0d7632 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -46,6 +46,8 @@ const ( ErrSpotPriceGreaterThanMaxPrice = "clusterconfig.spot_price_greater_than_max_price" ErrInstanceTypeNotSupportedByCortex = "clusterconfig.instance_type_not_supported_by_cortex" ErrAMDGPUInstancesNotSupported = "clusterconfig.amd_gpu_instances_not_supported" + ErrNvidiaGPUInstancesNotSupported = "clusterconfig.nvidia_gpu_instance_not_supported" + ErrInferentiaInstancesNotSupported = "clusterconfig.inferentia_instances_not_supported" ErrMacInstancesNotSupported = "clusterconfig.mac_instances_not_supported" ErrAtLeastOneInstanceDistribution = "clusterconfig.at_least_one_instance_distribution" ErrNoCompatibleSpotInstanceFound = "clusterconfig.no_compatible_spot_instance_found" @@ -202,6 +204,20 @@ func ErrorAMDGPUInstancesNotSupported(instanceType string) error { }) } +func ErrorNvidiaGPUInstancesNotSupported(instanceType string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrNvidiaGPUInstancesNotSupported, + Message: fmt.Sprintf("Nvidia GPU instances (including %s) are not supported by cortex", instanceType), + }) +} + +func ErrorInferentiaInstancesNotSupported(instanceType string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrInferentiaInstancesNotSupported, + Message: fmt.Sprintf("Inferentia instances (including %s) are not supported by cortex", instanceType), + }) +} + func ErrorMacInstancesNotSupported(instanceType string) error { return errors.WithStack(&errors.Error{ Kind: ErrMacInstancesNotSupported, From 21a5d9ccf0279b8dd2bce1eaab140690dba4a608 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Fri, 2 Jul 2021 23:00:20 +0300 Subject: [PATCH 22/25] Nits --- docs/clusters/management/create.md | 2 +- pkg/types/clusterconfig/cluster_config.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/clusters/management/create.md b/docs/clusters/management/create.md index 8bed03e645..c63332b302 100644 --- a/docs/clusters/management/create.md +++ b/docs/clusters/management/create.md @@ -93,7 +93,7 @@ iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"] vpc_cidr: 192.168.0.0/16 # instance type for prometheus -# use a bigger instance if the cluster is expected to grow past the thousand node (or api replica) threshold +# use a bigger instance if the cluster is expected to grow past the thousand node (or api replica) mark prometheus_instance_type: "t3.medium" ``` diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go index a0b5d646c3..c4647e8559 100644 --- a/pkg/types/clusterconfig/cluster_config.go +++ b/pkg/types/clusterconfig/cluster_config.go @@ -55,8 +55,8 @@ const ( ) var ( - _operatorNodeGroupInstanceType = "t3.medium" - _operatorNodeGroupRequiredOnDemand = int64(25) + _operatorNodeGroupInstanceType = "t3.medium" + _operatorNodeGroupMaxRequiredOnDemand = int64(25) _maxNodeGroupLengthWithPrefix = 32 _maxNodeGroupLength = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws- @@ -919,7 +919,7 @@ func (cc *Config) validate(awsClient *aws.Client) error { instances := []aws.InstanceTypeRequests{ { InstanceType: _operatorNodeGroupInstanceType, - RequiredOnDemandInstances: int64(_operatorNodeGroupRequiredOnDemand), + RequiredOnDemandInstances: 
+			RequiredOnDemandInstances: int64(_operatorNodeGroupMaxRequiredOnDemand),
 		},
 		{
 			InstanceType: cc.PrometheusInstanceType,

From 65781cb386bb37b43a8743ba7c622af74d44cbf2 Mon Sep 17 00:00:00 2001
From: David Eliahu
Date: Fri, 2 Jul 2021 13:30:56 -0700
Subject: [PATCH 23/25] Update create.md

---
 docs/clusters/management/create.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/clusters/management/create.md b/docs/clusters/management/create.md
index c63332b302..b65e2dfa84 100644
--- a/docs/clusters/management/create.md
+++ b/docs/clusters/management/create.md
@@ -92,8 +92,7 @@ iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"]
 # primary CIDR block for the cluster's VPC
 vpc_cidr: 192.168.0.0/16
 
-# instance type for prometheus
-# use a bigger instance if the cluster is expected to grow past the thousand node (or api replica) mark
+# instance type for prometheus (use a larger instance for clusters exceeding 500 nodes)
 prometheus_instance_type: "t3.medium"
 ```

From dcff049fb0c6ecb8131f7f62dca245f220d6a294 Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Sat, 3 Jul 2021 00:31:43 +0300
Subject: [PATCH 24/25] Address some PR comments

---
 cli/cmd/cluster.go                    | 2 +-
 cli/cmd/lib_cluster_config.go         | 2 +-
 manager/manifests/event-exporter.yaml | 4 ++++
 manager/manifests/istio.yaml.j2       | 2 +-
 manager/manifests/metrics-server.yaml | 7 +++++--
 5 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index 7d87bf7d3f..375f1ca0d8 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -907,7 +907,7 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
 	fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))
 
 	rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)})
-	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (cortex system)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
+	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
 	rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"})
 
 	if clusterConfig.NATGateway == clusterconfig.SingleNATGateway {
diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go
index f2e47e71e6..cba8007dd2 100644
--- a/cli/cmd/lib_cluster_config.go
+++ b/cli/cmd/lib_cluster_config.go
@@ -230,7 +230,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient
 	maxOperatorNodeGroupPrice := 25 * (operatorInstancePrice + operatorEBSPrice)
 	prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice
 	rows = append(rows, []interface{}{"1-25 t3.medium instances (cortex system)", fmt.Sprintf("%s - %s (depending on load)", s.DollarsAndTenthsOfCents(minOperatorNodeGroupPrice), s.DollarsAndTenthsOfCents(maxOperatorNodeGroupPrice))})
-	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (cortex system)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
+	rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)})
[]interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { diff --git a/manager/manifests/event-exporter.yaml b/manager/manifests/event-exporter.yaml index ae0957d9aa..ab4847c4a7 100644 --- a/manager/manifests/event-exporter.yaml +++ b/manager/manifests/event-exporter.yaml @@ -82,6 +82,10 @@ spec: volumeMounts: - mountPath: /data name: event-exporter-config + resources: + requests: + cpu: 20m + memory: 50Mi volumes: - name: event-exporter-config configMap: diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index 7f01fb2529..0355896a89 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -128,7 +128,7 @@ spec: targetPort: 15443 resources: requests: - cpu: 300m + cpu: 512m memory: 128Mi limits: cpu: 1500m diff --git a/manager/manifests/metrics-server.yaml b/manager/manifests/metrics-server.yaml index bdaad365d9..6e4e0de8bb 100644 --- a/manager/manifests/metrics-server.yaml +++ b/manager/manifests/metrics-server.yaml @@ -174,8 +174,11 @@ spec: periodSeconds: 10 resources: requests: - cpu: 100m - memory: 200Mi + cpu: 50m + memory: 100Mi + limits: + cpu: 200m + memory: 500Mi securityContext: readOnlyRootFilesystem: true runAsNonRoot: true From f64d0facf2f67a8ed52f1f4b4414facdc22f74ce Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 3 Jul 2021 01:24:12 +0300 Subject: [PATCH 25/25] Address PR comments --- cli/cmd/lib_cluster_config.go | 12 +++++------- manager/manifests/istio.yaml.j2 | 2 +- pkg/types/clusterconfig/cluster_config.go | 11 +++++------ pkg/types/clusterconfig/errors.go | 10 +++++----- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go index cba8007dd2..1da6de2f7a 100644 --- a/cli/cmd/lib_cluster_config.go +++ b/cli/cmd/lib_cluster_config.go @@ -186,10 +186,9 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)}) ngNameToSpotInstancesUsed := map[string]int{} - baseMinPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice - baseMaxPrice := eksPrice + 25*(operatorInstancePrice+operatorEBSPrice) + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice - totalMinPrice := baseMinPrice - totalMaxPrice := baseMaxPrice + fixedPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice + totalMinPrice := fixedPrice + totalMaxPrice := fixedPrice for _, ng := range clusterConfig.NodeGroups { apiInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][ng.InstanceType].Price apiEBSPrice := aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceGB * float64(ng.InstanceVolumeSize) / 30 / 24 @@ -226,10 +225,9 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{workerInstanceStr, workerPriceStr}) } - minOperatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice - maxOperatorNodeGroupPrice := 25 * (operatorInstancePrice + operatorEBSPrice) + operatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice - rows = append(rows, 
[]interface{}{"1-25 t3.medium instances (cortex system)", fmt.Sprintf("%s - %s (depending on load)", s.DollarsAndTenthsOfCents(minOperatorNodeGroupPrice), s.DollarsAndTenthsOfCents(maxOperatorNodeGroupPrice))}) + rows = append(rows, []interface{}{"1 t3.medium instance (cortex system)", s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)}) rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index 0355896a89..5b4316ca8e 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -163,7 +163,7 @@ spec: gateways: istio-ingressgateway: runAsRoot: true - autoscaleEnabled: false + autoscaleEnabled: true secretVolumes: - name: customgateway-certs secretName: istio-customgateway-certs diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go index ca850ed6f7..93beb9a32c 100644 --- a/pkg/types/clusterconfig/cluster_config.go +++ b/pkg/types/clusterconfig/cluster_config.go @@ -56,8 +56,7 @@ const ( ) var ( - _operatorNodeGroupInstanceType = "t3.medium" - _operatorNodeGroupMaxRequiredOnDemand = int64(25) + _operatorNodeGroupInstanceType = "t3.medium" _maxNodeGroupLengthWithPrefix = 32 _maxNodeGroupLength = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws- @@ -921,7 +920,7 @@ func (cc *Config) validate(awsClient *aws.Client) error { instances := []aws.InstanceTypeRequests{ { InstanceType: _operatorNodeGroupInstanceType, - RequiredOnDemandInstances: int64(_operatorNodeGroupMaxRequiredOnDemand), + RequiredOnDemandInstances: 1, }, { InstanceType: cc.PrometheusInstanceType, @@ -1518,12 +1517,12 @@ func validatePrometheusInstanceType(instanceType string) (string, error) { return "", err } - isNvidiaGPU, err := aws.IsNvidiaGPUInstance(instanceType) + isGPU, err := aws.IsGPUInstance(instanceType) if err != nil { return "", err } - if isNvidiaGPU { - return "", ErrorNvidiaGPUInstancesNotSupported(instanceType) + if isGPU { + return "", ErrorGPUInstancesNotSupported(instanceType) } isInf, err := aws.IsInferentiaInstance(instanceType) diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index cbae233b8f..56d1be5115 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -46,7 +46,7 @@ const ( ErrSpotPriceGreaterThanMaxPrice = "clusterconfig.spot_price_greater_than_max_price" ErrInstanceTypeNotSupportedByCortex = "clusterconfig.instance_type_not_supported_by_cortex" ErrAMDGPUInstancesNotSupported = "clusterconfig.amd_gpu_instances_not_supported" - ErrNvidiaGPUInstancesNotSupported = "clusterconfig.nvidia_gpu_instance_not_supported" + ErrGPUInstancesNotSupported = "clusterconfig.gpu_instance_not_supported" ErrInferentiaInstancesNotSupported = "clusterconfig.inferentia_instances_not_supported" ErrMacInstancesNotSupported = "clusterconfig.mac_instances_not_supported" ErrAtLeastOneInstanceDistribution = "clusterconfig.at_least_one_instance_distribution" @@ -204,17 +204,17 @@ func ErrorAMDGPUInstancesNotSupported(instanceType string) error { }) } -func ErrorNvidiaGPUInstancesNotSupported(instanceType string) error { +func ErrorGPUInstancesNotSupported(instanceType string) error { return errors.WithStack(&errors.Error{ - Kind: ErrNvidiaGPUInstancesNotSupported, - Message: 
fmt.Sprintf("Nvidia GPU instances (including %s) are not supported by cortex", instanceType), + Kind: ErrGPUInstancesNotSupported, + Message: fmt.Sprintf("GPU instances (including %s) are not supported", instanceType), }) } func ErrorInferentiaInstancesNotSupported(instanceType string) error { return errors.WithStack(&errors.Error{ Kind: ErrInferentiaInstancesNotSupported, - Message: fmt.Sprintf("Inferentia instances (including %s) are not supported by cortex", instanceType), + Message: fmt.Sprintf("Inferentia instances (including %s) are not supported", instanceType), }) }