diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 1616565b71..375f1ca0d8 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -770,11 +770,12 @@ func cmdInfo(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig, st infoInterface = infoResponse.ClusterConfig.Config } else { infoInterface = map[string]interface{}{ - "cluster_config": infoResponse.ClusterConfig.Config, - "cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata, - "node_infos": infoResponse.NodeInfos, - "endpoint_operator": operatorEndpoint, - "endpoint_api": apiEndpoint, + "cluster_config": infoResponse.ClusterConfig.Config, + "cluster_metadata": infoResponse.ClusterConfig.OperatorMetadata, + "worker_node_infos": infoResponse.WorkerNodeInfos, + "operator_node_infos": infoResponse.OperatorNodeInfos, + "endpoint_operator": operatorEndpoint, + "endpoint_api": apiEndpoint, } } @@ -848,6 +849,8 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco eksPrice := aws.EKSPrices[clusterConfig.Region] operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 + prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price + prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24 nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price @@ -891,17 +894,20 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco totalNodeGroupsPrice += totalEBSPrice + totalInstancePrice } + operatorNodeGroupPrice := float64(len(infoResponse.OperatorNodeInfos)) * (operatorInstancePrice + operatorEBSPrice) + prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + var natTotalPrice float64 if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { natTotalPrice = natUnitPrice } else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway { natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones)) } - totalPrice := eksPrice + totalNodeGroupsPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + nlbPrice*2 + natTotalPrice + totalPrice := eksPrice + totalNodeGroupsPrice + operatorNodeGroupPrice + prometheusNodeGroupPrice + nlbPrice*2 + natTotalPrice fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice)) - operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice - rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("%d t3.medium %s (cortex system)", len(infoResponse.OperatorNodeInfos), s.PluralS("instance", len(infoResponse.OperatorNodeInfos))), s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance (prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice*2) + " total"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { @@ -919,11 +925,11 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco } func 
printInfoNodes(infoResponse *schema.InfoResponse) { - numAPIInstances := len(infoResponse.NodeInfos) + numAPIInstances := len(infoResponse.WorkerNodeInfos) var totalReplicas int var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool - for _, nodeInfo := range infoResponse.NodeInfos { + for _, nodeInfo := range infoResponse.WorkerNodeInfos { totalReplicas += nodeInfo.NumReplicas if nodeInfo.ComputeUserCapacity.GPU > 0 { doesClusterHaveGPUs = true @@ -946,7 +952,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { fmt.Printf(console.Bold("\nyour cluster has %d API %s running across %d %s%s\n"), totalReplicas, s.PluralS("replica", totalReplicas), numAPIInstances, s.PluralS("instance", numAPIInstances), pendingReplicasStr) - if len(infoResponse.NodeInfos) == 0 { + if len(infoResponse.WorkerNodeInfos) == 0 { return } @@ -963,7 +969,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) { } var rows [][]interface{} - for _, nodeInfo := range infoResponse.NodeInfos { + for _, nodeInfo := range infoResponse.WorkerNodeInfos { lifecycle := "on-demand" if nodeInfo.IsSpot { lifecycle = "spot" diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go index edd6cd7732..1da6de2f7a 100644 --- a/cli/cmd/lib_cluster_config.go +++ b/cli/cmd/lib_cluster_config.go @@ -163,7 +163,9 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient *aws.Client, disallowPrompt bool) { eksPrice := aws.EKSPrices[clusterConfig.Region] operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price + prometheusInstancePrice := aws.InstanceMetadatas[clusterConfig.Region][clusterConfig.PrometheusInstanceType].Price operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 + prometheusEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp3"].PriceGB * 20 / 30 / 24 metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * (40 + 2) / 30 / 24 nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price @@ -184,7 +186,7 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)}) ngNameToSpotInstancesUsed := map[string]int{} - fixedPrice := eksPrice + 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice + 2*nlbPrice + natTotalPrice + fixedPrice := eksPrice + operatorInstancePrice + operatorEBSPrice + prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + 2*nlbPrice + natTotalPrice totalMinPrice := fixedPrice totalMaxPrice := fixedPrice for _, ng := range clusterConfig.NodeGroups { @@ -223,8 +225,10 @@ func confirmInstallClusterConfig(clusterConfig *clusterconfig.Config, awsClient rows = append(rows, []interface{}{workerInstanceStr, workerPriceStr}) } - operatorPrice := 2*(operatorInstancePrice+operatorEBSPrice) + metricsEBSPrice - rows = append(rows, []interface{}{"2 t3.medium instances (cortex system)", s.DollarsAndTenthsOfCents(operatorPrice)}) + operatorNodeGroupPrice := operatorInstancePrice + operatorEBSPrice + prometheusNodeGroupPrice := prometheusInstancePrice + prometheusEBSPrice + metricsEBSPrice + rows = append(rows, []interface{}{"1 t3.medium instance (cortex system)", s.DollarsAndTenthsOfCents(operatorNodeGroupPrice)}) + rows = append(rows, []interface{}{fmt.Sprintf("1 %s instance 
(prometheus)", clusterConfig.PrometheusInstanceType), s.DollarsAndTenthsOfCents(prometheusNodeGroupPrice)}) rows = append(rows, []interface{}{"2 network load balancers", s.DollarsMaxPrecision(nlbPrice) + " each"}) if clusterConfig.NATGateway == clusterconfig.SingleNATGateway { diff --git a/dev/prometheus.md b/dev/prometheus.md new file mode 100644 index 0000000000..1cc801c37f --- /dev/null +++ b/dev/prometheus.md @@ -0,0 +1,211 @@ +# Metrics + +## Updating metrics + +When new metrics/labels/exporters are added to be scraped by prometheus, make sure the following list **is updated** as well to keep track of what metrics/labels are needed or not. + +The following is a list of metrics that are currently in use. + +#### Cortex metrics + +1. cortex_in_flight_requests with the following labels: + 1. api_name +1. cortex_async_request_count with the following labels: + 1. api_name + 1. api_kind + 1. status_code +1. cortex_async_queue_length with the following labels: + 1. api_name + 1. api_kind +1. cortex_async_latency_bucket with the following labels: + 1. api_name + 1. api_kind +1. cortex_batch_succeeded with the following labels: + 1. api_name +1. cortex_batch_failed with the following labels: + 1. api_name +1. cortex_time_per_batch_sum with the following labels: + 1. api_name +1. cortex_time_per_batch_count with the following labels: + 1. api_name + +#### Istio metrics + +1. istio_requests_total with the following labels: + 1. destination_service + 1. response_code +1. istio_request_duration_milliseconds_bucket with the following labels: + 1. destination_service + 1. le +1. istio_request_duration_milliseconds_sum with the following labels: + 1. destination_service +1. istio_request_duration_milliseconds_count with the following labels: + 1. destination_service + +#### Kubelet metrics +1. container_cpu_usage_seconds_total with the following labels: + 1. pod + 1. container + 1. name +1. container_memory_working_set_bytes with the following labels: + 1. pod + 1. name + 1. container + +#### Kube-state-metrics metrics + +1. kube_pod_container_resource_requests with the following labels: + 1. exported_pod + 1. resource + 1. exported_container (required for not dropping the values for each container of each pod) +1. kube_pod_info with the following labels: + 1. exported_pod +1. kube_deployment_status_replicas_available with the following labels: + 1. deployment +1. kube_job_status_active with the following labels: + 1. job_name + +#### DCGM metrics + +1. DCGM_FI_DEV_GPU_UTIL with the following labels: + 1. exported_pod +1. DCGM_FI_DEV_FB_USED with the following labels: + 1. exported_pod +1. DCGM_FI_DEV_FB_FREE with the following labels: + 1. exported_pod + +#### Node metrics + +1. node_cpu_seconds_total with the following labels: + 1. job + 1. mode + 1. instance + 1. cpu +1. node_load1 with the following labels: + 1. job + 1. instance +1. node_load5 with the following labels: + 1. job + 1. instance +1. node_load15 with the following labels: + 1. job + 1. instance +1. node_exporter_build_info with the following labels: + 1. job + 1. instance +1. node_memory_MemTotal_bytes with the following labels: + 1. job + 1. instance +1. node_memory_MemFree_bytes with the following labels: + 1. job + 1. instance +1. node_memory_Buffers_bytes with the following labels: + 1. job + 1. instance +1. node_memory_Cached_bytes with the following labels: + 1. job + 1. instance +1. node_memory_MemAvailable_bytes with the following labels: + 1. job + 1. instance +1. node_disk_read_bytes_total with the following labels: + 1. 
job + 1. instance + 1. device +1. node_disk_written_bytes_total with the following labels: + 1. job + 1. instance + 1. device +1. node_disk_io_time_seconds_total with the following labels: + 1. job + 1. instance + 1. device +1. node_filesystem_size_bytes with the following labels: + 1. job + 1. instance + 1. fstype + 1. mountpoint + 1. device +1. node_filesystem_avail_bytes with the following labels: + 1. job + 1. instance + 1. fstype + 1. device +1. node_network_receive_bytes_total with the following labels: + 1. job + 1. instance + 1. device +1. node_network_transmit_bytes_total with the following labels: + 1. job + 1. instance + 1. device + +##### Prometheus rules for the node exporter + +1. instance:node_cpu_utilisation:rate1m from the following metrics: + 1. node_cpu_seconds_total with the following labels: + 1. job + 1. mode +1. instance:node_num_cpu:sum from the following metrics: + 1. node_cpu_seconds_total with the following labels: + 1. job +1. instance:node_load1_per_cpu:ratio from the following metrics: + 1. node_load1 with the following labels: + 1. job +1. instance:node_memory_utilisation:ratio from the following metrics: + 1. node_memory_MemTotal_bytes with the following labels: + 1. job + 1. node_memory_MemAvailable_bytes with the following labels: + 1. job +1. instance:node_vmstat_pgmajfault:rate1m from the following metrics: + 1. node_vmstat_pgmajfault with the following labels: + 1. job +1. instance_device:node_disk_io_time_seconds:rate1m from the following metrics: + 1. node_disk_io_time_seconds_total with the following labels: + 1. job + 1. device +1. instance_device:node_disk_io_time_weighted_seconds:rate1m from the following metrics: + 1. node_disk_io_time_weighted_seconds_total with the following labels: + 1. job + 1. device +1. instance:node_network_receive_bytes_excluding_lo:rate1m from the following metrics: + 1. node_network_receive_bytes_total with the following labels: + 1. job + 1. device +1. instance:node_network_transmit_bytes_excluding_lo:rate1m from the following metrics: + 1. node_network_transmit_bytes_total with the following labels: + 1. job + 1. device +1. instance:node_network_receive_drop_excluding_lo:rate1m from the following metrics: + 1. node_network_receive_drop_total with the following labels: + 1. job + 1. device +1. instance:node_network_transmit_drop_excluding_lo:rate1m from the following metrics: + 1. node_network_transmit_drop_total with the following labels: + 1. job + 1. device + +## Re-introducing dropped metrics/labels + +If you need to add some metrics/labels back for a particular use case, comment out every `metricRelabelings:` section (except the one from the `prometheus-operator.yaml` file), determine which metrics/labels you want to add back (e.g. by using the explorer in Grafana), and then re-edit the appropriate `metricRelabelings:` sections so that they no longer drop the metrics/labels you need. + +## Prometheus Analysis + +### Go Pprof + +To analyse the memory allocations of prometheus, run `kubectl port-forward prometheus-prometheus-0 9090:9090`, and then run `go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap`. Once you get the interactive pprof prompt, you can run `top` or `dot` for a more detailed breakdown of the memory usage.
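+For example, the whole workflow as one copy-pastable snippet (a minimal sketch, assuming the default `prometheus-prometheus-0` pod name and that local port 9090 is free):
+
+```bash
+# forward the prometheus port in the background
+kubectl port-forward prometheus-prometheus-0 9090:9090 &
+
+# fetch the in-use heap profile and open the interactive pprof prompt
+go tool pprof -symbolize=remote -inuse_space localhost:9090/debug/pprof/heap
+
+# inside the pprof prompt:
+#   top   # largest in-use allocations
+#   dot   # graphviz graph of the allocation hierarchy
+```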
+ +### TSDB + +To analyse the TSDB of prometheus, exec into the `prometheus-prometheus-0` pod, `cd` into `/tmp`, and run the following code-block: + +```bash +wget https://github.com/prometheus/prometheus/releases/download/v1.7.3/prometheus-1.7.3.linux-amd64.tar.gz +tar -xzf prometheus-* +cd prometheus-* +./tsdb analyze /prometheus | less +``` + +*Useful link: https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality* + +Or you can go to `localhost:9090` -> `Status` -> `TSDB Status`, but it's not as complete as running a binary analysis. diff --git a/docs/clusters/management/create.md b/docs/clusters/management/create.md index a3abecdd85..b65e2dfa84 100644 --- a/docs/clusters/management/create.md +++ b/docs/clusters/management/create.md @@ -86,11 +86,14 @@ tags: # : map of key/value pairs # SSL certificate ARN (only necessary when using a custom domain) ssl_certificate_arn: -# List of IAM policies to attach to your Cortex APIs +# list of IAM policies to attach to your Cortex APIs iam_policy_arns: ["arn:aws:iam::aws:policy/AmazonS3FullAccess"] # primary CIDR block for the cluster's VPC vpc_cidr: 192.168.0.0/16 + +# instance type for prometheus (use a larger instance for clusters exceeding 500 nodes) +prometheus_instance_type: "t3.medium" ``` The docker images used by the cluster can also be overridden. They can be configured by adding any of these keys to your cluster configuration file (default values are shown): diff --git a/manager/generate_eks.py b/manager/generate_eks.py index 38d513f168..aa22c9a9a9 100644 --- a/manager/generate_eks.py +++ b/manager/generate_eks.py @@ -273,6 +273,7 @@ def generate_eks( cluster_config = yaml.safe_load(cluster_config_file) region = cluster_config["region"] name = cluster_config["cluster_name"] + prometheus_instance_type = cluster_config["prometheus_instance_type"] ami_map = json.load(ami_json_file)[K8S_VERSION][region] eks = { @@ -309,16 +310,34 @@ def generate_eks( "ami": get_ami(ami_map, "t3.medium"), "name": "cx-operator", "instanceType": "t3.medium", - "minSize": 2, - "maxSize": 2, - "desiredCapacity": 2, + "minSize": 1, + "maxSize": 25, + "desiredCapacity": 1, "volumeType": "gp3", "volumeSize": 20, "volumeIOPS": 3000, "volumeThroughput": 125, + "labels": {"operator": "true"}, } operator_nodegroup = merge_override(operator_nodegroup, operator_settings) + prometheus_nodegroup = default_nodegroup(cluster_config) + prometheus_settings = { + "ami": get_ami(ami_map, prometheus_instance_type), + "name": "cx-prometheus", + "instanceType": prometheus_instance_type, + "minSize": 1, + "maxSize": 1, + "desiredCapacity": 1, + "volumeType": "gp3", + "volumeSize": 20, + "volumeIOPS": 3000, + "volumeThroughput": 125, + "labels": {"prometheus": "true"}, + "taints": {"prometheus": "true:NoSchedule"}, + } + prometheus_nodegroup = merge_override(prometheus_nodegroup, prometheus_settings) + worker_nodegroups = get_all_worker_nodegroups(ami_map, cluster_config) nat_gateway = "Disable" @@ -337,7 +356,7 @@ def generate_eks( "tags": cluster_config["tags"], }, "vpc": {"nat": {"gateway": nat_gateway}}, - "nodeGroups": [operator_nodegroup] + worker_nodegroups, + "nodeGroups": [operator_nodegroup, prometheus_nodegroup] + worker_nodegroups, "addons": [ { "name": "vpc-cni", diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2 index 3c975e3890..0fd3bb918e 100644 --- a/manager/manifests/cluster-autoscaler.yaml.j2 +++ b/manager/manifests/cluster-autoscaler.yaml.j2 @@ -181,11 +181,11 @@ spec: name: 
cluster-autoscaler resources: limits: - cpu: 100m - memory: 300Mi + cpu: 300m + memory: 1Gi requests: cpu: 100m - memory: 300Mi + memory: 200Mi command: - ./cluster-autoscaler - --v=4 diff --git a/manager/manifests/event-exporter.yaml b/manager/manifests/event-exporter.yaml index ae0957d9aa..ab4847c4a7 100644 --- a/manager/manifests/event-exporter.yaml +++ b/manager/manifests/event-exporter.yaml @@ -82,6 +82,10 @@ spec: volumeMounts: - mountPath: /data name: event-exporter-config + resources: + requests: + cpu: 20m + memory: 50Mi volumes: - name: event-exporter-config configMap: diff --git a/manager/manifests/fluent-bit.yaml.j2 b/manager/manifests/fluent-bit.yaml.j2 index a860659bc7..a2e1140f2c 100644 --- a/manager/manifests/fluent-bit.yaml.j2 +++ b/manager/manifests/fluent-bit.yaml.j2 @@ -249,3 +249,6 @@ spec: - key: workload operator: Exists effect: NoSchedule + - key: prometheus + operator: Exists + effect: NoSchedule diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml index 91f10b5a13..83a5b73d32 100644 --- a/manager/manifests/grafana/grafana.yaml +++ b/manager/manifests/grafana/grafana.yaml @@ -173,6 +173,12 @@ spec: - name: grafana-dashboard-nodes configMap: name: grafana-dashboard-nodes + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule affinity: podAffinity: preferredDuringSchedulingIgnoredDuringExecution: diff --git a/manager/manifests/inferentia.yaml b/manager/manifests/inferentia.yaml index eb6a7db974..51331f6715 100644 --- a/manager/manifests/inferentia.yaml +++ b/manager/manifests/inferentia.yaml @@ -140,7 +140,7 @@ spec: value: "12345" resources: requests: - cpu: 100m + cpu: 50m memory: 100Mi --- diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index 9df2d198a5..5b4316ca8e 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -25,8 +25,8 @@ spec: k8s: resources: requests: - cpu: 200m # default is 500m - memory: 1.75Gi # default is 2048Mi == 2Gi + cpu: 100m # default is 500m + memory: 200Mi # default is 2048Mi == 2Gi cni: enabled: false ingressGateways: @@ -72,7 +72,7 @@ spec: cpu: 100m memory: 128Mi limits: - cpu: 2000m + cpu: 1000m memory: 1024Mi replicaCount: 1 hpaSpec: @@ -128,20 +128,24 @@ spec: targetPort: 15443 resources: requests: - cpu: 200m + cpu: 512m memory: 128Mi limits: - cpu: 2000m + cpu: 1500m memory: 1024Mi replicaCount: 1 hpaSpec: minReplicas: 1 - maxReplicas: 1 # edit autoscaleEnabled in values if increasing this + maxReplicas: 100 # edit autoscaleEnabled in values if increasing this metrics: - type: Resource resource: name: cpu - targetAverageUtilization: 80 + targetAverageUtilization: 90 + - type: Resource + resource: + name: memory + targetAverageUtilization: 90 scaleTargetRef: apiVersion: apps/v1 kind: Deployment @@ -159,7 +163,7 @@ spec: gateways: istio-ingressgateway: runAsRoot: true - autoscaleEnabled: false + autoscaleEnabled: true secretVolumes: - name: customgateway-certs secretName: istio-customgateway-certs diff --git a/manager/manifests/metrics-server.yaml b/manager/manifests/metrics-server.yaml index bdaad365d9..6e4e0de8bb 100644 --- a/manager/manifests/metrics-server.yaml +++ b/manager/manifests/metrics-server.yaml @@ -174,8 +174,11 @@ spec: periodSeconds: 10 resources: requests: - cpu: 100m - memory: 200Mi + cpu: 50m + memory: 100Mi + limits: + cpu: 200m + memory: 500Mi securityContext: readOnlyRootFilesystem: true runAsNonRoot: true diff --git a/manager/manifests/operator.yaml.j2 
b/manager/manifests/operator.yaml.j2 index c5501cb1d8..3ccd8eea76 100644 --- a/manager/manifests/operator.yaml.j2 +++ b/manager/manifests/operator.yaml.j2 @@ -58,10 +58,10 @@ spec: imagePullPolicy: Always resources: requests: - cpu: 200m + cpu: 100m memory: 128Mi limits: - cpu: 2000m + cpu: 1500m memory: 1024Mi ports: - containerPort: 8888 diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index 98ad1c0006..8b37d969dd 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -122,6 +122,16 @@ spec: path: /metrics scheme: http interval: 15s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "DCGM_FI_DEV_(\ + GPU_UTIL|\ + FB_USED|\ + FB_FREE\ + )" + - action: labelkeep + regex: (__name__|exported_pod) namespaceSelector: any: true selector: diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index d98bfb13c9..ba5165ff6e 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -234,6 +234,12 @@ spec: port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule --- apiVersion: monitoring.coreos.com/v1 kind: PodMonitor @@ -250,6 +256,17 @@ spec: scheme: http path: /metrics interval: 30s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "kube_(\ + pod_container_resource_requests|\ + pod_info|\ + deployment_status_replicas_available|\ + job_status_active\ + )" + - action: labelkeep + regex: (__name__|exported_pod|exported_container|job_name|resource) namespaceSelector: any: true selector: diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml index 3a8ff44f3b..8982706c42 100644 --- a/manager/manifests/prometheus-kubelet-exporter.yaml +++ b/manager/manifests/prometheus-kubelet-exporter.yaml @@ -27,37 +27,7 @@ spec: interval: 30s metricRelabelings: - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: 
reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processo
r_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) - sourceLabels: - - __name__ + sourceLabels: [__name__] port: https-metrics relabelings: - sourceLabels: @@ -71,10 +41,14 @@ spec: honorTimestamps: false interval: 30s metricRelabelings: - - action: drop - regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) - sourceLabels: - - __name__ + - action: keep + sourceLabels: [__name__] + regex: "container_(\ + cpu_usage_seconds_total|\ + memory_working_set_bytes\ + )" + - action: labelkeep + regex: (__name__|pod|container|name) path: /metrics/cadvisor port: https-metrics relabelings: @@ -93,6 +67,9 @@ spec: - sourceLabels: - __metrics_path__ targetLabel: metrics_path + metricRelabelings: + - action: drop + sourceLabels: [__name__] scheme: https tlsConfig: insecureSkipVerify: true diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index fa9aefe277..b0d76fbc34 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -30,6 +30,12 @@ metadata: spec: image: {{ config['image_prometheus'] }} serviceAccountName: prometheus + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule podMonitorSelector: matchExpressions: - key: "monitoring.cortex.dev" @@ -160,6 +166,17 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "istio_(\ + requests_total|\ + request_duration_milliseconds_bucket|\ + request_duration_milliseconds_sum|\ + request_duration_milliseconds_count\ + )" + - action: labelkeep + regex: (__name__|destination_service|response_code|le) --- @@ -209,6 +226,10 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" --- @@ -259,6 +280,10 @@ spec: - sourceLabels: [ __meta_kubernetes_pod_name ] action: replace targetLabel: pod_name + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" --- @@ -276,6 +301,10 @@ spec: scheme: http path: /metrics interval: 20s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" namespaceSelector: any: true selector: @@ -298,6 +327,10 @@ spec: scheme: http path: /metrics interval: 10s + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "cortex_(.+)" namespaceSelector: any: true selector: diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index d6f97170a1..9c3a483f81 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -115,7 +115,7 @@ spec: cpu: 250m 
memory: 180Mi requests: - cpu: 100m + cpu: 40m memory: 180Mi volumeMounts: - mountPath: /host/sys @@ -194,6 +194,34 @@ spec: sourceLabels: - __meta_kubernetes_pod_node_name targetLabel: instance + metricRelabelings: + - action: keep + sourceLabels: [__name__] + regex: "node_(\ + cpu_seconds_total|\ + load1|\ + load5|\ + load15|\ + exporter_build_info|\ + memory_MemTotal_bytes|\ + memory_MemFree_bytes|\ + memory_Buffers_bytes|\ + memory_Cached_bytes|\ + memory_MemAvailable_bytes|\ + disk_read_bytes_total|\ + disk_written_bytes_total|\ + disk_io_time_seconds_total|\ + disk_io_time_weighted_seconds_total|\ + filesystem_size_bytes|\ + filesystem_avail_bytes|\ + network_receive_bytes_total|\ + network_transmit_bytes_total|\ + network_receive_drop_total|\ + network_transmit_drop_total|\ + vmstat_pgmajfault\ + )" + - action: labelkeep + regex: (__name__|instance|job|device|fstype|mountpoint|mode) scheme: https tlsConfig: insecureSkipVerify: true diff --git a/manager/manifests/prometheus-operator.yaml b/manager/manifests/prometheus-operator.yaml index ddeff6e1c9..3b7b558318 100644 --- a/manager/manifests/prometheus-operator.yaml +++ b/manager/manifests/prometheus-operator.yaml @@ -14199,6 +14199,11 @@ spec: allowPrivilegeEscalation: false nodeSelector: kubernetes.io/os: linux + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manager/manifests/prometheus-statsd-exporter.yaml b/manager/manifests/prometheus-statsd-exporter.yaml index b96a700ea6..ea58db52d8 100644 --- a/manager/manifests/prometheus-statsd-exporter.yaml +++ b/manager/manifests/prometheus-statsd-exporter.yaml @@ -74,6 +74,12 @@ spec: volumeMounts: - name: statsd-mapping-config mountPath: /etc/prometheus-statsd-exporter + nodeSelector: + prometheus: "true" + tolerations: + - key: prometheus + operator: Exists + effect: NoSchedule volumes: - name: statsd-mapping-config configMap: diff --git a/pkg/crds/config/manager/manager.yaml b/pkg/crds/config/manager/manager.yaml index c1a9c7d25c..48fb8ad0e1 100644 --- a/pkg/crds/config/manager/manager.yaml +++ b/pkg/crds/config/manager/manager.yaml @@ -46,11 +46,11 @@ spec: periodSeconds: 10 resources: limits: - cpu: 100m - memory: 30Mi + cpu: 300m + memory: 100Mi requests: cpu: 100m - memory: 20Mi + memory: 80Mi volumeMounts: - mountPath: /mnt/cluster.yaml name: cluster-config diff --git a/pkg/lib/aws/ec2.go b/pkg/lib/aws/ec2.go index 338a719c91..66b2f94fb6 100644 --- a/pkg/lib/aws/ec2.go +++ b/pkg/lib/aws/ec2.go @@ -137,6 +137,46 @@ func IsAMDGPUInstance(instanceType string) (bool, error) { return false, nil } +func IsNvidiaGPUInstance(instanceType string) (bool, error) { + parsedType, err := ParseInstanceType(instanceType) + if err != nil { + return false, err + } + + if !_gpuInstanceFamilies.Has(parsedType.Family) { + return false, nil + } + + if !parsedType.Capabilities.Has("a") { + return true, nil + } + + return false, nil +} + +func IsGPUInstance(instanceType string) (bool, error) { + isAMDGPU, err := IsAMDGPUInstance(instanceType) + if err != nil { + return false, err + } + + isNvidiaGPU, err := IsNvidiaGPUInstance(instanceType) + if err != nil { + return false, err + } + + return isAMDGPU || isNvidiaGPU, nil +} + +func IsInferentiaInstance(instanceType string) (bool, error) { + parsedType, err := ParseInstanceType(instanceType) + if err != nil { + return false, err + } + + return parsedType.Family == "inf", nil +} + func IsMacInstance(instanceType string) (bool, error) { 
parsedType, err := ParseInstanceType(instanceType) if err != nil { diff --git a/pkg/lib/aws/servicequotas.go b/pkg/lib/aws/servicequotas.go index ef99e1787b..d1bc45b867 100644 --- a/pkg/lib/aws/servicequotas.go +++ b/pkg/lib/aws/servicequotas.go @@ -328,9 +328,9 @@ func requiredRulesForNodeGroupSecurityGroup(numAZs, whitelistLength int) int { } func requiredRulesForControlPlaneSecurityGroup(numNodeGroups int) int { - // +1 for the operator node group + // +2 for the operator and prometheus node groups // this is the number of outbound rules (there are half as many inbound rules, so that is not the limiting factor) - return 2 * (numNodeGroups + 1) + return 2 * (numNodeGroups + 2) } func requiredSecurityGroups(numNodeGroups int, clusterAlreadyExists bool) int { diff --git a/pkg/operator/endpoints/info.go b/pkg/operator/endpoints/info.go index b83a6374e0..c210e74fe7 100644 --- a/pkg/operator/endpoints/info.go +++ b/pkg/operator/endpoints/info.go @@ -31,7 +31,13 @@ import ( ) func Info(w http.ResponseWriter, r *http.Request) { - nodeInfos, numPendingReplicas, err := getNodeInfos() + workerNodeInfos, numPendingReplicas, err := getWorkerNodeInfos() + if err != nil { + respondError(w, r, err) + return + } + + operatorNodeInfos, err := getOperatorNodeInfos() if err != nil { respondError(w, r, err) return @@ -44,13 +50,14 @@ func Info(w http.ResponseWriter, r *http.Request) { response := schema.InfoResponse{ ClusterConfig: fullClusterConfig, - NodeInfos: nodeInfos, + WorkerNodeInfos: workerNodeInfos, + OperatorNodeInfos: operatorNodeInfos, NumPendingReplicas: numPendingReplicas, } respondJSON(w, r, response) } -func getNodeInfos() ([]schema.NodeInfo, int, error) { +func getWorkerNodeInfos() ([]schema.WorkerNodeInfo, int, error) { pods, err := config.K8sAllNamspaces.ListPods(nil) if err != nil { return nil, 0, err @@ -61,8 +68,8 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { return nil, 0, err } - nodeInfoMap := make(map[string]*schema.NodeInfo, len(nodes)) // node name -> info - spotPriceCache := make(map[string]float64) // instance type -> spot price + nodeInfoMap := make(map[string]*schema.WorkerNodeInfo, len(nodes)) // node name -> info + spotPriceCache := make(map[string]float64) // instance type -> spot price for i := range nodes { node := nodes[i] @@ -86,12 +93,14 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { } } - nodeInfoMap[node.Name] = &schema.NodeInfo{ + nodeInfoMap[node.Name] = &schema.WorkerNodeInfo{ + NodeInfo: schema.NodeInfo{ + NodeGroupName: nodeGroupName, + InstanceType: instanceType, + IsSpot: isSpot, + Price: price, + }, Name: node.Name, - NodeGroupName: nodeGroupName, - InstanceType: instanceType, - IsSpot: isSpot, - Price: price, NumReplicas: 0, // will be added to below ComputeUserCapacity: nodeComputeAllocatable(&node), // will be subtracted from below ComputeAvailable: nodeComputeAllocatable(&node), // will be subtracted from below @@ -160,7 +169,7 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) { sort.Strings(nodeNames) - nodeInfos := make([]schema.NodeInfo, len(nodeNames)) + nodeInfos := make([]schema.WorkerNodeInfo, len(nodeNames)) for i, nodeName := range nodeNames { nodeInfos[i] = *nodeInfoMap[nodeName] } @@ -179,3 +188,41 @@ func nodeComputeAllocatable(node *kcore.Node) userconfig.Compute { Inf: infQty.Value(), } } + +func getOperatorNodeInfos() ([]schema.NodeInfo, error) { + nodes, err := config.K8sAllNamspaces.ListNodesByLabel("operator", "true") + if err != nil { + return nil, err + } + + nodeInfoMap := make(map[string]*schema.NodeInfo, 
len(nodes)) // node name -> info + + for i := range nodes { + node := nodes[i] + + instanceType := node.Labels["beta.kubernetes.io/instance-type"] + nodeGroupName := node.Labels["alpha.eksctl.io/nodegroup-name"] + + price := aws.InstanceMetadatas[config.ClusterConfig.Region][instanceType].Price + + nodeInfoMap[node.Name] = &schema.NodeInfo{ + NodeGroupName: nodeGroupName, + InstanceType: instanceType, + Price: price, + } + } + + nodeNames := make([]string, 0, len(nodeInfoMap)) + for nodeName := range nodeInfoMap { + nodeNames = append(nodeNames, nodeName) + } + + sort.Strings(nodeNames) + + nodeInfos := make([]schema.NodeInfo, len(nodeNames)) + for i, nodeName := range nodeNames { + nodeInfos[i] = *nodeInfoMap[nodeName] + } + + return nodeInfos, nil +} diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 18988fa277..d86073e4e7 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -26,16 +26,14 @@ import ( type InfoResponse struct { ClusterConfig clusterconfig.InternalConfig `json:"cluster_config" yaml:"cluster_config"` - NodeInfos []NodeInfo `json:"node_infos" yaml:"node_infos"` + WorkerNodeInfos []WorkerNodeInfo `json:"worker_node_infos" yaml:"worker_node_infos"` + OperatorNodeInfos []NodeInfo `json:"operator_node_infos" yaml:"operator_node_infos"` NumPendingReplicas int `json:"num_pending_replicas" yaml:"num_pending_replicas"` } -type NodeInfo struct { +type WorkerNodeInfo struct { + NodeInfo Name string `json:"name" yaml:"name"` - NodeGroupName string `json:"nodegroup_name" yaml:"nodegroup_name"` - InstanceType string `json:"instance_type" yaml:"instance_type"` - IsSpot bool `json:"is_spot" yaml:"is_spot"` - Price float64 `json:"price" yaml:"price"` NumReplicas int `json:"num_replicas" yaml:"num_replicas"` NumAsyncGatewayReplicas int `json:"num_async_gateway_replicas" yaml:"num_async_gateway_replicas"` NumEnqueuerReplicas int `json:"num_enqueuer_replicas" yaml:"num_enqueuer_replicas"` @@ -44,6 +42,13 @@ type NodeInfo struct { ComputeUserRequested userconfig.Compute `json:"compute_user_requested" yaml:"compute_user_requested"` // total resources requested by user on a node } +type NodeInfo struct { + NodeGroupName string `json:"nodegroup_name" yaml:"nodegroup_name"` + InstanceType string `json:"instance_type" yaml:"instance_type"` + IsSpot bool `json:"is_spot" yaml:"is_spot"` + Price float64 `json:"price" yaml:"price"` +} + type DeployResult struct { API *APIResponse `json:"api"` Message string `json:"message"` @@ -97,9 +102,9 @@ type APIVersion struct { type VerifyCortexResponse struct{} -func (ir InfoResponse) GetNodesWithNodeGroupName(ngName string) []NodeInfo { - nodesInfo := []NodeInfo{} - for _, nodeInfo := range ir.NodeInfos { +func (ir InfoResponse) GetNodesWithNodeGroupName(ngName string) []WorkerNodeInfo { + nodesInfo := []WorkerNodeInfo{} + for _, nodeInfo := range ir.WorkerNodeInfos { if nodeInfo.NodeGroupName == ngName { nodesInfo = append(nodesInfo, nodeInfo) } diff --git a/pkg/types/clusterconfig/cluster_config.go b/pkg/types/clusterconfig/cluster_config.go index 6c56ffd485..c8d0d13124 100644 --- a/pkg/types/clusterconfig/cluster_config.go +++ b/pkg/types/clusterconfig/cluster_config.go @@ -56,6 +56,8 @@ const ( ) var ( + _operatorNodeGroupInstanceType = "t3.medium" + _maxNodeGroupLengthWithPrefix = 32 _maxNodeGroupLength = _maxNodeGroupLengthWithPrefix - len("cx-wd-") // or cx-ws- _maxInstancePools = 20 @@ -84,8 +86,9 @@ type CoreConfig struct { IstioNamespace string `json:"istio_namespace" yaml:"istio_namespace"` // 
User-specifiable fields - ClusterName string `json:"cluster_name" yaml:"cluster_name"` - Region string `json:"region" yaml:"region"` + ClusterName string `json:"cluster_name" yaml:"cluster_name"` + Region string `json:"region" yaml:"region"` + PrometheusInstanceType string `json:"prometheus_instance_type" yaml:"prometheus_instance_type"` // User-specifiable fields ImageOperator string `json:"image_operator" yaml:"image_operator"` @@ -333,6 +336,14 @@ var CoreConfigStructFieldValidations = []*cr.StructFieldValidation{ Validator: RegionValidator, }, }, + { + StructField: "PrometheusInstanceType", + StringValidation: &cr.StringValidation{ + MinLength: 1, + Default: "t3.medium", + Validator: validatePrometheusInstanceType, + }, + }, { StructField: "Telemetry", BoolValidation: &cr.BoolValidation{ @@ -906,7 +917,16 @@ func (cc *Config) validate(awsClient *aws.Client) error { } ngNames := []string{} - instances := []aws.InstanceTypeRequests{} + instances := []aws.InstanceTypeRequests{ + { + InstanceType: _operatorNodeGroupInstanceType, + RequiredOnDemandInstances: 1, + }, + { + InstanceType: cc.PrometheusInstanceType, + RequiredOnDemandInstances: 1, + }, + } for _, nodeGroup := range cc.NodeGroups { if !slices.HasString(ngNames, nodeGroup.Name) { ngNames = append(ngNames, nodeGroup.Name) @@ -1494,6 +1514,31 @@ func validateInstanceType(instanceType string) (string, error) { return instanceType, nil } +func validatePrometheusInstanceType(instanceType string) (string, error) { + _, err := validateInstanceType(instanceType) + if err != nil { + return "", err + } + + isGPU, err := aws.IsGPUInstance(instanceType) + if err != nil { + return "", err + } + if isGPU { + return "", ErrorGPUInstancesNotSupported(instanceType) + } + + isInf, err := aws.IsInferentiaInstance(instanceType) + if err != nil { + return "", err + } + if isInf { + return "", ErrorInferentiaInstancesNotSupported(instanceType) + } + + return instanceType, nil +} + func validateInstanceDistribution(instances []string) ([]string, error) { for _, instance := range instances { _, err := validateInstanceType(instance) @@ -1635,6 +1680,7 @@ func (cc *CoreConfig) TelemetryEvent() map[string]interface{} { } event["region"] = cc.Region + event["prometheus_instance_type"] = cc.PrometheusInstanceType if !strings.HasPrefix(cc.ImageOperator, "quay.io/cortexlabs/") { event["image_operator._is_custom"] = true diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go index bbb7e6dc15..d277e8af17 100644 --- a/pkg/types/clusterconfig/config_key.go +++ b/pkg/types/clusterconfig/config_key.go @@ -22,6 +22,7 @@ const ( ClusterNameKey = "cluster_name" RegionKey = "region" + PrometheusInstanceTypeKey = "prometheus_instance_type" NodeGroupsKey = "node_groups" InstanceTypeKey = "instance_type" AcceleratorTypeKey = "accelerator_type" diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index 5c00328654..56d1be5115 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -46,6 +46,8 @@ const ( ErrSpotPriceGreaterThanMaxPrice = "clusterconfig.spot_price_greater_than_max_price" ErrInstanceTypeNotSupportedByCortex = "clusterconfig.instance_type_not_supported_by_cortex" ErrAMDGPUInstancesNotSupported = "clusterconfig.amd_gpu_instances_not_supported" + ErrGPUInstancesNotSupported = "clusterconfig.gpu_instance_not_supported" + ErrInferentiaInstancesNotSupported = "clusterconfig.inferentia_instances_not_supported" ErrMacInstancesNotSupported = 
"clusterconfig.mac_instances_not_supported" ErrAtLeastOneInstanceDistribution = "clusterconfig.at_least_one_instance_distribution" ErrNoCompatibleSpotInstanceFound = "clusterconfig.no_compatible_spot_instance_found" @@ -202,6 +204,20 @@ func ErrorAMDGPUInstancesNotSupported(instanceType string) error { }) } +func ErrorGPUInstancesNotSupported(instanceType string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrGPUInstancesNotSupported, + Message: fmt.Sprintf("GPU instances (including %s) are not supported", instanceType), + }) +} + +func ErrorInferentiaInstancesNotSupported(instanceType string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrInferentiaInstancesNotSupported, + Message: fmt.Sprintf("Inferentia instances (including %s) are not supported", instanceType), + }) +} + func ErrorMacInstancesNotSupported(instanceType string) error { return errors.WithStack(&errors.Error{ Kind: ErrMacInstancesNotSupported,