diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 3311c60a01..f89ba386f3 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -358,7 +358,7 @@ var _clusterConfigureCmd = &cobra.Command{ exit.Error(err) } - k8sClient, err := k8s.New("default", false, restConfig, scheme) + k8sClient, err := k8s.New(consts.DefaultNamespace, false, restConfig, scheme) if err != nil { exit.Error(err) } @@ -804,7 +804,7 @@ var _clusterHealthCmd = &cobra.Command{ exit.Error(err) } - k8sClient, err := k8s.New("default", false, restConfig, scheme) + k8sClient, err := k8s.New(consts.DefaultNamespace, false, restConfig, scheme) if err != nil { exit.Error(err) } diff --git a/manager/install.sh b/manager/install.sh index 7d63049871..5286c4d9ed 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -36,6 +36,7 @@ function cluster_up() { create_eks echo -n "○ updating cluster configuration " + setup_namespaces setup_configmap echo "✓" @@ -195,6 +196,12 @@ function write_kubeconfig() { out=$(kubectl get pods 2>&1 || true); if [[ "$out" == *"must be logged in to the server"* ]]; then echo "error: your aws iam user does not have access to this cluster; to grant access, see https://docs.cortex.dev/v/${CORTEX_VERSION_MINOR}/"; exit 1; fi } +function setup_namespaces() { + # doing a patch to prevent getting the kubectl.kubernetes.io/last-applied-configuration annotation warning + kubectl patch namespace default -p '{"metadata": {"labels": {"istio-discovery": "enabled"}}}' >/dev/null + kubectl apply -f manifests/namespaces.yaml >/dev/null +} + function setup_configmap() { envsubst < manifests/default_cortex_cli_config.yaml > tmp_cli_config.yaml kubectl -n=default create configmap 'client-config' \ @@ -227,7 +234,9 @@ function setup_prometheus() { envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-monitoring.yaml | kubectl apply -f - >/dev/null python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-additional-scrape-configs.yaml.j2 > prometheus-additional-scrape-configs.yaml - kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml + if ! kubectl get secret -n prometheus additional-scrape-configs >/dev/null 2>&1; then + kubectl create secret generic -n prometheus additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml > /dev/null + fi } function setup_grafana() { @@ -360,8 +369,6 @@ function remove_nodegroups() { } function setup_istio() { - envsubst < manifests/istio-namespace.yaml | kubectl apply -f - >/dev/null - if ! grep -q "istio-customgateway-certs" <<< $(kubectl get secret -n istio-system); then WEBSITE=localhost openssl req -subj "/C=US/CN=$WEBSITE" -newkey rsa:2048 -nodes -keyout $WEBSITE.key -x509 -days 3650 -out $WEBSITE.crt >/dev/null 2>&1 @@ -530,8 +537,8 @@ function validate_cortex() { fi if [ "$prometheus_ready" == "" ]; then - readyReplicas=$(kubectl get statefulset -n default prometheus-prometheus -o jsonpath='{.status.readyReplicas}' 2> /dev/null) - desiredReplicas=$(kubectl get statefulset -n default prometheus-prometheus -o jsonpath='{.status.replicas}' 2> /dev/null) + readyReplicas=$(kubectl get statefulset -n prometheus prometheus-prometheus -o jsonpath='{.status.readyReplicas}' 2> /dev/null) + desiredReplicas=$(kubectl get statefulset -n prometheus prometheus-prometheus -o jsonpath='{.status.replicas}' 2> /dev/null) if [ "$readyReplicas" != "" ] && [ "$desiredReplicas" != "" ]; then if [ "$readyReplicas" == "$desiredReplicas" ]; then diff --git a/manager/manifests/autoscaler.yaml.j2 b/manager/manifests/autoscaler.yaml.j2 index df42a90965..842552f31a 100644 --- a/manager/manifests/autoscaler.yaml.j2 +++ b/manager/manifests/autoscaler.yaml.j2 @@ -82,7 +82,7 @@ spec: args: - "--in-cluster" - "--port=8000" - - "--prometheus-url=http://prometheus.default:9090" + - "--prometheus-url=http://prometheus.prometheus:9090" - "--namespace=default" - "--cluster-config=/configs/cluster/cluster.yaml" ports: diff --git a/manager/manifests/event-exporter.yaml b/manager/manifests/event-exporter.yaml index ab4847c4a7..8ff19efb09 100644 --- a/manager/manifests/event-exporter.yaml +++ b/manager/manifests/event-exporter.yaml @@ -15,7 +15,7 @@ apiVersion: v1 kind: ServiceAccount metadata: - namespace: default + namespace: logging name: event-exporter --- @@ -30,7 +30,7 @@ roleRef: name: view subjects: - kind: ServiceAccount - namespace: default + namespace: logging name: event-exporter --- @@ -39,7 +39,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: event-exporter-config - namespace: default + namespace: logging data: config.yaml: | logLevel: error @@ -61,7 +61,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: event-exporter - namespace: default + namespace: logging spec: replicas: 1 selector: diff --git a/manager/manifests/fluent-bit.yaml.j2 b/manager/manifests/fluent-bit.yaml.j2 index a2e1140f2c..03fef37062 100644 --- a/manager/manifests/fluent-bit.yaml.j2 +++ b/manager/manifests/fluent-bit.yaml.j2 @@ -16,7 +16,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: fluent-bit - namespace: default + namespace: logging --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -40,13 +40,13 @@ roleRef: subjects: - kind: ServiceAccount name: fluent-bit - namespace: default + namespace: logging --- apiVersion: v1 kind: ConfigMap metadata: name: fluent-bit-config - namespace: default + namespace: logging labels: k8s-app: fluent-bit data: @@ -186,7 +186,7 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: fluent-bit - namespace: default + namespace: logging spec: selector: matchLabels: diff --git a/manager/manifests/grafana/grafana.yaml.j2 b/manager/manifests/grafana/grafana.yaml.j2 index ec863d3fb5..1cd9052393 100644 --- a/manager/manifests/grafana/grafana.yaml.j2 +++ b/manager/manifests/grafana/grafana.yaml.j2 @@ -28,7 +28,7 @@ data: "name": "prometheus", "orgId": 1, "type": "prometheus", - "url": "http://prometheus.default:9090", + "url": "http://prometheus.prometheus:9090", "version": 1, "isDefault": true } diff --git a/manager/manifests/istio.yaml.j2 b/manager/manifests/istio.yaml.j2 index ecccf38695..55b186cabf 100644 --- a/manager/manifests/istio.yaml.j2 +++ b/manager/manifests/istio.yaml.j2 @@ -18,6 +18,10 @@ spec: profile: minimal hub: {{ env['CORTEX_IMAGE_ISTIO_PROXY_HUB'] }} # this is only used by proxy, since pilot overrides it (proxy doesn't have dedicated hub config) tag: {{ env['CORTEX_IMAGE_ISTIO_PROXY_TAG'] }} # this is only used by proxy, since pilot overrides it (proxy doesn't have dedicated tag config) + meshConfig: + discoverySelectors: + - matchLabels: + istio-discovery: enabled components: pilot: # "pilot" refers to the istiod container hub: {{ env['CORTEX_IMAGE_ISTIO_PILOT_HUB'] }} @@ -26,7 +30,23 @@ spec: resources: requests: cpu: 100m # default is 500m - memory: 200Mi # default is 2048Mi == 2Gi + memory: 700Mi # default is 2048Mi == 2Gi + hpaSpec: + minReplicas: 1 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + targetAverageUtilization: 90 + - type: Resource + resource: + name: memory + targetAverageUtilization: 90 + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: istiod cni: enabled: false ingressGateways: @@ -71,7 +91,7 @@ spec: replicaCount: 1 hpaSpec: minReplicas: 1 - maxReplicas: 1 # edit autoscaleEnabled in values if increasing this + maxReplicas: 1 metrics: - type: Resource resource: @@ -124,7 +144,7 @@ spec: replicaCount: 1 hpaSpec: minReplicas: 1 - maxReplicas: 100 # edit autoscaleEnabled in values if increasing this + maxReplicas: 100 metrics: - type: Resource resource: diff --git a/manager/manifests/istio-namespace.yaml b/manager/manifests/namespaces.yaml similarity index 83% rename from manager/manifests/istio-namespace.yaml rename to manager/manifests/namespaces.yaml index 3f5ce71534..ce1959e923 100644 --- a/manager/manifests/istio-namespace.yaml +++ b/manager/manifests/namespaces.yaml @@ -16,3 +16,16 @@ apiVersion: v1 kind: Namespace metadata: name: istio-system +--- + +apiVersion: v1 +kind: Namespace +metadata: + name: logging +--- + +apiVersion: v1 +kind: Namespace +metadata: + name: prometheus +--- diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml index 8b37d969dd..96d82a5644 100644 --- a/manager/manifests/prometheus-dcgm-exporter.yaml +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -12,16 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -apiVersion: v1 -kind: Namespace -metadata: - name: monitoring ---- apiVersion: v1 kind: ServiceAccount metadata: name: dcgm-exporter - namespace: default + namespace: prometheus labels: app.kubernetes.io/name: dcgm-exporter app.kubernetes.io/instance: dcgm-exporter @@ -31,7 +26,7 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: dcgm-exporter - namespace: default + namespace: prometheus labels: app.kubernetes.io/name: dcgm-exporter app.kubernetes.io/instance: dcgm-exporter @@ -106,7 +101,7 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: dcgm-exporter - namespace: default + namespace: prometheus labels: monitoring.cortex.dev: dcgm-exporter app.kubernetes.io/name: dcgm-exporter diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml index edf69cd7ba..89da6c4842 100644 --- a/manager/manifests/prometheus-kube-state-metrics.yaml +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -17,7 +17,7 @@ metadata: labels: app.kubernetes.io/name: kube-state-metrics name: kube-state-metrics - namespace: default + namespace: prometheus --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -180,13 +180,13 @@ roleRef: subjects: - kind: ServiceAccount name: kube-state-metrics - namespace: default + namespace: prometheus --- apiVersion: apps/v1 kind: Deployment metadata: name: kube-state-metrics - namespace: default + namespace: prometheus labels: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/version: "2.1.0" @@ -245,7 +245,7 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: kube-state-metrics - namespace: default + namespace: prometheus labels: name: kube-state-metrics monitoring.cortex.dev: kube-state-metrics diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml index 8982706c42..87855746a0 100644 --- a/manager/manifests/prometheus-kubelet-exporter.yaml +++ b/manager/manifests/prometheus-kubelet-exporter.yaml @@ -19,7 +19,7 @@ metadata: k8s-app: kubelet monitoring.cortex.dev: kubelet-exporter name: kubelet - namespace: default + namespace: prometheus spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token diff --git a/manager/manifests/prometheus-monitoring.yaml b/manager/manifests/prometheus-monitoring.yaml index 0982504aff..21b85a851a 100644 --- a/manager/manifests/prometheus-monitoring.yaml +++ b/manager/manifests/prometheus-monitoring.yaml @@ -27,6 +27,7 @@ apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: name: prometheus + namespace: prometheus spec: image: $CORTEX_IMAGE_PROMETHEUS serviceAccountName: prometheus @@ -73,6 +74,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: prometheus + namespace: prometheus --- @@ -114,7 +116,7 @@ roleRef: subjects: - kind: ServiceAccount name: prometheus - namespace: default + namespace: prometheus --- @@ -122,6 +124,7 @@ apiVersion: v1 kind: Service metadata: name: prometheus + namespace: prometheus spec: type: ClusterIP ports: @@ -136,6 +139,7 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: istio-stats + namespace: prometheus labels: monitoring.cortex.dev: "istio" spec: @@ -187,6 +191,7 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: proxy-stats + namespace: prometheus labels: monitoring.cortex.dev: "proxy" spec: @@ -240,6 +245,7 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: async-stats + namespace: prometheus labels: monitoring.cortex.dev: "dequeuer-async" spec: @@ -294,6 +300,7 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: prometheus-statsd-exporter + namespace: prometheus labels: name: prometheus-statsd-exporter monitoring.cortex.dev: "statsd-exporter" @@ -320,6 +327,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: operator + namespace: prometheus labels: name: operator monitoring.cortex.dev: "operator" diff --git a/manager/manifests/prometheus-node-exporter.yaml b/manager/manifests/prometheus-node-exporter.yaml index 3bb631e15d..c381f22902 100644 --- a/manager/manifests/prometheus-node-exporter.yaml +++ b/manager/manifests/prometheus-node-exporter.yaml @@ -18,7 +18,7 @@ metadata: labels: app.kubernetes.io/version: v1.1.2 name: node-exporter - namespace: default + namespace: prometheus --- @@ -57,7 +57,7 @@ roleRef: subjects: - kind: ServiceAccount name: node-exporter - namespace: default + namespace: prometheus --- @@ -68,7 +68,7 @@ metadata: app.kubernetes.io/name: node-exporter app.kubernetes.io/version: v1.1.2 name: node-exporter - namespace: default + namespace: prometheus spec: clusterIP: None ports: @@ -87,7 +87,7 @@ metadata: app.kubernetes.io/name: node-exporter app.kubernetes.io/version: v1.1.2 name: node-exporter - namespace: default + namespace: prometheus spec: selector: matchLabels: @@ -181,7 +181,7 @@ metadata: app.kubernetes.io/version: v1.1.2 monitoring.cortex.dev: node-exporter name: node-exporter - namespace: default + namespace: prometheus spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token @@ -240,7 +240,7 @@ metadata: app.kubernetes.io/version: 1.1.2 prometheus: k8s name: node-exporter-rules - namespace: default + namespace: prometheus spec: groups: - name: node-exporter.rules diff --git a/manager/manifests/prometheus-operator.yaml b/manager/manifests/prometheus-operator.yaml index 3b7b558318..fc610a31a6 100644 --- a/manager/manifests/prometheus-operator.yaml +++ b/manager/manifests/prometheus-operator.yaml @@ -14073,7 +14073,7 @@ roleRef: subjects: - kind: ServiceAccount name: prometheus-operator - namespace: default + namespace: prometheus --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -14163,7 +14163,7 @@ metadata: app.kubernetes.io/name: prometheus-operator app.kubernetes.io/version: 0.48.1 name: prometheus-operator - namespace: default + namespace: prometheus spec: replicas: 1 selector: @@ -14217,7 +14217,7 @@ metadata: app.kubernetes.io/name: prometheus-operator app.kubernetes.io/version: 0.48.1 name: prometheus-operator - namespace: default + namespace: prometheus --- apiVersion: v1 kind: Service @@ -14227,7 +14227,7 @@ metadata: app.kubernetes.io/name: prometheus-operator app.kubernetes.io/version: 0.48.1 name: prometheus-operator - namespace: default + namespace: prometheus spec: clusterIP: None ports: diff --git a/manager/manifests/prometheus-statsd-exporter.yaml b/manager/manifests/prometheus-statsd-exporter.yaml index ea58db52d8..1a1fe6dd33 100644 --- a/manager/manifests/prometheus-statsd-exporter.yaml +++ b/manager/manifests/prometheus-statsd-exporter.yaml @@ -16,7 +16,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: prometheus-statsd-exporter-config - namespace: default + namespace: prometheus data: statsd-mapping.yaml: | defaults: @@ -27,7 +27,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: prometheus-statsd-exporter - namespace: default + namespace: prometheus spec: replicas: 1 selector: @@ -93,7 +93,7 @@ spec: apiVersion: v1 kind: Service metadata: - namespace: default + namespace: prometheus name: prometheus-statsd-exporter labels: cortex.dev/name: prometheus-statsd-exporter diff --git a/pkg/config/config.go b/pkg/config/config.go index 304ee1dcf3..eb7bd5e269 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -152,7 +152,7 @@ func Init() error { prometheusURL := os.Getenv("CORTEX_PROMETHEUS_URL") if len(prometheusURL) == 0 { - prometheusURL = fmt.Sprintf("http://prometheus.%s:9090", consts.DefaultNamespace) + prometheusURL = fmt.Sprintf("http://prometheus.%s:9090", consts.PrometheusNamespace) } promClient, err := promapi.NewClient(promapi.Config{ @@ -168,7 +168,7 @@ func Init() error { } if OperatorMetadata.IsOperatorInCluster { - MetricsClient, err = statsd.New(fmt.Sprintf("prometheus-statsd-exporter.%s:9125", consts.DefaultNamespace)) + MetricsClient, err = statsd.New(fmt.Sprintf("prometheus-statsd-exporter.%s:9125", consts.PrometheusNamespace)) if err != nil { return errors.Wrap(errors.WithStack(err), "unable to initialize metrics client") } diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 67a9c7e4d0..7ea590fc45 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -27,8 +27,11 @@ var ( CortexVersion = "master" // CORTEX_VERSION CortexVersionMinor = "master" // CORTEX_VERSION_MINOR - DefaultNamespace = "default" - IstioNamespace = "istio-system" + DefaultNamespace = "default" + KubeSystemNamespace = "kube-system" + IstioNamespace = "istio-system" + PrometheusNamespace = "prometheus" + LoggingNamespace = "logging" DefaultMaxQueueLength = int64(100) DefaultMaxConcurrency = int64(1) diff --git a/pkg/crds/hack/run_manager.sh b/pkg/crds/hack/run_manager.sh index 1b7d1a79fc..acd0ac9b37 100755 --- a/pkg/crds/hack/run_manager.sh +++ b/pkg/crds/hack/run_manager.sh @@ -18,7 +18,7 @@ CLUSTER_CONFIG=$1 -port_forward_cmd="kubectl port-forward -n default prometheus-prometheus-0 9090" +port_forward_cmd="kubectl port-forward -n prometheus prometheus-prometheus-0 9090" kill $(pgrep -f "${port_forward_cmd}") >/dev/null 2>&1 || true echo "Port-forwarding Prometheus to localhost:9090" diff --git a/pkg/crds/main.go b/pkg/crds/main.go index 62b552a426..ee8c0c476b 100644 --- a/pkg/crds/main.go +++ b/pkg/crds/main.go @@ -102,7 +102,7 @@ func main() { } if prometheusURL == "" { - prometheusURL = fmt.Sprintf("http://prometheus.%s:9090", consts.DefaultNamespace) + prometheusURL = fmt.Sprintf("http://prometheus.%s:9090", consts.PrometheusNamespace) } mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ diff --git a/pkg/health/health.go b/pkg/health/health.go index 45543ec220..77e26fa55f 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -22,6 +22,7 @@ import ( "reflect" "github.com/aws/aws-sdk-go/service/elbv2" + "github.com/cortexlabs/cortex/pkg/consts" awslib "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/lib/json" @@ -29,6 +30,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/types/clusterconfig" kapps "k8s.io/api/apps/v1" + kcore "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" kresource "k8s.io/apimachinery/pkg/api/resource" kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -106,47 +108,47 @@ func Check(awsClient *awslib.Client, k8sClient *k8s.Client, clusterName string) if err := parallel.RunFirstErr( func() error { var err error - operatorHealth, err = getDeploymentReadiness(k8sClient, "operator", "default") + operatorHealth, err = getDeploymentReadiness(k8sClient, "operator", consts.DefaultNamespace) return err }, func() error { var err error - controllerManagerHealth, err = getDeploymentReadiness(k8sClient, "operator-controller-manager", "default") + controllerManagerHealth, err = getDeploymentReadiness(k8sClient, "operator-controller-manager", consts.DefaultNamespace) return err }, func() error { var err error - prometheusHealth, err = getStatefulSetReadiness(k8sClient, "prometheus-prometheus", "default") + prometheusHealth, err = getStatefulSetReadiness(k8sClient, "prometheus-prometheus", consts.PrometheusNamespace) return err }, func() error { var err error - autoscalerHealth, err = getDeploymentReadiness(k8sClient, "autoscaler", "default") + autoscalerHealth, err = getDeploymentReadiness(k8sClient, "autoscaler", consts.DefaultNamespace) return err }, func() error { var err error - activatorHealth, err = getDeploymentReadiness(k8sClient, "activator", "default") + activatorHealth, err = getDeploymentReadiness(k8sClient, "activator", consts.DefaultNamespace) return err }, func() error { var err error - grafanaHealth, err = getStatefulSetReadiness(k8sClient, "grafana", "default") + grafanaHealth, err = getStatefulSetReadiness(k8sClient, "grafana", consts.DefaultNamespace) return err }, func() error { var err error - operatorGatewayHealth, err = getDeploymentReadiness(k8sClient, "ingressgateway-operator", "istio-system") + operatorGatewayHealth, err = getDeploymentReadiness(k8sClient, "ingressgateway-operator", consts.IstioNamespace) return err }, func() error { var err error - apisGatewayHealth, err = getDeploymentReadiness(k8sClient, "ingressgateway-apis", "istio-system") + apisGatewayHealth, err = getDeploymentReadiness(k8sClient, "ingressgateway-apis", consts.IstioNamespace) return err }, func() error { var err error - clusterAutoscalerHealth, err = getDeploymentReadiness(k8sClient, "cluster-autoscaler", "kube-system") + clusterAutoscalerHealth, err = getDeploymentReadiness(k8sClient, "cluster-autoscaler", consts.KubeSystemNamespace) return err }, func() error { @@ -161,32 +163,32 @@ func Check(awsClient *awslib.Client, k8sClient *k8s.Client, clusterName string) }, func() error { var err error - fluentBitHealth, err = getDaemonSetReadiness(k8sClient, "fluent-bit", "default") + fluentBitHealth, err = getDaemonSetReadiness(k8sClient, "fluent-bit", consts.LoggingNamespace) return err }, func() error { var err error - dcgmExporterHealth, err = getDaemonSetReadiness(k8sClient, "dcgm-exporter", "default") + dcgmExporterHealth, err = getDaemonSetReadiness(k8sClient, "dcgm-exporter", consts.PrometheusNamespace) return err }, func() error { var err error - nodeExporterHealth, err = getDaemonSetReadiness(k8sClient, "node-exporter", "default") + nodeExporterHealth, err = getDaemonSetReadiness(k8sClient, "node-exporter", consts.PrometheusNamespace) return err }, func() error { var err error - statsdExporterHealth, err = getDeploymentReadiness(k8sClient, "prometheus-statsd-exporter", "default") + statsdExporterHealth, err = getDeploymentReadiness(k8sClient, "prometheus-statsd-exporter", consts.PrometheusNamespace) return err }, func() error { var err error - eventExporterHealth, err = getDeploymentReadiness(k8sClient, "event-exporter", "default") + eventExporterHealth, err = getDeploymentReadiness(k8sClient, "event-exporter", consts.LoggingNamespace) return err }, func() error { var err error - kubeStateMetricsHealth, err = getDeploymentReadiness(k8sClient, "kube-state-metrics", "default") + kubeStateMetricsHealth, err = getDeploymentReadiness(k8sClient, "kube-state-metrics", consts.PrometheusNamespace) return err }, ); err != nil { @@ -217,7 +219,7 @@ func Check(awsClient *awslib.Client, k8sClient *k8s.Client, clusterName string) func GetWarnings(k8sClient *k8s.Client) (ClusterWarnings, error) { var prometheusMemorySaturationWarn string - saturation, err := getPodMemorySaturation(k8sClient, "prometheus-prometheus-0", "default") + saturation, err := getPodMemorySaturation(k8sClient, "prometheus-prometheus-0", consts.PrometheusNamespace) if err != nil { return ClusterWarnings{}, err } @@ -295,8 +297,11 @@ func getLoadBalancerHealth(awsClient *awslib.Client, clusterName string, loadBal func getPodMemorySaturation(k8sClient *k8s.Client, podName, namespace string) (float64, error) { ctx := context.Background() - pod, err := k8sClient.GetPod(podName) - if err != nil { + var pod kcore.Pod + if err := k8sClient.Get(ctx, ctrlclient.ObjectKey{ + Namespace: namespace, + Name: podName, + }, &pod); err != nil { return 0, err } diff --git a/pkg/workloads/k8s.go b/pkg/workloads/k8s.go index 06e0526472..0523c8cbfe 100644 --- a/pkg/workloads/k8s.go +++ b/pkg/workloads/k8s.go @@ -17,6 +17,7 @@ limitations under the License. package workloads import ( + "fmt" "path" "strings" @@ -60,14 +61,14 @@ const ( _clusterConfigDirVolume = "cluster-config" _clusterConfigConfigMap = "cluster-config" _clusterConfigDir = "/configs/cluster" - - _statsdAddress = "prometheus-statsd-exporter.default:9125" ) var ( _asyncGatewayCPURequest = kresource.MustParse("100m") _asyncGatewayMemRequest = kresource.MustParse("100Mi") + _statsdAddress = fmt.Sprintf("prometheus-statsd-exporter.%s:9125", consts.PrometheusNamespace) + // each Inferentia chip requires 128 HugePages with each HugePage having a size of 2Mi _hugePagesMemPerInf = int64(128 * 2 * 1024 * 1024) // bytes )