Skip to content

Fix IstioD causing the Cortex cluster to no longer be reachable #2342

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into master from fix/istiod-oom
Jul 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
6485723
Limit istiod to 5 replicas and the default namespace
RobertLucian Jul 14, 2021
9e5dfaf
Move prometheus exporters and fluent bit to their dedicated namespace
RobertLucian Jul 14, 2021
060ebb9
Change the HPA's targets a bit
RobertLucian Jul 14, 2021
d65fafc
Redirect kubectl's output to /dev/null
RobertLucian Jul 14, 2021
4b96c20
Add logging namespace
RobertLucian Jul 14, 2021
0677208
Remove unnecessary namespace
RobertLucian Jul 14, 2021
bbcf4e3
Fixes
RobertLucian Jul 14, 2021
da66f7f
Refactoring a bit
RobertLucian Jul 14, 2021
56554fb
Fixes
RobertLucian Jul 14, 2021
77dc2e9
More refactoring
RobertLucian Jul 14, 2021
50249d8
Event exporter must be in logging namespace
RobertLucian Jul 14, 2021
23918df
Use consts where possible
RobertLucian Jul 14, 2021
a287457
Have all namespaces in a single yaml & revert temporary change
RobertLucian Jul 14, 2021
34045d4
Patch instead of applying the default namespace
RobertLucian Jul 15, 2021
7c13445
Merge branch 'master' into fix/istiod-oom
RobertLucian Jul 15, 2021
b24dff9
Fix waiting on the load balancer stage
RobertLucian Jul 15, 2021
6c9421d
More namespace fixes
RobertLucian Jul 15, 2021
7718df5
Merge branch 'master' into fix/istiod-oom
RobertLucian Jul 15, 2021
4f4daef
Create additional-scrape-configs in prometheus ns
RobertLucian Jul 15, 2021
2d572aa
Prometheus' service must be in prometheus ns
RobertLucian Jul 15, 2021
fde6ab0
Fix cortex cluster health cmd panicking
RobertLucian Jul 15, 2021
4c6b670
Fix getPodMemorySaturation function instead
RobertLucian Jul 15, 2021
49e1a25
Move all servicemonitors and podmonitors to prometheus ns
RobertLucian Jul 15, 2021
8699ee8
Address PR comments
RobertLucian Jul 15, 2021
15420b7
Merge branch 'master' into fix/istiod-oom
RobertLucian Jul 15, 2021
8b47d6e
Make lint
RobertLucian Jul 16, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ var _clusterConfigureCmd = &cobra.Command{
exit.Error(err)
}

k8sClient, err := k8s.New("default", false, restConfig, scheme)
k8sClient, err := k8s.New(consts.DefaultNamespace, false, restConfig, scheme)
if err != nil {
exit.Error(err)
}
Expand Down Expand Up @@ -804,7 +804,7 @@ var _clusterHealthCmd = &cobra.Command{
exit.Error(err)
}

k8sClient, err := k8s.New("default", false, restConfig, scheme)
k8sClient, err := k8s.New(consts.DefaultNamespace, false, restConfig, scheme)
if err != nil {
exit.Error(err)
}
Expand Down
17 changes: 12 additions & 5 deletions manager/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ function cluster_up() {
create_eks

echo -n "○ updating cluster configuration "
setup_namespaces
setup_configmap
echo "✓"

Expand Down Expand Up @@ -195,6 +196,12 @@ function write_kubeconfig() {
out=$(kubectl get pods 2>&1 || true); if [[ "$out" == *"must be logged in to the server"* ]]; then echo "error: your aws iam user does not have access to this cluster; to grant access, see https://docs.cortex.dev/v/${CORTEX_VERSION_MINOR}/"; exit 1; fi
}

function setup_namespaces() {
  # Label the built-in default namespace so it matches the istio-discovery: enabled
  # selector used by istiod's discoverySelectors. A patch is used instead of an apply
  # to prevent getting the kubectl.kubernetes.io/last-applied-configuration annotation warning.
  local discovery_label_patch='{"metadata": {"labels": {"istio-discovery": "enabled"}}}'
  kubectl patch namespace default --patch "${discovery_label_patch}" >/dev/null

  # Create the namespaces declared in the shared namespaces manifest.
  kubectl apply --filename manifests/namespaces.yaml >/dev/null
}

function setup_configmap() {
envsubst < manifests/default_cortex_cli_config.yaml > tmp_cli_config.yaml
kubectl -n=default create configmap 'client-config' \
Expand Down Expand Up @@ -227,7 +234,9 @@ function setup_prometheus() {
envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null
envsubst < manifests/prometheus-monitoring.yaml | kubectl apply -f - >/dev/null
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-additional-scrape-configs.yaml.j2 > prometheus-additional-scrape-configs.yaml
kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml
if ! kubectl get secret -n prometheus additional-scrape-configs >/dev/null 2>&1; then
kubectl create secret generic -n prometheus additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml > /dev/null
fi
}

function setup_grafana() {
Expand Down Expand Up @@ -360,8 +369,6 @@ function remove_nodegroups() {
}

function setup_istio() {
envsubst < manifests/istio-namespace.yaml | kubectl apply -f - >/dev/null

if ! grep -q "istio-customgateway-certs" <<< $(kubectl get secret -n istio-system); then
WEBSITE=localhost
openssl req -subj "/C=US/CN=$WEBSITE" -newkey rsa:2048 -nodes -keyout $WEBSITE.key -x509 -days 3650 -out $WEBSITE.crt >/dev/null 2>&1
Expand Down Expand Up @@ -530,8 +537,8 @@ function validate_cortex() {
fi

if [ "$prometheus_ready" == "" ]; then
readyReplicas=$(kubectl get statefulset -n default prometheus-prometheus -o jsonpath='{.status.readyReplicas}' 2> /dev/null)
desiredReplicas=$(kubectl get statefulset -n default prometheus-prometheus -o jsonpath='{.status.replicas}' 2> /dev/null)
readyReplicas=$(kubectl get statefulset -n prometheus prometheus-prometheus -o jsonpath='{.status.readyReplicas}' 2> /dev/null)
desiredReplicas=$(kubectl get statefulset -n prometheus prometheus-prometheus -o jsonpath='{.status.replicas}' 2> /dev/null)

if [ "$readyReplicas" != "" ] && [ "$desiredReplicas" != "" ]; then
if [ "$readyReplicas" == "$desiredReplicas" ]; then
Expand Down
2 changes: 1 addition & 1 deletion manager/manifests/autoscaler.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ spec:
args:
- "--in-cluster"
- "--port=8000"
- "--prometheus-url=http://prometheus.default:9090"
- "--prometheus-url=http://prometheus.prometheus:9090"
- "--namespace=default"
- "--cluster-config=/configs/cluster/cluster.yaml"
ports:
Expand Down
8 changes: 4 additions & 4 deletions manager/manifests/event-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
apiVersion: v1
kind: ServiceAccount
metadata:
namespace: default
namespace: logging
name: event-exporter

---
Expand All @@ -30,7 +30,7 @@ roleRef:
name: view
subjects:
- kind: ServiceAccount
namespace: default
namespace: logging
name: event-exporter

---
Expand All @@ -39,7 +39,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
name: event-exporter-config
namespace: default
namespace: logging
data:
config.yaml: |
logLevel: error
Expand All @@ -61,7 +61,7 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: event-exporter
namespace: default
namespace: logging
spec:
replicas: 1
selector:
Expand Down
8 changes: 4 additions & 4 deletions manager/manifests/fluent-bit.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: fluent-bit
namespace: default
namespace: logging
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
Expand All @@ -40,13 +40,13 @@ roleRef:
subjects:
- kind: ServiceAccount
name: fluent-bit
namespace: default
namespace: logging
---
apiVersion: v1
kind: ConfigMap
metadata:
name: fluent-bit-config
namespace: default
namespace: logging
labels:
k8s-app: fluent-bit
data:
Expand Down Expand Up @@ -186,7 +186,7 @@ apiVersion: apps/v1
kind: DaemonSet
metadata:
name: fluent-bit
namespace: default
namespace: logging
spec:
selector:
matchLabels:
Expand Down
2 changes: 1 addition & 1 deletion manager/manifests/grafana/grafana.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ data:
"name": "prometheus",
"orgId": 1,
"type": "prometheus",
"url": "http://prometheus.default:9090",
"url": "http://prometheus.prometheus:9090",
"version": 1,
"isDefault": true
}
Expand Down
26 changes: 23 additions & 3 deletions manager/manifests/istio.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ spec:
profile: minimal
hub: {{ env['CORTEX_IMAGE_ISTIO_PROXY_HUB'] }} # this is only used by proxy, since pilot overrides it (proxy doesn't have dedicated hub config)
tag: {{ env['CORTEX_IMAGE_ISTIO_PROXY_TAG'] }} # this is only used by proxy, since pilot overrides it (proxy doesn't have dedicated tag config)
meshConfig:
discoverySelectors:
- matchLabels:
istio-discovery: enabled
components:
pilot: # "pilot" refers to the istiod container
hub: {{ env['CORTEX_IMAGE_ISTIO_PILOT_HUB'] }}
Expand All @@ -26,7 +30,23 @@ spec:
resources:
requests:
cpu: 100m # default is 500m
memory: 200Mi # default is 2048Mi == 2Gi
memory: 700Mi # default is 2048Mi == 2Gi
hpaSpec:
minReplicas: 1
maxReplicas: 5
metrics:
- type: Resource
resource:
name: cpu
targetAverageUtilization: 90
- type: Resource
resource:
name: memory
targetAverageUtilization: 90
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: istiod
cni:
enabled: false
ingressGateways:
Expand Down Expand Up @@ -71,7 +91,7 @@ spec:
replicaCount: 1
hpaSpec:
minReplicas: 1
maxReplicas: 1 # edit autoscaleEnabled in values if increasing this
maxReplicas: 1
metrics:
- type: Resource
resource:
Expand Down Expand Up @@ -124,7 +144,7 @@ spec:
replicaCount: 1
hpaSpec:
minReplicas: 1
maxReplicas: 100 # edit autoscaleEnabled in values if increasing this
maxReplicas: 100
metrics:
- type: Resource
resource:
Expand Down
13 changes: 13 additions & 0 deletions manager/manifests/namespaces.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,16 @@ apiVersion: v1
kind: Namespace
metadata:
name: istio-system
---

apiVersion: v1
kind: Namespace
metadata:
name: logging
---

apiVersion: v1
kind: Namespace
metadata:
name: prometheus
---
11 changes: 3 additions & 8 deletions manager/manifests/prometheus-dcgm-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: v1
kind: Namespace
metadata:
name: monitoring
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: dcgm-exporter
namespace: default
namespace: prometheus
labels:
app.kubernetes.io/name: dcgm-exporter
app.kubernetes.io/instance: dcgm-exporter
Expand All @@ -31,7 +26,7 @@ apiVersion: apps/v1
kind: DaemonSet
metadata:
name: dcgm-exporter
namespace: default
namespace: prometheus
labels:
app.kubernetes.io/name: dcgm-exporter
app.kubernetes.io/instance: dcgm-exporter
Expand Down Expand Up @@ -106,7 +101,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dcgm-exporter
namespace: default
namespace: prometheus
labels:
monitoring.cortex.dev: dcgm-exporter
app.kubernetes.io/name: dcgm-exporter
Expand Down
8 changes: 4 additions & 4 deletions manager/manifests/prometheus-kube-state-metrics.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
name: kube-state-metrics
namespace: default
namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
Expand Down Expand Up @@ -180,13 +180,13 @@ roleRef:
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: default
namespace: prometheus
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: default
namespace: prometheus
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: "2.1.0"
Expand Down Expand Up @@ -245,7 +245,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: kube-state-metrics
namespace: default
namespace: prometheus
labels:
name: kube-state-metrics
monitoring.cortex.dev: kube-state-metrics
Expand Down
2 changes: 1 addition & 1 deletion manager/manifests/prometheus-kubelet-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ metadata:
k8s-app: kubelet
monitoring.cortex.dev: kubelet-exporter
name: kubelet
namespace: default
namespace: prometheus
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
Expand Down
10 changes: 9 additions & 1 deletion manager/manifests/prometheus-monitoring.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: prometheus
namespace: prometheus
spec:
image: $CORTEX_IMAGE_PROMETHEUS
serviceAccountName: prometheus
Expand Down Expand Up @@ -73,6 +74,7 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: prometheus

---

Expand Down Expand Up @@ -114,14 +116,15 @@ roleRef:
subjects:
- kind: ServiceAccount
name: prometheus
namespace: default
namespace: prometheus

---

apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: prometheus
spec:
type: ClusterIP
ports:
Expand All @@ -136,6 +139,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: istio-stats
namespace: prometheus
labels:
monitoring.cortex.dev: "istio"
spec:
Expand Down Expand Up @@ -187,6 +191,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: proxy-stats
namespace: prometheus
labels:
monitoring.cortex.dev: "proxy"
spec:
Expand Down Expand Up @@ -240,6 +245,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: async-stats
namespace: prometheus
labels:
monitoring.cortex.dev: "dequeuer-async"
spec:
Expand Down Expand Up @@ -294,6 +300,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: prometheus-statsd-exporter
namespace: prometheus
labels:
name: prometheus-statsd-exporter
monitoring.cortex.dev: "statsd-exporter"
Expand All @@ -320,6 +327,7 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: operator
namespace: prometheus
labels:
name: operator
monitoring.cortex.dev: "operator"
Expand Down
Loading