Skip to content

Commit

Permalink
n
Browse files Browse the repository at this point in the history
  • Loading branch information
matti committed May 23, 2024
1 parent 750e130 commit c616672
Showing 1 changed file with 52 additions and 36 deletions.
88 changes: 52 additions & 36 deletions app/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
#!/usr/bin/env bash
# k8s-node-watchdog entrypoint: per-node startup fixes + periodic node maintenance.
# -e: exit on error, -E: make ERR traps inherit into functions/subshells,
# -u: error on unset vars, -o pipefail: fail a pipeline if any stage fails.
set -eEuo pipefail

# Print a blank line followed by a "++ "-prefixed message, used as a
# visual progress marker throughout this script.
_output() {
  echo ""
  echo "++ $*"
}

echo "start;$(date);$(uptime)" >> /k8s-node-watchdog/log

Expand All @@ -25,12 +30,12 @@ case "$PROVIDER" in
echo "fixing node-problem-detector by installing the missing python"
(
exec nsenter -t 1 -m -u -i -n -- apt-get update
) 2>&1 >/tmp/apt-get-update.log
) >/tmp/apt-get-update.log 2>&1
echo "apt-get update ok"

(
exec nsenter -t 1 -m -u -i -n -- apt-get install -y python
) 2>&1 >/tmp/apt-get-install-python.log
) >/tmp/apt-get-install-python.log 2>&1
echo "apt-get install -y python ok"
fi
;;
Expand All @@ -39,57 +44,68 @@ case "$PROVIDER" in
aws)
# AWS nodes need no one-time startup fixes; per-iteration work happens below.
echo "no startup tasks for aws"
;;
unknown)
# Unrecognized provider: linger briefly so the failure is visible in logs,
# then exit non-zero so the pod is restarted/flagged.
echo "unsupported provider"
sleep 60
exit 1
;;
esac

# Main watchdog loop. The inner `while true` exists only so the per-iteration
# work can `break` out early (e.g. API not reachable, nodegroup label missing)
# and fall through to the sleep before retrying.
while true; do
  while true; do
    # Host liveness probe: run uptime in the host's namespaces. Failure is
    # logged but non-fatal (the ERR path is informational only).
    _output "nsenter uptime ..."
    if ! nsenter -t 1 -m -u -i -n -- uptime; then
      _output "nsenter failed to uptime, strange"
    fi

    # Kube API liveness probe; if unreachable, retry on the next iteration.
    _output "kubectl get node ..."
    if ! kubectl get node >/dev/null; then
      _output "failed, will try again"
      break
    fi

    _output "running"
    case "${PROVIDER}" in
      aws)
        # https://github.com/weaveworks/eksctl/issues/2363#issuecomment-830651744
        # kubelet rejects --node-labels in the node-role.kubernetes.io/*
        # namespace ("--node-labels in the 'kubernetes.io' namespace must begin
        # with an allowed prefix ..."), so the role label is applied here,
        # after the node has joined, instead of via kubelet flags.

        # Read the EKS nodegroup name from this node's labels; `|| true`
        # keeps a transient kubectl failure from killing the script under -e.
        _output "get this node ..."
        nodegroup=$(kubectl get node "${NODE_HOSTNAME}" --output=jsonpath='{.metadata.labels.eks\.amazonaws\.com\/nodegroup}' || true)

        if [ "$nodegroup" = "" ]; then
          _output "nodegroup not found, will try again"
          break
        fi

        _output "labeling node ..."
        kubectl label node "${NODE_HOSTNAME}" --overwrite=true "node-role.kubernetes.io/${nodegroup}=yes" || _output "node labeling failed"

        # Typo fix: message previously said "codedns".
        _output "annotating coredns to safe-to-evict ..."
        kubectl annotate pod --overwrite=true -n kube-system -l eks.amazonaws.com/component=coredns "cluster-autoscaler.kubernetes.io/safe-to-evict=true" || _output "annotating coredns failed"

        _output "patching coredns tolerations ..."
        kubectl patch deployment -n kube-system coredns --patch-file /app/tolerations.yml || _output "patching coredns tolerations failed"

        # nowadays set to maxUnavailable: 1
        #kubectl apply -f /app/coredns-pdb.yml || echo "coredns pdb apply failed"

        # see https://github.com/aws/amazon-vpc-cni-k8s/issues/1930
        #kubectl patch daemonset -n kube-system aws-node --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/livenessProbe/initialDelaySeconds", "value":1}]'

        #kubectl patch deployment -n kube-system coredns --patch-file /app/coredns-topologyspreadconstraints.yml || echo "patching coredns topologySpreadConstraints failed"
        #kubectl autoscale deployment coredns -n kube-system --cpu-percent=5 --min=2 --max=9 || echo "autoscale coredns apply failed"
        ;;
      unknown)
        : # no per-iteration work for the "unknown" provider
        ;;
    esac

    # Single-pass inner loop: always exit it here and fall through to sleep.
    break
  done

  sleep 15
done

0 comments on commit c616672

Please sign in to comment.