From e5371a9f86765586ea3cc02ade2f2757a742e3e7 Mon Sep 17 00:00:00 2001 From: panpan0000 Date: Tue, 27 Jul 2021 23:34:04 -0400 Subject: [PATCH] Enhance the detection logic for host iptables mode based on kubelet chain ----- this is a cherry-pick from https://github.com/kubernetes/release/pull/1548 ---- As we know, kube-proxy should ensure that its iptables runs in the same mode as the one in the host OS. But the current logic (the iptables-wrapper) sometimes does not work well. For example, we saw /issues/80462 a couple of times on Oracle Linux 8. I couldn't find the root cause at that time, but obviously the iptables-wrapper did not work at the beginning. Maybe some other iptables rules existed before kube-proxy ran. And when the iptables-wrapper does not work at the very beginning, the kube-proxy rules are created in legacy mode; then, even if kube-proxy reboots, the legacy-rules counter will always be greater than the nft-rules counter, so the wrapper script will always be misled (always run in legacy mode). Moreover, as stated in the original code: This assumes that some non-containerized process (eg # kubelet) has already created some iptables rules. So my fix just makes this assumption more explicit: from reading the kubelet code, the chains KUBE-MARK-DROP | KUBE-MARK-MASQ | KUBE-POSTROUTING are created when kubelet runs. So I put new logic at the top of the wrapper. Last but not least, there is still a chance for the wrapper to fail: assume we run the kube-proxy pod right after the kubelet service starts. It takes kubelet some seconds to walk through its code before it creates those chains; within this small time window, kube-proxy may already be misled. This failure case also applies to the original logic of the wrapper. Another solution is not "Auto-Detection" but "Specify the mode", i.e. to provide the information manually: if [ "${IPTABLE_MODE}" != "" ]; then mode=${IPTABLE_MODE} else ..... 
# following is the original logic But it requires kube-proxy to add an env variable in different environments, which is not very user friendly. Which issue(s) this PR fixes: kubernetes/kubernetes#80462 Moreover, Calico also suffers from this kind of issue: it is unable to auto-detect the correct iptables mode. https://github.com/projectcalico/calico/issues/3709 When the localnodedns pod starts before the calico-node pod, localnodedns always uses legacy mode, so when calico-node starts, legacy rules will win and Calico will be misled. --- iptables-wrapper-installer.sh | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/iptables-wrapper-installer.sh b/iptables-wrapper-installer.sh index 5e58c67855e..c4049b2b65f 100755 --- a/iptables-wrapper-installer.sh +++ b/iptables-wrapper-installer.sh @@ -109,12 +109,22 @@ set -eu # kubelet) has already created some iptables rules. EOF +cat >> "${sbin}/iptables-wrapper" <<EOF +num_legacy_lines=\$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep '^-' | wc -l) +chains_created_by_kubelet=":KUBE-MARK-DROP|:KUBE-MARK-MASQ|:KUBE-POSTROUTING" +kubelet_legacy_chains=\$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep -E \${chains_created_by_kubelet} | wc -l) + +EOF + + if [ "${need_timeout:-0}" = 0 ]; then # Write out the simpler version of legacy-vs-nft detection cat >> "${sbin}/iptables-wrapper" <<EOF -num_legacy_lines=\$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep '^-' | wc -l) +kubelet_nft_chains=\$( (iptables-nft-save || true; ip6tables-nft-save || true) 2>/dev/null | grep -E \${chains_created_by_kubelet} | wc -l) num_nft_lines=\$( (iptables-nft-save || true; ip6tables-nft-save || true) 2>/dev/null | grep '^-' | wc -l) -if [ "\${num_legacy_lines}" -ge "\${num_nft_lines}" ]; then +if [ "\${kubelet_nft_chains}" -gt "\${kubelet_legacy_chains}" ]; then + mode=nft +elif [ "\${num_legacy_lines}" -ge "\${num_nft_lines}" ]; then mode=legacy else mode=nft @@ -127,8 +137,10 @@ else # loop if nft is not available so we need to wrap a timeout around it # (and to avoid that, we don't even bother calling 
iptables-nft if it # looks like iptables-legacy is going to win). -num_legacy_lines=\$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep '^-' | wc -l) -if [ "\${num_legacy_lines}" -ge 10 ]; then +kubelet_nft_chains=\$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep -E \${chains_created_by_kubelet} | wc -l) +if [ "\${kubelet_nft_chains}" -gt "\${kubelet_legacy_chains}" ]; then + mode=nft +elif [ "\${num_legacy_lines}" -ge 10 ]; then mode=legacy else num_nft_lines=\$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep '^-' | wc -l)