From ed02f5de52a31d426190e4e62d790a7cab717ab5 Mon Sep 17 00:00:00 2001
From: Ilya Maximets
Date: Thu, 17 Oct 2024 11:04:36 -0400
Subject: [PATCH] tests: ipsec: Add NxN + reconciliation test.

Add a test to check establishment of IPsec connections among multiple
nodes and check the reconciliation logic along the way.

The test:
 - Creates 20 network namespaces.
 - Starts Libreswan, OVS and ovs-monitor-ipsec in each of them.
 - Adds a geneve tunnel from each namespace to every other namespace.
 - Checks that each namespace has all the IPsec connections loaded.
 - Removes a few connections manually.
 - Checks that these connections are added back.

Unfortunately, many widely used versions of Libreswan suffer from
frequent pluto crashes. For that reason the test tries to bring pluto
back online once it finds a dead one.

Also, since retransmit-timeout is 60 seconds and our command timeout
is 120, we can't actually use the OVS_WAIT_UNTIL macro most of the
time, so the checks are done in a custom loop that waits up to 300
seconds.

Signed-off-by: Ilya Maximets
---
 tests/system-ipsec.at | 151 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 138 insertions(+), 13 deletions(-)

diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at
index 1e155fecea3..5aa67bf1d01 100644
--- a/tests/system-ipsec.at
+++ b/tests/system-ipsec.at
@@ -8,6 +8,24 @@ m4_define([IPSEC_SETUP_UNDERLAY],
       dnl Set up the underlay switch
       AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])])
 
+dnl START_PLUTO([namespace])
+dnl
+dnl Starts the Libreswan pluto daemon in 'namespace', keeping its config,
+dnl secrets, NSS database, log and run directory under $ovs_base/namespace.
+dnl Removes any old pluto.pid first. Also used by WAIT_FOR_LOADED_CONNS to
+dnl start pluto again after it has died.
+m4_define([START_PLUTO], [
+    rm -f $ovs_base/$1/pluto.pid
+    mkdir -p $ovs_base/$1/ipsec.d
+    touch $ovs_base/$1/ipsec.conf
+    touch $ovs_base/$1/secrets
+    ipsec initnss --nssdir $ovs_base/$1/ipsec.d
+    NS_CHECK_EXEC([$1], [ipsec pluto --config $ovs_base/$1/ipsec.conf \
+        --ipsecdir $ovs_base/$1 --nssdir $ovs_base/$1/ipsec.d \
+        --logfile $ovs_base/$1/pluto.log --secretsfile $ovs_base/$1/secrets \
+        --rundir $ovs_base/$1], [0], [], [stderr])
+])
+
 dnl IPSEC_ADD_NODE([namespace], [device], [address], [peer address]))
 dnl
 dnl Creates a dummy host that acts as an IPsec endpoint. Creates host in
@@ -45,15 +63,8 @@ m4_define([IPSEC_ADD_NODE],
       on_exit "kill_ovs_vswitchd `cat $ovs_base/$1/vswitchd.pid`"
 
       dnl Start pluto
-      mkdir -p $ovs_base/$1/ipsec.d
-      touch $ovs_base/$1/ipsec.conf
-      touch $ovs_base/$1/secrets
-      ipsec initnss --nssdir $ovs_base/$1/ipsec.d
-      NS_CHECK_EXEC([$1], [ipsec pluto --config $ovs_base/$1/ipsec.conf \
-          --ipsecdir $ovs_base/$1 --nssdir $ovs_base/$1/ipsec.d \
-          --logfile $ovs_base/$1/pluto.log --secretsfile $ovs_base/$1/secrets \
-          --rundir $ovs_base/$1], [0], [], [stderr])
-      on_exit "kill `cat $ovs_base/$1/pluto.pid`"
+      START_PLUTO([$1])
+      on_exit 'kill $(cat $ovs_base/$1/pluto.pid)'
 
       dnl Start ovs-monitor-ipsec
       NS_CHECK_EXEC([$1], [ovs-monitor-ipsec unix:${OVS_RUNDIR}/$1/db.sock\
@@ -110,16 +121,18 @@ m4_define([CHECK_LIBRESWAN],
 dnl IPSEC_STATUS_LOADED([])
 dnl
 dnl Get number of loaded connections from ipsec status
-m4_define([IPSEC_STATUS_LOADED], [ipsec --rundir $ovs_base/$1 status | \
+m4_define([IPSEC_STATUS_LOADED], [
+    ipsec --rundir $ovs_base/$1 status | \
            grep "Total IPsec connections" | \
-           sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\1/m'])
+           sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-9]]*\), active \([[0-9]]*\).*/\1/m'])
 
 dnl IPSEC_STATUS_ACTIVE([])
 dnl
 dnl Get number of active connections from ipsec status
-m4_define([IPSEC_STATUS_ACTIVE], [ipsec --rundir $ovs_base/$1 status | \
+m4_define([IPSEC_STATUS_ACTIVE], [
+    ipsec --rundir $ovs_base/$1 status | \
            grep "Total IPsec connections" | \
-           sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\2/m'])
+           sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-9]]*\), active \([[0-9]]*\).*/\2/m'])
 
 dnl CHECK_ESP_TRAFFIC()
 dnl
@@ -401,3 +414,115 @@ CHECK_ESP_TRAFFIC
 
 OVS_TRAFFIC_VSWITCHD_STOP()
 AT_CLEANUP
+
+AT_SETUP([IPsec -- Libreswan NxN geneve tunnels + reconciliation])
+AT_KEYWORDS([ipsec libreswan scale reconciliation])
+dnl Note: Geneve test may not work on older kernels due to CVE-2020-25645
+dnl https://bugzilla.redhat.com/show_bug.cgi?id=1883988
+
+CHECK_LIBRESWAN()
+OVS_TRAFFIC_VSWITCHD_START()
+IPSEC_SETUP_UNDERLAY()
+
+m4_define([NODES], [20])
+
+dnl Set up fake hosts.
+m4_for([id], [1], NODES, [1], [
+    IPSEC_ADD_NODE([node-id], [p-id], 10.1.1.id, 10.1.1.254)
+    AT_CHECK([ovs-pki -b -d ${ovs_base} -l ${ovs_base}/ovs-pki.log \
+                req -u node-id], [0], [stdout])
+    AT_CHECK([ovs-pki -b -d ${ovs_base} -l ${ovs_base}/ovs-pki.log \
+                self-sign node-id], [0], [stdout])
+    AT_CHECK(OVS_VSCTL([node-id], set Open_vSwitch . \
+                other_config:certificate=${ovs_base}/node-id-cert.pem \
+                other_config:private_key=${ovs_base}/node-id-privkey.pem),
+             [0], [ignore], [ignore])
+    on_exit "ipsec --rundir $ovs_base/node-id status > $ovs_base/node-id/status"
+])
+
+dnl Create a full mesh of tunnels.
+m4_for([LEFT], [1], NODES, [1], [
+    m4_for([RIGHT], [1], NODES, [1], [
+        if test LEFT -ne RIGHT; then
+            AT_CHECK(OVS_VSCTL(node-LEFT, add-port br-ipsec tun-RIGHT \
+                       -- set Interface tun-RIGHT type=geneve options:remote_ip=10.1.1.RIGHT \
+                          options:remote_cert=${ovs_base}/node-RIGHT-cert.pem),
+                     [0], [ignore], [ignore])
+        fi
+])])
+
+dnl WAIT_FOR_LOADED_CONNS()
+dnl
+dnl For each node in turn, polls 'ipsec status' until pluto reports
+dnl (NODES - 1) * 2 loaded connections, sleeping 3 seconds between polls and
+dnl restarting pluto if it died. Fails the test after 100 polls of one node.
+m4_define([WAIT_FOR_LOADED_CONNS], [
+    m4_for([id], [1], NODES, [1], [
+        echo "================== node-id ========================="
+        iterations=0
+        loaded=0
+        dnl Using a custom loop instead of OVS_WAIT_UNTIL, because it may take
+        dnl much longer than a default timeout. The default retransmit timeout
+        dnl for pluto is 60 seconds. Also, we need to make sure pluto didn't
+        dnl crash in the process and revive it if it did, unfortunately.
+        while true; do
+            date
+            AT_CHECK([ipsec --rundir $ovs_base/node-id status 2>&1 \
+                        | grep -E "whack|Total"], [ignore], [stdout])
+            if grep -E 'is Pluto running?|refused' stdout; then
+                echo "node-id: Pluto died, restarting..."
+                START_PLUTO([node-id])
+            else
+                loaded=$(IPSEC_STATUS_LOADED(node-id))
+            fi
+            dnl An empty status result is treated as 0 loaded connections, so
+            dnl that a failed query causes a retry rather than an early exit.
+            if test "${loaded:-0}" -ne $(( (NODES - 1) * 2 )); then
+                sleep 3
+            else
+                break
+            fi
+            iterations=$(( iterations + 1 ))
+            AT_CHECK([test $iterations -lt 100])
+        done
+    ])
+])
+
+dnl Wait for all the connections to be loaded to pluto. Not waiting for
+dnl them to become active, because if pluto is down on one of the nodes,
+dnl some connections may not become active until we revive it. Some
+dnl connections may also never become active due to bugs in libreswan 4.x.
+WAIT_FOR_LOADED_CONNS()
+
+AT_CHECK([ipsec auto --help], [ignore], [ignore], [stderr])
+auto=auto
+if test -s stderr; then
+    auto=
+fi
+
+dnl Remove connections for two tunnels. One fully and one partially.
+AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
+                      --config $ovs_base/node-1/ipsec.conf \
+                      --delete tun-5-out-1], [0], [stdout])
+AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
+                      --config $ovs_base/node-1/ipsec.conf \
+                      --delete tun-2-in-1], [0], [stdout])
+AT_CHECK([ipsec $auto --ctlsocket $ovs_base/node-1/pluto.ctl \
+                      --config $ovs_base/node-1/ipsec.conf \
+                      --delete tun-2-out-1], [0], [stdout])
+
+dnl Wait for the monitor to notice the missing connections.
+OVS_WAIT_UNTIL([grep -q 'tun-2.*need to reconcile' \
+                $ovs_base/node-1/ovs-monitor-ipsec.log])
+
+dnl Wait for all the connections to be loaded back.
+WAIT_FOR_LOADED_CONNS()
+
+dnl These are not necessary, but nice to have in the test log in
+dnl order to spot pluto failures during the test.
+grep -E 'timed out|outdated|half-loaded|defunct' \
+        $ovs_base/node-*/ovs-monitor-ipsec.log
+grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log
+
+OVS_TRAFFIC_VSWITCHD_STOP()
+AT_CLEANUP