Skip to content

Commit

Permalink
Fix Antrea IPsec e2e tests (antrea-io#1215)
Browse files Browse the repository at this point in the history
Fix Antrea IPsec e2e tests

The tests were passing even when IPsec support was broken and the
antrea-ipsec container was crashing on start. See antrea-io#1043.

We update the tests to run additional checks:
 * make sure the containers do not crash after updating the YAML (with
   IPsec -> without IPsec, and vice versa), by monitoring container
   status for 20 seconds
 * make sure a Security Association is created correctly between Nodes
   when checking inter-Node Pod connectivity

An alternative solution for the first item could have been to use
minReadySeconds for the Antrea Agent DaemonSet, but I didn't want to
modify the YAML manifest.

Fixes antrea-io#1043
  • Loading branch information
antoninbas authored Sep 10, 2020
1 parent e159995 commit 5636d3e
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 8 deletions.
16 changes: 16 additions & 0 deletions test/e2e/connectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,13 @@ func (data *TestData) redeployAntrea(t *testing.T, enableIPSec bool) {
t.Fatalf("Error when applying Antrea YAML: %v", err)
}

// After redeploying Antrea with / without IPsec, we wait for watchForRestartsDuration and
// count the number of container restarts. watchForRestartsDuration should be large enough
// to detect issues, e.g. if there is an issue with the antrea-ipsec container.
const watchForRestartsDuration = 20 * time.Second
timer := time.NewTimer(watchForRestartsDuration)
defer timer.Stop()

t.Logf("Waiting for all Antrea DaemonSet Pods")
if err := data.waitForAntreaDaemonSetPods(defaultTimeout); err != nil {
t.Fatalf("Error when restarting Antrea: %v", err)
Expand All @@ -218,6 +225,15 @@ func (data *TestData) redeployAntrea(t *testing.T, enableIPSec bool) {
if err := data.restartCoreDNSPods(defaultTimeout); err != nil {
t.Fatalf("Error when restarting CoreDNS Pods: %v", err)
}

<-timer.C
containerRestarts, err := data.getAgentContainersRestartCount()
if err != nil {
t.Fatalf("Cannot retrieve number of container restarts across Agent Pods: %v", err)
}
if containerRestarts > 0 {
t.Errorf("Unexpected container restarts (%d) after deploying new YAML", containerRestarts)
}
}

// TestPodConnectivityAfterAntreaRestart checks that restarting antrea-agent does not create
Expand Down
36 changes: 28 additions & 8 deletions test/e2e/framework.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,25 @@ func (data *TestData) deployAntreaFlowExporter(ipfixCollector string) error {
}, false, true)
}

// getAgentContainersRestartCount lists the Pods in antreaNamespace that match the
// "app=antrea,component=antrea-agent" label selector and adds up the RestartCount of every
// entry found in their Status.ContainerStatuses. It returns that total, or an error if the
// Pods cannot be listed.
func (data *TestData) getAgentContainersRestartCount() (int, error) {
	agentPods, err := data.clientset.CoreV1().Pods(antreaNamespace).List(
		context.TODO(),
		metav1.ListOptions{LabelSelector: "app=antrea,component=antrea-agent"},
	)
	if err != nil {
		return 0, fmt.Errorf("failed to list antrea-agent Pods: %v", err)
	}

	// Accumulate restarts over every container status of every listed Pod.
	var total int
	for i := range agentPods.Items {
		statuses := agentPods.Items[i].Status.ContainerStatuses
		for j := range statuses {
			total += int(statuses[j].RestartCount)
		}
	}
	return total, nil
}

// waitForAntreaDaemonSetPods waits for the K8s apiserver to report that all the Antrea Pods are
// available, i.e. all the Nodes have one or more of the Antrea daemon Pods running and available.
func (data *TestData) waitForAntreaDaemonSetPods(timeout time.Duration) error {
Expand Down Expand Up @@ -344,13 +363,14 @@ func (data *TestData) waitForAntreaDaemonSetPods(timeout time.Duration) error {
} else if err != nil {
return err
}

return nil
}

// waitForCoreDNSPods waits for the K8s apiserver to report that all the CoreDNS Pods are available.
func (data *TestData) waitForCoreDNSPods(timeout time.Duration) error {
err := wait.PollImmediate(1*time.Second, timeout, func() (bool, error) {
deployment, err := data.clientset.AppsV1().Deployments(antreaNamespace).Get(context.TODO(), "coredns", metav1.GetOptions{})
deployment, err := data.clientset.AppsV1().Deployments("kube-system").Get(context.TODO(), "coredns", metav1.GetOptions{})
if err != nil {
return false, fmt.Errorf("error when retrieving CoreDNS deployment: %v", err)
}
Expand Down Expand Up @@ -668,7 +688,7 @@ func (data *TestData) deleteAntreaAgentOnNode(nodeName string, gracePeriodSecond
}
// we do not use DeleteCollection directly because we want to ensure the resources no longer
// exist by the time we return
pods, err := data.clientset.CoreV1().Pods("kube-system").List(context.TODO(), listOptions)
pods, err := data.clientset.CoreV1().Pods(antreaNamespace).List(context.TODO(), listOptions)
if err != nil {
return 0, fmt.Errorf("failed to list antrea-agent Pods on Node '%s': %v", nodeName, err)
}
Expand All @@ -681,13 +701,13 @@ func (data *TestData) deleteAntreaAgentOnNode(nodeName string, gracePeriodSecond
}

start := time.Now()
if err := data.clientset.CoreV1().Pods("kube-system").DeleteCollection(context.TODO(), deleteOptions, listOptions); err != nil {
if err := data.clientset.CoreV1().Pods(antreaNamespace).DeleteCollection(context.TODO(), deleteOptions, listOptions); err != nil {
return 0, fmt.Errorf("error when deleting antrea-agent Pods on Node '%s': %v", nodeName, err)
}

if err := wait.Poll(1*time.Second, timeout, func() (bool, error) {
for _, pod := range pods.Items {
if _, err := data.clientset.CoreV1().Pods("kube-system").Get(context.TODO(), pod.Name, metav1.GetOptions{}); err != nil {
if _, err := data.clientset.CoreV1().Pods(antreaNamespace).Get(context.TODO(), pod.Name, metav1.GetOptions{}); err != nil {
if errors.IsNotFound(err) {
continue
}
Expand All @@ -705,7 +725,7 @@ func (data *TestData) deleteAntreaAgentOnNode(nodeName string, gracePeriodSecond

// wait for new antrea-agent Pod
if err := wait.Poll(1*time.Second, timeout, func() (bool, error) {
pods, err := data.clientset.CoreV1().Pods("kube-system").List(context.TODO(), listOptions)
pods, err := data.clientset.CoreV1().Pods(antreaNamespace).List(context.TODO(), listOptions)
if err != nil {
return false, fmt.Errorf("failed to list antrea-agent Pods on Node '%s': %v", nodeName, err)
}
Expand Down Expand Up @@ -774,7 +794,7 @@ func (data *TestData) restartAntreaControllerPod(timeout time.Duration) (*corev1
var newPod *corev1.Pod
// wait for new antrea-controller Pod
if err := wait.Poll(1*time.Second, timeout, func() (bool, error) {
pods, err := data.clientset.CoreV1().Pods("kube-system").List(context.TODO(), listOptions)
pods, err := data.clientset.CoreV1().Pods(antreaNamespace).List(context.TODO(), listOptions)
if err != nil {
return false, fmt.Errorf("failed to list antrea-controller Pods: %v", err)
}
Expand Down Expand Up @@ -1042,13 +1062,13 @@ func (data *TestData) doesOVSPortExist(antreaPodName string, portName string) (b
}

func (data *TestData) GetEncapMode() (config.TrafficEncapModeType, error) {
mapList, err := data.clientset.CoreV1().ConfigMaps("kube-system").List(context.TODO(), metav1.ListOptions{})
mapList, err := data.clientset.CoreV1().ConfigMaps(antreaNamespace).List(context.TODO(), metav1.ListOptions{})
if err != nil {
return config.TrafficEncapModeInvalid, err
}
for _, m := range mapList.Items {
if strings.HasPrefix(m.Name, "antrea-config") {
configMap, err := data.clientset.CoreV1().ConfigMaps("kube-system").Get(context.TODO(), m.Name, metav1.GetOptions{})
configMap, err := data.clientset.CoreV1().ConfigMaps(antreaNamespace).Get(context.TODO(), m.Name, metav1.GetOptions{})
if err != nil {
return config.TrafficEncapModeInvalid, err
}
Expand Down
42 changes: 42 additions & 0 deletions test/e2e/ipsec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
package e2e

import (
"fmt"
"regexp"
"strconv"
"testing"
"time"

Expand All @@ -23,6 +26,34 @@ import (
"github.com/vmware-tanzu/antrea/pkg/agent/util"
)

// securityAssociationsRegexp matches the "Security Associations (<n> up, <m> connecting)" text
// printed by 'ipsec status' and captures both counters. It is compiled once at package
// initialization rather than on every call.
var securityAssociationsRegexp = regexp.MustCompile(`Security Associations \((\d+) up, (\d+) connecting\)`)

// parseSecurityAssociationsStatus extracts the number of 'up' and 'connecting' Security
// Associations from the given 'ipsec status' output. It returns an error if the expected text
// is not found or if one of the counters cannot be parsed as a 32-bit unsigned integer.
func parseSecurityAssociationsStatus(output string) (up int, connecting int, err error) {
	matches := securityAssociationsRegexp.FindStringSubmatch(output)
	if len(matches) == 0 {
		return 0, 0, fmt.Errorf("unexpected 'ipsec status' output: %s", output)
	}
	// matches[0] is the full match; matches[1] and matches[2] are the two captured counters.
	upCount, err := strconv.ParseUint(matches[1], 10, 32)
	if err != nil {
		return 0, 0, fmt.Errorf("error when retrieving 'up' SAs from 'ipsec status' output: %v", err)
	}
	connectingCount, err := strconv.ParseUint(matches[2], 10, 32)
	if err != nil {
		return 0, 0, fmt.Errorf("error when retrieving 'connecting' SAs from 'ipsec status' output: %v", err)
	}
	return int(upCount), int(connectingCount), nil
}

// readSecurityAssociationsStatus runs 'ipsec status' in the "antrea-ipsec" container of the Pod
// returned by getAntreaPodOnNode for nodeName, and returns the number of Security Associations
// reported as 'up' and as 'connecting'. An error is returned if the Pod lookup fails, if the
// command fails, or if the command output cannot be parsed.
func (data *TestData) readSecurityAssociationsStatus(nodeName string) (up int, connecting int, err error) {
	antreaPodName, err := data.getAntreaPodOnNode(nodeName)
	if err != nil {
		return 0, 0, err
	}
	cmd := []string{"ipsec", "status"}
	stdout, stderr, err := data.runCommandFromPod(antreaNamespace, antreaPodName, "antrea-ipsec", cmd)
	if err != nil {
		return 0, 0, fmt.Errorf("error when running 'ipsec status' on '%s': %v - stdout: %s - stderr: %s", nodeName, err, stdout, stderr)
	}
	return parseSecurityAssociationsStatus(stdout)
}

// TestIPSecTunnelConnectivity checks that Pod traffic flows across two Nodes over
// the IPSec tunnel, by creating multiple Pods across distinct Nodes and having
// them ping each other.
Expand All @@ -41,6 +72,17 @@ func TestIPSecTunnelConnectivity(t *testing.T) {

data.testPodConnectivityDifferentNodes(t)

// We know that testPodConnectivityDifferentNodes always creates a Pod on Node 0 for the
// inter-Node ping test.
nodeName := nodeName(0)
if up, _, err := data.readSecurityAssociationsStatus(nodeName); err != nil {
t.Errorf("Error when reading Security Associations: %v", err)
} else if up == 0 {
t.Errorf("Expected at least one 'up' Security Association, but got %d", up)
} else {
t.Logf("Found %d 'up' SecurityAssociation(s) for Node '%s'", up, nodeName)
}

// Restore normal Antrea deployment with IPSec disabled.
data.redeployAntrea(t, false)
}
Expand Down

0 comments on commit 5636d3e

Please sign in to comment.