diff --git a/.github/workflows/pr-kubernetes-tests.yaml b/.github/workflows/pr-kubernetes-tests.yaml index d4af78516fd..7e7a9690bc7 100644 --- a/.github/workflows/pr-kubernetes-tests.yaml +++ b/.github/workflows/pr-kubernetes-tests.yaml @@ -84,7 +84,7 @@ jobs: # October 10, 2024: 12 minutes - cluster-name: 'cluster-six' go-test-args: '-v -timeout=25m' - go-test-run-regex: '^TestDiscoveryWatchlabels$$|^TestK8sGatewayNoValidation$$|^TestHelm$$|^TestHelmSettings$$|^TestK8sGatewayAws$$' + go-test-run-regex: '^TestDiscoveryWatchlabels$$|^TestK8sGatewayNoValidation$$|^TestHelm$$|^TestHelmSettings$$|^TestK8sGatewayAws$$|^TestZeroDowntimeRollout$$' # In our PR tests, we run the suite of tests using the upper ends of versions that we claim to support # The versions should mirror: https://docs.solo.io/gloo-edge/latest/reference/support/ diff --git a/pkg/utils/cmdutils/cmd.go b/pkg/utils/cmdutils/cmd.go index 1532d11df3a..b69980b384a 100644 --- a/pkg/utils/cmdutils/cmd.go +++ b/pkg/utils/cmdutils/cmd.go @@ -11,6 +11,17 @@ type Cmd interface { // It returns a *RunError if there is any error, nil otherwise Run() *RunError + // Start starts the command but doesn't block + // If the returned error is non-nil, it should be of type *RunError + Start() *RunError + + // Wait waits for the command to finish + // If the returned error is non-nil, it should be of type *RunError + Wait() *RunError + + // Output returns the output of the executed command + Output() []byte + // WithEnv sets the Env variables for the Cmd // Each entry should be of the form "key=value" WithEnv(...string) Cmd diff --git a/pkg/utils/cmdutils/local.go b/pkg/utils/cmdutils/local.go index ed0509e0b77..13d371c1990 100644 --- a/pkg/utils/cmdutils/local.go +++ b/pkg/utils/cmdutils/local.go @@ -29,8 +29,10 @@ type LocalCmder struct{} // Command returns a Cmd which includes the running process's `Environment` func (c *LocalCmder) Command(ctx context.Context, name string, arg ...string) Cmd { + var combinedOutput 
threadsafe.Buffer cmd := &LocalCmd{ - Cmd: exec.CommandContext(ctx, name, arg...), + Cmd: exec.CommandContext(ctx, name, arg...), + combinedOutput: &combinedOutput, } // By default, assign the env variables for the command @@ -41,6 +43,7 @@ func (c *LocalCmder) Command(ctx context.Context, name string, arg ...string) Cm // LocalCmd wraps os/exec.Cmd, implementing the cmdutils.Cmd interface type LocalCmd struct { *exec.Cmd + combinedOutput *threadsafe.Buffer } // WithEnv sets env @@ -93,3 +96,41 @@ } return nil } + +// Start starts the command but doesn't block +// If the returned error is non-nil, it should be of type *RunError +func (cmd *LocalCmd) Start() *RunError { + + cmd.Stdout = io.MultiWriter(cmd.Stdout, cmd.combinedOutput) + cmd.Stderr = io.MultiWriter(cmd.Stderr, cmd.combinedOutput) + + if err := cmd.Cmd.Start(); err != nil { + return &RunError{ + command: cmd.Args, + output: cmd.combinedOutput.Bytes(), + inner: err, + stackTrace: errors.WithStack(err), + } + } + return nil +} + +// Wait waits for the command to finish +// If the returned error is non-nil, it should be of type *RunError +func (cmd *LocalCmd) Wait() *RunError { + if err := cmd.Cmd.Wait(); err != nil { + return &RunError{ + command: cmd.Args, + output: cmd.combinedOutput.Bytes(), + inner: err, + stackTrace: errors.WithStack(err), + } + } + return nil +} + +// Output returns the combined stdout and stderr captured from the command so far +// It may be called while the command is running or after it has finished +func (cmd *LocalCmd) Output() []byte { + return cmd.combinedOutput.Bytes() +} diff --git a/test/kubernetes/e2e/features/zero_downtime_rollout/suite.go b/test/kubernetes/e2e/features/zero_downtime_rollout/suite.go new file mode 100644 index 00000000000..1890dda86ea --- /dev/null +++ b/test/kubernetes/e2e/features/zero_downtime_rollout/suite.go @@ -0,0 +1,126 @@ +package zero_downtime_rollout + +import ( + "context" + "net/http" + "time" + + . 
"github.com/onsi/gomega" + "github.com/stretchr/testify/suite" + + "github.com/solo-io/gloo/pkg/utils/kubeutils" + "github.com/solo-io/gloo/pkg/utils/requestutils/curl" + testmatchers "github.com/solo-io/gloo/test/gomega/matchers" + "github.com/solo-io/gloo/test/kubernetes/e2e" + "github.com/solo-io/gloo/test/kubernetes/e2e/defaults" + "github.com/solo-io/gloo/test/kubernetes/e2e/tests/base" +) + +type testingSuite struct { + *base.BaseTestingSuite +} + +func NewTestingSuite(ctx context.Context, testInst *e2e.TestInstallation) suite.TestingSuite { + return &testingSuite{ + base.NewBaseTestingSuite(ctx, testInst, e2e.MustTestHelper(ctx, testInst), base.SimpleTestCase{}, zeroDowntimeTestCases), + } +} + +func (s *testingSuite) TestZeroDowntimeRollout() { + // Ensure the gloo gateway pod is up and running + s.TestInstallation.Assertions.EventuallyRunningReplicas(s.Ctx, glooProxyObjectMeta, Equal(1)) + s.TestInstallation.Assertions.AssertEventualCurlResponse( + s.Ctx, + defaults.CurlPodExecOpt, + []curl.Option{ + curl.WithHost(kubeutils.ServiceFQDN(proxyService.ObjectMeta)), + curl.WithHostHeader("example.com"), + }, + &testmatchers.HttpResponse{ + StatusCode: http.StatusOK, + }) + + // Send traffic to the gloo gateway pod while we restart the deployment + // Run this for ~15s which is long enough to restart the deployment since there's no easy way + // to stop this command once the test is over + // This executes 600 req @ 40 req/sec (4 workers x 10 qps each) = 15s (2 * terminationGracePeriodSeconds (5) + buffer) + // kubectl exec -n hey hey -- hey -disable-keepalive -c 4 -q 10 --cpus 1 -n 600 -m GET -t 1 -host example.com http://gloo-proxy-gw.default.svc.cluster.local:8080 + args := []string{"exec", "-n", "hey", "hey", "--", "hey", "-disable-keepalive", "-c", "4", "-q", "10", "--cpus", "1", "-n", "600", "-m", "GET", "-t", "1", "-host", "example.com", "http://gloo-proxy-gw.default.svc.cluster.local:8080"} + + var err error + cmd := s.TestHelper.Cli.Command(s.Ctx, args...) 
+ err = cmd.Start() + Expect(err).ToNot(HaveOccurred()) + + // Restart the deployment. There should be no downtime since the gloo gateway pod should have the readiness probes configured + err = s.TestHelper.RestartDeploymentAndWait(s.Ctx, "gloo-proxy-gw") + Expect(err).ToNot(HaveOccurred()) + + time.Sleep(1 * time.Second) + + // We're just flexing at this point + err = s.TestHelper.RestartDeploymentAndWait(s.Ctx, "gloo-proxy-gw") + Expect(err).ToNot(HaveOccurred()) + + now := time.Now() + err = cmd.Wait() + Expect(err).ToNot(HaveOccurred()) + + // Since there's no easy way to stop the command after we've restarted the deployment, + // we ensure the command was still sending traffic for at least 1 second after the final restart completed + after := int(time.Now().Sub(now).Abs().Seconds()) + s.GreaterOrEqual(after, 1) + + // Summary: + // Total: 30.0113 secs + // Slowest: 0.0985 secs + // Fastest: 0.0025 secs + // Average: 0.0069 secs + // Requests/sec: 39.9849 + // + // Total data: 738000 bytes + // Size/request: 615 bytes + // + // Response time histogram: + // 0.003 [1] | + // 0.012 [1165] |■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ + // 0.022 [24] |■ + // 0.031 [4] | + // 0.041 [0] | + // 0.050 [0] | + // 0.060 [0] | + // 0.070 [0] | + // 0.079 [0] | + // 0.089 [1] | + // 0.098 [5] | + // + // Latency distribution: + // 10% in 0.0036 secs + // 25% in 0.0044 secs + // 50% in 0.0060 secs + // 75% in 0.0082 secs + // 90% in 0.0099 secs + // 95% in 0.0109 secs + // 99% in 0.0187 secs + // + // Details (average, fastest, slowest): + // DNS+dialup: 0.0028 secs, 0.0025 secs, 0.0985 secs + // DNS-lookup: 0.0016 secs, 0.0001 secs, 0.0116 secs + // req write: 0.0003 secs, 0.0001 secs, 0.0041 secs + // resp wait: 0.0034 secs, 0.0012 secs, 0.0782 secs + // resp read: 0.0003 secs, 0.0001 secs, 0.0039 secs + // + // Status code distribution: + // [200] 600 responses + // + // ***** Should not contain something like this ***** + // Status code distribution: + // [200] 579 responses + 
// Error distribution: + // [17] Get http://gloo-proxy-gw.default.svc.cluster.local:8080: dial tcp 10.96.177.91:8080: connection refused + // [4] Get http://gloo-proxy-gw.default.svc.cluster.local:8080: net/http: request canceled while waiting for connection + + // Verify that there were no errors + Expect(cmd.Output()).To(ContainSubstring("[200] 600 responses")) + Expect(cmd.Output()).ToNot(ContainSubstring("Error distribution")) +} diff --git a/test/kubernetes/e2e/features/zero_downtime_rollout/testdata/route-with-service.yaml b/test/kubernetes/e2e/features/zero_downtime_rollout/testdata/route-with-service.yaml new file mode 100644 index 00000000000..490dd34b4c3 --- /dev/null +++ b/test/kubernetes/e2e/features/zero_downtime_rollout/testdata/route-with-service.yaml @@ -0,0 +1,56 @@ +kind: Gateway +apiVersion: gateway.networking.k8s.io/v1 +metadata: + name: gw +spec: + gatewayClassName: gloo-gateway + listeners: + - protocol: HTTP + port: 8080 + name: http + allowedRoutes: + namespaces: + from: Same +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: example-route +spec: + parentRefs: + - name: gw + hostnames: + - "example.com" + rules: + - backendRefs: + - name: example-svc + port: 8080 +--- +apiVersion: v1 +kind: Namespace +metadata: + name: hey +--- +apiVersion: v1 +kind: Pod +metadata: + name: hey + namespace: hey + labels: + app: hey + version: v1 + app.kubernetes.io/name: hey +spec: + containers: + - name: hey + image: ricoli/hey + imagePullPolicy: IfNotPresent + command: + - "tail" + - "-f" + - "/dev/null" + resources: + requests: + cpu: "100m" + limits: + cpu: "200m" diff --git a/test/kubernetes/e2e/features/zero_downtime_rollout/testdata/service-for-route.yaml b/test/kubernetes/e2e/features/zero_downtime_rollout/testdata/service-for-route.yaml new file mode 100644 index 00000000000..8944dc7be68 --- /dev/null +++ b/test/kubernetes/e2e/features/zero_downtime_rollout/testdata/service-for-route.yaml @@ -0,0 +1,26 @@ +--- 
+apiVersion: v1 +kind: Service +metadata: + name: example-svc +spec: + selector: + app.kubernetes.io/name: nginx + ports: + - protocol: TCP + port: 8080 + targetPort: http-web-svc +--- +apiVersion: v1 +kind: Pod +metadata: + name: nginx + labels: + app.kubernetes.io/name: nginx +spec: + containers: + - name: nginx + image: nginx:stable + ports: + - containerPort: 80 + name: http-web-svc diff --git a/test/kubernetes/e2e/features/zero_downtime_rollout/types.go b/test/kubernetes/e2e/features/zero_downtime_rollout/types.go new file mode 100644 index 00000000000..286e5865239 --- /dev/null +++ b/test/kubernetes/e2e/features/zero_downtime_rollout/types.go @@ -0,0 +1,43 @@ +package zero_downtime_rollout + +import ( + "path/filepath" + + "github.com/solo-io/gloo/test/kubernetes/e2e/defaults" + "github.com/solo-io/gloo/test/kubernetes/e2e/tests/base" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/solo-io/skv2/codegen/util" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var ( + routeWithServiceManifest = filepath.Join(util.MustGetThisDir(), "testdata", "route-with-service.yaml") + serviceManifest = filepath.Join(util.MustGetThisDir(), "testdata", "service-for-route.yaml") + + glooProxyObjectMeta = metav1.ObjectMeta{ + Name: "gloo-proxy-gw", + Namespace: "default", + } + proxyDeployment = &appsv1.Deployment{ObjectMeta: glooProxyObjectMeta} + proxyService = &corev1.Service{ObjectMeta: glooProxyObjectMeta} + + heyPod = &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "hey", + Namespace: "hey", + }, + } + + zeroDowntimeTestCases = map[string]*base.TestCase{ + "TestZeroDowntimeRollout": { + SimpleTestCase: base.SimpleTestCase{ + Manifests: []string{defaults.CurlPodManifest, serviceManifest, routeWithServiceManifest}, + Resources: []client.Object{proxyDeployment, proxyService, defaults.CurlPod, heyPod}, + }, + }, + } +) diff --git a/test/kubernetes/e2e/tests/base/base_suite.go 
b/test/kubernetes/e2e/tests/base/base_suite.go index 6acf8bf8560..7d5dd011bce 100644 --- a/test/kubernetes/e2e/tests/base/base_suite.go +++ b/test/kubernetes/e2e/tests/base/base_suite.go @@ -2,6 +2,7 @@ package base import ( "context" + "fmt" "slices" "time" @@ -9,6 +10,8 @@ import ( "github.com/solo-io/gloo/test/kubernetes/e2e" "github.com/solo-io/gloo/test/kubernetes/testutils/helper" "github.com/stretchr/testify/suite" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -86,6 +89,14 @@ func (s *BaseTestingSuite) SetupSuite() { // Ensure the resources exist if s.Setup.Resources != nil { s.TestInstallation.Assertions.EventuallyObjectsExist(s.Ctx, s.Setup.Resources...) + + for _, resource := range s.Setup.Resources { + if pod, ok := resource.(*corev1.Pod); ok { + s.TestInstallation.Assertions.EventuallyPodsRunning(s.Ctx, pod.Namespace, v1.ListOptions{ + LabelSelector: fmt.Sprintf("app.kubernetes.io/name=%s", pod.Name), + }) + } + } } if s.Setup.UpgradeValues != "" { @@ -166,6 +177,15 @@ func (s *BaseTestingSuite) BeforeTest(suiteName, testName string) { }, 10*time.Second, 1*time.Second).Should(gomega.Succeed(), "can apply "+manifest) } s.TestInstallation.Assertions.EventuallyObjectsExist(s.Ctx, testCase.Resources...) 
+ + for _, resource := range testCase.Resources { + if pod, ok := resource.(*corev1.Pod); ok { + s.TestInstallation.Assertions.EventuallyPodsRunning(s.Ctx, pod.Namespace, v1.ListOptions{ + LabelSelector: fmt.Sprintf("app.kubernetes.io/name=%s", pod.Name), + }) + } + } + } func (s *BaseTestingSuite) AfterTest(suiteName, testName string) { diff --git a/test/kubernetes/e2e/tests/manifests/profiles/kubernetes-gateway.yaml b/test/kubernetes/e2e/tests/manifests/profiles/kubernetes-gateway.yaml index 89067f667a6..eaeb8c1e1c9 100644 --- a/test/kubernetes/e2e/tests/manifests/profiles/kubernetes-gateway.yaml +++ b/test/kubernetes/e2e/tests/manifests/profiles/kubernetes-gateway.yaml @@ -17,10 +17,10 @@ kubeGateway: gatewayParameters: glooGateway: podTemplate: - terminationGracePeriodSeconds: 7 + terminationGracePeriodSeconds: 5 gracefulShutdown: enabled: true - sleepTimeSeconds: 5 + sleepTimeSeconds: 2 probes: true livenessProbeEnabled: true diff --git a/test/kubernetes/e2e/tests/zero_downtime_test.go b/test/kubernetes/e2e/tests/zero_downtime_test.go new file mode 100644 index 00000000000..b7eca8c41a7 --- /dev/null +++ b/test/kubernetes/e2e/tests/zero_downtime_test.go @@ -0,0 +1,52 @@ +package tests_test + +import ( + "context" + "os" + "testing" + "time" + + "github.com/solo-io/gloo/pkg/utils/envutils" + "github.com/solo-io/gloo/test/kubernetes/e2e" + . 
"github.com/solo-io/gloo/test/kubernetes/e2e/tests" + "github.com/solo-io/gloo/test/kubernetes/testutils/gloogateway" + "github.com/solo-io/gloo/test/testutils" +) + +func TestZeroDowntimeRollout(t *testing.T) { + ctx := context.Background() + installNs, nsEnvPredefined := envutils.LookupOrDefault(testutils.InstallNamespace, "zero-downtime") + testInstallation := e2e.CreateTestInstallation( + t, + &gloogateway.Context{ + InstallNamespace: installNs, + ProfileValuesManifestFile: e2e.KubernetesGatewayProfilePath, + ValuesManifestFile: e2e.EmptyValuesManifestPath, + }, + ) + + testHelper := e2e.MustTestHelper(ctx, testInstallation) + + // Set the env to the install namespace if it is not already set + if !nsEnvPredefined { + os.Setenv(testutils.InstallNamespace, installNs) + } + + // We register the cleanup function _before_ we actually perform the installation. + // This allows us to uninstall Gloo Gateway, in case the original installation only completed partially + t.Cleanup(func() { + if !nsEnvPredefined { + os.Unsetenv(testutils.InstallNamespace) + } + if t.Failed() { + testInstallation.PreFailHandler(ctx) + } + + testInstallation.UninstallGlooGatewayWithTestHelper(ctx, testHelper) + }) + + // Install Gloo Gateway with correct validation settings + testInstallation.InstallGlooGatewayWithTestHelper(ctx, testHelper, 5*time.Minute) + + ZeroDowntimeRolloutSuiteRunner().Run(ctx, t, testInstallation) +} diff --git a/test/kubernetes/e2e/tests/zero_downtime_tests.go b/test/kubernetes/e2e/tests/zero_downtime_tests.go new file mode 100644 index 00000000000..3a502380169 --- /dev/null +++ b/test/kubernetes/e2e/tests/zero_downtime_tests.go @@ -0,0 +1,12 @@ +package tests + +import ( + "github.com/solo-io/gloo/test/kubernetes/e2e" + "github.com/solo-io/gloo/test/kubernetes/e2e/features/zero_downtime_rollout" +) + +func ZeroDowntimeRolloutSuiteRunner() e2e.SuiteRunner { + zeroDowntimeSuiteRunner := e2e.NewSuiteRunner(false) + 
zeroDowntimeSuiteRunner.Register("ZeroDowntimeRollout", zero_downtime_rollout.NewTestingSuite) + return zeroDowntimeSuiteRunner +}