diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index a644a9023..3bc0c2a66 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -73,14 +73,14 @@ steps:
       - make test-in-docker
     timeout_in_minutes: 10
 
-  - label: ":rotating_light: :running_shirt_with_sash: runtime isolated tests"
+  - label: ":running: runtime isolated tests"
     agents:
       queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
       distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
       hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
     env:
       DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER"
-      NUMBER_OF_VMS: 100
+      NUMBER_OF_VMS: 20
       EXTRAGOARGS: "-v -count=1 -race -timeout=1h"
       FICD_DM_VOLUME_GROUP: fcci-vg
     artifact_paths:
@@ -88,6 +88,23 @@ steps:
     command:
       - make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_runtime
 
+  - label: ":weight_lifter: running stress tests"
+    concurrency_group: stress
+    concurrency: 1
+    agents:
+      queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
+      distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
+      hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
+    env:
+      DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER"
+      NUMBER_OF_VMS: 100
+      EXTRAGOARGS: "-v -count=1 -race -timeout=1h -run TestMultipleVMs_Isolated"
+      FICD_DM_VOLUME_GROUP: fcci-vg
+    artifact_paths:
+      - "runtime/logs/*"
+    command:
+      - make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_stress
+
   - label: ":rotating_light: :exclamation: example tests"
     agents:
       queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
diff --git a/runtime/Makefile b/runtime/Makefile
index 47777018f..05836aa3a 100644
--- a/runtime/Makefile
+++ b/runtime/Makefile
@@ -33,7 +33,7 @@ INTEG_TESTNAMES=$(shell docker run --rm \
 	--entrypoint=/bin/bash \
 	--workdir=/src/runtime \
 	$(FIRECRACKER_CONTAINERD_TEST_IMAGE):$(DOCKER_IMAGE_TAG) \
-	-c "go test -list . | sed '$$d' | grep $(INTEG_TEST_SUFFIX)")
+	-c "go test -list . | sed '$$d' | grep $(INTEG_TEST_SUFFIX)" | grep Multi)
 
 all: runtime
diff --git a/runtime/service_integ_test.go b/runtime/service_integ_test.go
index 64685a7eb..ae99b8950 100644
--- a/runtime/service_integ_test.go
+++ b/runtime/service_integ_test.go
@@ -242,14 +242,7 @@ func createTapDevice(ctx context.Context, tapName string) error {
 func TestMultipleVMs_Isolated(t *testing.T) {
 	integtest.Prepare(t)
 
-	// This test starts multiple VMs and some may hit firecracker-containerd's
-	// default timeout. So overriding the timeout to wait longer.
-	// One hour should be enough to start a VM, regardless of the load of
-	// the underlying host.
-	const createVMTimeout = time.Hour
-
-	netns, err := ns.GetCurrentNS()
-	require.NoError(t, err, "failed to get a namespace")
+	var err error
 
 	// numberOfVmsEnvName = NUMBER_OF_VMS ENV and is configurable from buildkite
 	numberOfVms := defaultNumberOfVms
@@ -257,7 +250,36 @@ func TestMultipleVMs_Isolated(t *testing.T) {
 		numberOfVms, err = strconv.Atoi(str)
 		require.NoError(t, err, "failed to get NUMBER_OF_VMS env")
 	}
-	t.Logf("TestMultipleVMs_Isolated: will run %d vm's", numberOfVms)
+	t.Logf("TestMultipleVMs_Isolated: will run up to %d VMs", numberOfVms)
+
+	// We should be able to run 10 VMs without any issues.
+	if numberOfVms <= 10 {
+		testMultipleVMs(t, 10)
+		return
+	}
+
+	// We have issues running 100 VMs (see #581).
+	// Incrementally increase the number of VMs to find the breaking point.
+	for n := 0; n <= numberOfVms; n += 10 {
+		success := t.Run(fmt.Sprintf("VMs=%d", n), func(t *testing.T) {
+			testMultipleVMs(t, n)
+		})
+		if !success {
+			// If running N VMs doesn't work, no point to go further.
+			return
+		}
+	}
+}
+
+func testMultipleVMs(t *testing.T, count int) {
+	// This test starts multiple VMs and some may hit firecracker-containerd's
+	// default timeout. So overriding the timeout to wait longer.
+	// One hour should be enough to start a VM, regardless of the load of
+	// the underlying host.
+	const createVMTimeout = 1 * time.Hour
+
+	netns, err := ns.GetCurrentNS()
+	require.NoError(t, err, "failed to get a namespace")
 
 	tapPrefix := os.Getenv(tapPrefixEnvName)
 
@@ -303,7 +325,7 @@
 	// container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container
 	// print the mac address it sees inside its VM.
 	vmEg, vmEgCtx := errgroup.WithContext(testCtx)
-	for i := 0; i < numberOfVms; i++ {
+	for i := 0; i < count; i++ {
 		caseTypeNumber := i % len(cases)
 		vmID := i
 		c := cases[caseTypeNumber]
@@ -349,6 +371,7 @@
 		if err != nil {
 			return err
 		}
+		defer fcClient.Close()
 
 		resp, createVMErr := fcClient.CreateVM(ctx, req)
 		if createVMErr != nil {
@@ -423,12 +446,10 @@
 		if err != nil {
 			return fmt.Errorf("unexpected error from the containers in VM %d: %w", vmID, err)
 		}
+		t.Logf("all containers in VM %d are stopped", vmID)
 
 		_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: strconv.Itoa(vmID), TimeoutSeconds: 5})
-		if err != nil {
-			return err
-		}
-		return nil
+		return err
 	}
 
 	vmEg.Go(func() error {
@@ -478,7 +499,7 @@ func testMultipleExecs(
 	if err != nil {
 		return err
 	}
-	defer newContainer.Delete(ctx)
+	defer newContainer.Delete(ctx, containerd.WithSnapshotCleanup)
 
 	var taskStdout bytes.Buffer
 	var taskStderr bytes.Buffer