Skip to content

Commit

Permalink
Reduce the number of VMs on TestMultipleVMs_Isolated
Browse files Browse the repository at this point in the history
The test is still unstable (see firecracker-microvm#581), and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <katokazu@amazon.com>
  • Loading branch information
kzys committed May 23, 2022
1 parent 956c44a commit 46f3c1e
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 18 deletions.
21 changes: 19 additions & 2 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,21 +73,38 @@ steps:
- make test-in-docker
timeout_in_minutes: 10

- label: ":rotating_light: :running_shirt_with_sash: runtime isolated tests"
- label: ":running: runtime isolated tests"
agents:
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
env:
DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER"
NUMBER_OF_VMS: 100
NUMBER_OF_VMS: 20
EXTRAGOARGS: "-v -count=1 -race -timeout=1h"
FICD_DM_VOLUME_GROUP: fcci-vg
artifact_paths:
- "runtime/logs/*"
command:
- make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_runtime

- label: ":weight_lifter: running stress tests"
concurrency_group: stress
concurrency: 1
agents:
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
env:
DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER"
NUMBER_OF_VMS: 100
EXTRAGOARGS: "-v -count=1 -race -timeout=1h -run TestMultipleVMs_Isolated"
FICD_DM_VOLUME_GROUP: fcci-vg
artifact_paths:
- "runtime/logs/*"
command:
- make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_stress

- label: ":rotating_light: :exclamation: example tests"
agents:
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
Expand Down
2 changes: 1 addition & 1 deletion runtime/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ INTEG_TESTNAMES=$(shell docker run --rm \
--entrypoint=/bin/bash \
--workdir=/src/runtime \
$(FIRECRACKER_CONTAINERD_TEST_IMAGE):$(DOCKER_IMAGE_TAG) \
-c "go test -list . | sed '$$d' | grep $(INTEG_TEST_SUFFIX)")
-c "go test -list . | sed '$$d' | grep $(INTEG_TEST_SUFFIX)" | grep Multi)

all: runtime

Expand Down
51 changes: 36 additions & 15 deletions runtime/service_integ_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,22 +242,44 @@ func createTapDevice(ctx context.Context, tapName string) error {
func TestMultipleVMs_Isolated(t *testing.T) {
integtest.Prepare(t)

// This test starts multiple VMs and some may hit firecracker-containerd's
// default timeout. So overriding the timeout to wait longer.
// One hour should be enough to start a VM, regardless of the load of
// the underlying host.
const createVMTimeout = time.Hour

netns, err := ns.GetCurrentNS()
require.NoError(t, err, "failed to get a namespace")
var err error

// numberOfVmsEnvName = NUMBER_OF_VMS ENV and is configurable from buildkite
numberOfVms := defaultNumberOfVms
if str := os.Getenv(numberOfVmsEnvName); str != "" {
numberOfVms, err = strconv.Atoi(str)
require.NoError(t, err, "failed to get NUMBER_OF_VMS env")
}
t.Logf("TestMultipleVMs_Isolated: will run %d vm's", numberOfVms)
t.Logf("TestMultipleVMs_Isolated: will run up to %d VMs", numberOfVms)

// We should be able to run 10 VMs without any issues.
if numberOfVms <= 10 {
testMultipleVMs(t, 10)
return
}

// We have issues running 100 VMs (see #581).
// Incrementally increase the number of VMs to find the breaking point.
for n := 0; n <= numberOfVms; n += 10 {
success := t.Run(fmt.Sprintf("VMs=%d", n), func(t *testing.T) {
testMultipleVMs(t, n)
})
if !success {
// If running N VMs doesn't work, no point to go further.
return
}
}
}

func testMultipleVMs(t *testing.T, count int) {
// This test starts multiple VMs and some may hit firecracker-containerd's
// default timeout. So overriding the timeout to wait longer.
// One hour should be enough to start a VM, regardless of the load of
// the underlying host.
const createVMTimeout = 1 * time.Hour

netns, err := ns.GetCurrentNS()
require.NoError(t, err, "failed to get a namespace")

tapPrefix := os.Getenv(tapPrefixEnvName)

Expand Down Expand Up @@ -303,7 +325,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
// container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container
// print the mac address it sees inside its VM.
vmEg, vmEgCtx := errgroup.WithContext(testCtx)
for i := 0; i < numberOfVms; i++ {
for i := 0; i < count; i++ {
caseTypeNumber := i % len(cases)
vmID := i
c := cases[caseTypeNumber]
Expand Down Expand Up @@ -349,6 +371,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
if err != nil {
return err
}
defer fcClient.Close()

resp, createVMErr := fcClient.CreateVM(ctx, req)
if createVMErr != nil {
Expand Down Expand Up @@ -423,12 +446,10 @@ func TestMultipleVMs_Isolated(t *testing.T) {
if err != nil {
return fmt.Errorf("unexpected error from the containers in VM %d: %w", vmID, err)
}
t.Logf("all containers in VM %d are stopped", vmID)

_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: strconv.Itoa(vmID), TimeoutSeconds: 5})
if err != nil {
return err
}
return nil
return err
}

vmEg.Go(func() error {
Expand Down Expand Up @@ -478,7 +499,7 @@ func testMultipleExecs(
if err != nil {
return err
}
defer newContainer.Delete(ctx)
defer newContainer.Delete(ctx, containerd.WithSnapshotCleanup)

var taskStdout bytes.Buffer
var taskStderr bytes.Buffer
Expand Down

0 comments on commit 46f3c1e

Please sign in to comment.