Skip to content

Commit

Permalink
Add Close/WaitCtx to UtilityVM & System
Browse files Browse the repository at this point in the history
Add `CloseCtx` and `WaitCtx` methods to `UtilityVM` and `System`, which
accept a context parameter and return if the context is canceled.

This is intended to allow benchmark iterations to time out and prevent
them from spending the majority of their time waiting.

However, the added benefit is that tracing information (trace and span
ID) will now be passed along to the `Wait` and `Close` logs (and
underlying HCS call spans).

Additionally, fix a bug in `(*UtilityVM).Close`, where, if the uVM was
created but not started, then the `(*UtilityVM).Wait` call will
hang indefinitely.
Fix is to wait initially on the system to close, close the IO output
handler, and then wait on the uVM.

Combine LCOW uVM benchmarks together (similar to LCOW container) to
simplify benchmark name formatting.

Relies on #1875

Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>
  • Loading branch information
helsaawy committed Oct 17, 2023
1 parent aad7467 commit 5758c44
Show file tree
Hide file tree
Showing 17 changed files with 443 additions and 191 deletions.
3 changes: 2 additions & 1 deletion cmd/containerd-shim-runhcs-v1/task_hcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ func (ht *hcsTask) waitForHostExit() {
defer span.End()
span.AddAttributes(trace.StringAttribute("tid", ht.id))

err := ht.host.Wait()
err := ht.host.WaitCtx(ctx)
if err != nil {
log.G(ctx).WithError(err).Error("failed to wait for host virtual machine exit")
} else {
Expand Down Expand Up @@ -661,6 +661,7 @@ func (ht *hcsTask) close(ctx context.Context) {
// testing.
if ht.c != nil {
// Do our best attempt to tear down the container.
// TODO: unify timeout select statements and use [ht.c.WaitCtx] and [context.WithTimeout]
var werr error
ch := make(chan struct{})
go func() {
Expand Down
2 changes: 1 addition & 1 deletion cmd/containerd-shim-runhcs-v1/task_wcow_podsandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ func (wpst *wcowPodSandboxTask) waitParentExit() {
defer span.End()
span.AddAttributes(trace.StringAttribute("tid", wpst.id))

werr := wpst.host.Wait()
werr := wpst.host.WaitCtx(ctx)
if werr != nil {
log.G(ctx).WithError(werr).Error("parent wait failed")
}
Expand Down
31 changes: 25 additions & 6 deletions internal/hcs/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,11 +304,22 @@ func (computeSystem *System) WaitError() error {
return computeSystem.waitError
}

// Wait synchronously waits for the compute system to shutdown or terminate. If
// the compute system has already exited returns the previous error (if any).
// Wait synchronously waits for the compute system to shutdown or terminate.
// If the compute system has already exited returns the previous error (if any).
func (computeSystem *System) Wait() error {
<-computeSystem.WaitChannel()
return computeSystem.WaitError()
return computeSystem.WaitCtx(context.Background())
}

// WaitCtx synchronously waits for the compute system to shutdown or terminate, or the context to be cancelled.
//
// See [System.Wait] for more information.
func (computeSystem *System) WaitCtx(ctx context.Context) error {
select {
case <-computeSystem.WaitChannel():
return computeSystem.WaitError()
case <-ctx.Done():
return ctx.Err()
}
}

// stopped returns true if the compute system stopped.
Expand Down Expand Up @@ -735,9 +746,17 @@ func (computeSystem *System) OpenProcess(ctx context.Context, pid int) (*Process
}

// Close cleans up any state associated with the compute system but does not terminate or wait for it.
func (computeSystem *System) Close() (err error) {
func (computeSystem *System) Close() error {
return computeSystem.CloseCtx(context.Background())
}

// CloseCtx is similar to [System.Close], but accepts a context.
//
// The context is used for all operations, including waits, so timeouts/cancellations may prevent
// proper system cleanup.
func (computeSystem *System) CloseCtx(ctx context.Context) (err error) {
operation := "hcs::System::Close"
ctx, span := oc.StartSpan(context.Background(), operation)
ctx, span := oc.StartSpan(ctx, operation)
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(trace.StringAttribute("cid", computeSystem.id))
Expand Down
34 changes: 27 additions & 7 deletions internal/uvm/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ func (uvm *UtilityVM) create(ctx context.Context, doc interface{}) error {
defer func() {
if system != nil {
_ = system.Terminate(ctx)
_ = system.Wait()
_ = system.WaitCtx(ctx)
}
}()

Expand All @@ -208,17 +208,29 @@ func (uvm *UtilityVM) create(ctx context.Context, doc interface{}) error {
}

// Close terminates and releases resources associated with the utility VM.
func (uvm *UtilityVM) Close() (err error) {
ctx, span := oc.StartSpan(context.Background(), "uvm::Close")
func (uvm *UtilityVM) Close() error { return uvm.CloseCtx(context.Background()) }

// CloseCtx is similar to [UtilityVM.Close], but accepts a context.
//
// The context is used for all operations, including waits, so timeouts/cancellations may prevent
// proper uVM cleanup.
func (uvm *UtilityVM) CloseCtx(ctx context.Context) (err error) {
ctx, span := oc.StartSpan(ctx, "uvm::Close")
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(trace.StringAttribute(logfields.UVMID, uvm.id))

// TODO: check if uVM already closed

windows.Close(uvm.vmmemProcess)

if uvm.hcsSystem != nil {
_ = uvm.hcsSystem.Terminate(ctx)
_ = uvm.Wait()
// uvm.Wait() waits on <-uvm.outputProcessingDone, which may not be closed until below
// (for a Create -> Stop without a Start), or uvm.outputHandler may be blocked on IO and
// take a while to close.
// In either case, we want to wait on the system closing, not IO completion.
_ = uvm.hcsSystem.WaitCtx(ctx)
}

if err := uvm.CloseGCSConnection(); err != nil {
Expand All @@ -234,16 +246,24 @@ func (uvm *UtilityVM) Close() (err error) {
uvm.outputListener = nil
}

if uvm.hcsSystem != nil {
// wait for IO to finish
// [WaitCtx] calls [uvm.hcsSystem.WaitCtx] again, but since we waited on it above already
// it should nop and return without issue.
_ = uvm.WaitCtx(ctx)
}

if uvm.confidentialUVMOptions != nil && uvm.confidentialUVMOptions.GuestStateFile != "" {
vmgsFullPath := filepath.Join(uvm.confidentialUVMOptions.BundleDirectory, uvm.confidentialUVMOptions.GuestStateFile)
log.G(context.Background()).WithField("VMGS file", vmgsFullPath).Debug("removing VMGS file")
e := log.G(ctx).WithField("VMGS file", vmgsFullPath)
e.Debug("removing VMGS file")
if err := os.Remove(vmgsFullPath); err != nil {
log.G(ctx).WithError(err).Error("failed to remove VMGS file")
e.WithError(err).Error("failed to remove VMGS file")
}
}

if uvm.hcsSystem != nil {
return uvm.hcsSystem.Close()
return uvm.hcsSystem.CloseCtx(ctx)
}

return nil
Expand Down
7 changes: 5 additions & 2 deletions internal/uvm/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,16 @@ func (uvm *UtilityVM) Start(ctx context.Context) (err error) {
// use parent context, to prevent 2 minute timout (set above) from overridding terminate operation's
// timeout and erroring out prematurely
_ = uvm.hcsSystem.Terminate(pCtx)
_ = uvm.hcsSystem.Wait()
_ = uvm.hcsSystem.WaitCtx(pCtx)
}
}()

// Start waiting on the utility VM.
go func() {
err := uvm.hcsSystem.Wait()
// the original context may have timeout or propagate a cancellation
// copy the original to prevent it affecting the background wait go routine
cCtx := log.Copy(context.Background(), pCtx)
err := uvm.hcsSystem.WaitCtx(cCtx)
if err == nil {
err = uvm.hcsSystem.ExitError()
}
Expand Down
27 changes: 22 additions & 5 deletions internal/uvm/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,34 @@
package uvm

import (
"github.com/Microsoft/hcsshim/internal/logfields"
"context"
"fmt"

"github.com/sirupsen/logrus"

"github.com/Microsoft/hcsshim/internal/logfields"
)

// Wait waits synchronously for a utility VM to terminate.
func (uvm *UtilityVM) Wait() error {
err := uvm.hcsSystem.Wait()
func (uvm *UtilityVM) Wait() error { return uvm.WaitCtx(context.Background()) }

// Wait waits synchronously for a utility VM to terminate, or the context to be cancelled.
func (uvm *UtilityVM) WaitCtx(ctx context.Context) (err error) {
err = uvm.hcsSystem.WaitCtx(ctx)

logrus.WithField(logfields.UVMID, uvm.id).Debug("uvm exited, waiting for output processing to complete")
e := logrus.WithField(logfields.UVMID, uvm.id)
e.Debug("uvm exited, waiting for output processing to complete")
if uvm.outputProcessingDone != nil {
<-uvm.outputProcessingDone
select {
case <-uvm.outputProcessingDone:
case <-ctx.Done():
if err2 := ctx.Err(); err2 != nil {
e.WithError(err2).Warning("failed to wait on uvm output processing")
if err == nil {
err = fmt.Errorf("failed to wait on uvm output processing: %w", err2)
}
}
}
}

return err
Expand Down
147 changes: 76 additions & 71 deletions test/functional/lcow_bench_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,93 +9,98 @@ import (

"github.com/Microsoft/hcsshim/osversion"

"github.com/Microsoft/hcsshim/test/internal/util"
"github.com/Microsoft/hcsshim/test/pkg/require"
"github.com/Microsoft/hcsshim/test/pkg/uvm"
)

func BenchmarkLCOW_UVM_Create(b *testing.B) {
func BenchmarkLCOW_UVM(b *testing.B) {
requireFeatures(b, featureLCOW)
require.Build(b, osversion.RS5)

ctx := context.Background()
pCtx := context.Background()

b.StopTimer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
opts := defaultLCOWOptions(b)

b.StartTimer()
vm := uvm.CreateLCOW(ctx, b, opts)
b.Run("Create", func(b *testing.B) {
b.StopTimer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ctx, cancel := context.WithTimeout(pCtx, benchmarkIterationTimeout)

// vm.Close() hangs unless the vm was started
cleanup := uvm.Start(ctx, b, vm)
cleanup()
}
}
opts := defaultLCOWOptions(b)
opts.ID += util.RandNameSuffix(i)

func BenchmarkLCOW_UVM_Start(b *testing.B) {
requireFeatures(b, featureLCOW)
require.Build(b, osversion.RS5)
b.StartTimer()
_, cleanup := uvm.CreateLCOW(ctx, b, opts)
b.StopTimer()

ctx := context.Background()

b.StopTimer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
vm := uvm.CreateLCOW(ctx, b, defaultLCOWOptions(b))

b.StartTimer()
if err := vm.Start(ctx); err != nil {
b.Fatalf("could not start UVM: %v", err)
cleanup(ctx)
cancel()
}
b.StopTimer()

vm.Close()
}
}

func BenchmarkLCOW_UVM_Kill(b *testing.B) {
requireFeatures(b, featureLCOW)
require.Build(b, osversion.RS5)

ctx := context.Background()

b.StopTimer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
vm := uvm.CreateLCOW(ctx, b, defaultLCOWOptions(b))
cleanup := uvm.Start(ctx, b, vm)
})

b.StartTimer()
uvm.Kill(ctx, b, vm)
if err := vm.Wait(); err != nil {
b.Fatalf("could not kill uvm %q: %v", vm.ID(), err)
}
b.Run("Start", func(b *testing.B) {
b.StopTimer()

cleanup()
}
}

func BenchmarkLCOW_UVM_Close(b *testing.B) {
requireFeatures(b, featureLCOW)
require.Build(b, osversion.RS5)

ctx := context.Background()

b.StopTimer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
vm := uvm.CreateLCOW(ctx, b, defaultLCOWOptions(b))
cleanup := uvm.Start(ctx, b, vm)

b.StartTimer()
if err := vm.Close(); err != nil {
b.Fatalf("could not kill uvm %q: %v", vm.ID(), err)
b.ResetTimer()
for i := 0; i < b.N; i++ {
ctx, cancel := context.WithTimeout(pCtx, benchmarkIterationTimeout)

opts := defaultLCOWOptions(b)
opts.ID += util.RandNameSuffix(i)
vm, cleanup := uvm.CreateLCOW(ctx, b, opts)

b.StartTimer()
if err := vm.Start(ctx); err != nil {
b.Fatalf("could not start UVM: %v", err)
}
b.StopTimer()

cleanup(ctx)
cancel()
}
})

b.Run("Kill", func(b *testing.B) {
b.StopTimer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ctx, cancel := context.WithTimeout(pCtx, benchmarkIterationTimeout)

opts := defaultLCOWOptions(b)
opts.ID += util.RandNameSuffix(i)
vm, cleanup := uvm.CreateLCOW(ctx, b, opts)
uvm.Start(ctx, b, vm)

b.StartTimer()
uvm.Kill(ctx, b, vm)
if err := vm.WaitCtx(ctx); err != nil {
b.Fatalf("could not kill uvm %q: %v", vm.ID(), err)
}
b.StopTimer()

cleanup(ctx)
cancel()
}
})

cleanup()
}
b.Run("Close", func(b *testing.B) {
b.StopTimer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ctx, cancel := context.WithTimeout(pCtx, benchmarkIterationTimeout)

opts := defaultLCOWOptions(b)
opts.ID += util.RandNameSuffix(i)
vm, cleanup := uvm.CreateLCOW(ctx, b, opts)
uvm.Start(ctx, b, vm)

b.StartTimer()
if err := vm.Close(); err != nil {
b.Fatalf("could not kill uvm %q: %v", vm.ID(), err)
}
b.StopTimer()

cleanup(ctx)
cancel()
}
})
}
Loading

0 comments on commit 5758c44

Please sign in to comment.