Skip to content

Commit

Permalink
Add reconnect logic for stdio pipes
Browse files Browse the repository at this point in the history
This change adds retry logic on the stdio relay if the server end of the named pipe
disconnects. This is a common case if containerd restarts, for example.
The current approach is to make an io.Writer wrapper that handles the
reconnection logic on a write failure if it can be determined that the error
is from a disconnect. A new shim config option is exposed to tailor the retry timeout.

This change also adds cenkalti/backoff/v4 as a dependency to be used for handling
exponential backoff logic for the stdio connection retry attempts. Retrying
at a fixed interval is a bit naive as all of the shims would potentially
be trying to reconnect to 3 pipes continuously all in <timeout> bursts.
Exponential backoff allows us to space out the connections, set an upper limit on timeout
intervals and add an element of randomness to the retry attempts.

Signed-off-by: Daniel Canter <dcanter@microsoft.com>
  • Loading branch information
dcantah committed Oct 29, 2021
1 parent af3d660 commit 573c137
Show file tree
Hide file tree
Showing 51 changed files with 1,862 additions and 198 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
with:
version: v1.42.1 # Has fixes for stylecheck configuration https://github.com/golangci/golangci-lint/pull/2017/files
args: --timeout=5m -v
only-new-issues: true

verify-main-vendor:
runs-on: 'windows-2019'
Expand All @@ -40,7 +41,7 @@ jobs:
Write-Error "Main modules are not up to date. Please validate your go version >= this job's and run `go mod vendor` followed by `go mod tidy` in the repo root path."
}
exit $process.ExitCode
verify-test-vendor:
runs-on: 'windows-2019'
env:
Expand Down
2 changes: 1 addition & 1 deletion .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ issues:
linters:
- stylecheck
Text: "ST1003:"

- path: cmd\\ncproxy\\nodenetsvc\\
linters:
- stylecheck
Expand Down
156 changes: 96 additions & 60 deletions cmd/containerd-shim-runhcs-v1/options/runhcs.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions cmd/containerd-shim-runhcs-v1/options/runhcs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ message Options {
// logrus log levels: "trace", "debug", "info", "warn", "error", "fatal", "panic". This setting will override
// the `debug` field if both are specified, unless the level specified is also "debug", as these are equivalent.
string log_level = 16;

// io_retry_timeout_in_sec is the timeout in seconds for how long to try and reconnect to an upstream IO provider if a connection is lost.
// The typical example is if Containerd has restarted but is expected to come back online. A 0 for this field is interpreted as an infinite
// timeout.
int32 io_retry_timeout_in_sec = 17;
}

// ProcessDetails contains additional information about a process. This is the additional
Expand Down
57 changes: 40 additions & 17 deletions cmd/containerd-shim-runhcs-v1/task_hcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,6 @@ func newHcsTask(
owner := filepath.Base(os.Args[0])
isTemplate := oci.ParseAnnotationsSaveAsTemplate(ctx, s)

io, err := cmd.NewUpstreamIO(ctx, req.ID, req.Stdout, req.Stderr, req.Stdin, req.Terminal)
if err != nil {
return nil, err
}

var netNS string
if s.Windows != nil &&
s.Windows.Network != nil {
Expand All @@ -186,22 +181,33 @@ func newHcsTask(
shimOpts = v.(*runhcsopts.Options)
}

// Default to an infinite timeout (zero value)
var ioRetryTimeout time.Duration
if shimOpts != nil {
ioRetryTimeout = time.Duration(shimOpts.IoRetryTimeoutInSec) * time.Second
}
io, err := cmd.NewUpstreamIO(ctx, req.ID, req.Stdout, req.Stderr, req.Stdin, req.Terminal, ioRetryTimeout)
if err != nil {
return nil, err
}

container, resources, err := createContainer(ctx, req.ID, owner, netNS, s, parent, shimOpts)
if err != nil {
return nil, err
}

ht := &hcsTask{
events: events,
id: req.ID,
isWCOW: oci.IsWCOW(s),
c: container,
cr: resources,
ownsHost: ownsParent,
host: parent,
closed: make(chan struct{}),
taskSpec: s,
isTemplate: isTemplate,
events: events,
id: req.ID,
isWCOW: oci.IsWCOW(s),
c: container,
cr: resources,
ownsHost: ownsParent,
host: parent,
closed: make(chan struct{}),
taskSpec: s,
isTemplate: isTemplate,
ioRetryTimeout: ioRetryTimeout,
}
ht.init = newHcsExec(
ctx,
Expand Down Expand Up @@ -279,7 +285,21 @@ func newClonedHcsTask(
return nil, fmt.Errorf("cloned task can only be created inside a windows host")
}

io, err := cmd.NewNpipeIO(ctx, req.Stdin, req.Stdout, req.Stderr, req.Terminal)
var shimOpts *runhcsopts.Options
if req.Options != nil {
v, err := typeurl.UnmarshalAny(req.Options)
if err != nil {
return nil, err
}
shimOpts = v.(*runhcsopts.Options)
}

// Default to an infinite timeout (zero value)
var ioRetryTimeout time.Duration
if shimOpts != nil {
ioRetryTimeout = time.Duration(shimOpts.IoRetryTimeoutInSec) * time.Second
}
io, err := cmd.NewNpipeIO(ctx, req.Stdin, req.Stdout, req.Stderr, req.Terminal, ioRetryTimeout)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -433,6 +453,9 @@ type hcsTask struct {

// taskSpec represents the spec/configuration for this task.
taskSpec *specs.Spec

// ioRetryTimeout is the time for how long to try reconnecting to stdio pipes from containerd.
ioRetryTimeout time.Duration
}

func (ht *hcsTask) ID() string {
Expand All @@ -453,7 +476,7 @@ func (ht *hcsTask) CreateExec(ctx context.Context, req *task.ExecProcessRequest,
return errors.Wrapf(errdefs.ErrFailedPrecondition, "exec: '' in task: '%s' must be running to create additional execs", ht.id)
}

io, err := cmd.NewUpstreamIO(ctx, req.ID, req.Stdout, req.Stderr, req.Stdin, req.Terminal)
io, err := cmd.NewUpstreamIO(ctx, req.ID, req.Stdout, req.Stderr, req.Stdin, req.Terminal, ht.ioRetryTimeout)
if err != nil {
return err
}
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.13
require (
github.com/BurntSushi/toml v0.3.1
github.com/Microsoft/go-winio v0.4.17
github.com/cenkalti/backoff/v4 v4.1.1
github.com/containerd/cgroups v1.0.1
github.com/containerd/console v1.0.2
github.com/containerd/containerd v1.5.7
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ github.com/buger/jsonparser v0.0.0-20180808090653-f4dd9f5a6b44/go.mod h1:bbYlZJ7
github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd/go.mod h1:2oa8nejYd4cQ/b0hMIopN0lCRxU0bueqREvZLWFrtK8=
github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b/go.mod h1:obH5gd0BsqsP2LwDJ9aOkm/6J86V6lyAXCoQWGw3K50=
github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE=
github.com/cenkalti/backoff/v4 v4.1.1 h1:G2HAfAmvm/GcKan2oOQpBXOd2tT2G57ZnZGWa1PxPBQ=
github.com/cenkalti/backoff/v4 v4.1.1/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
Expand Down
Loading

0 comments on commit 573c137

Please sign in to comment.