Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ctrd: support to restart containerd if it is dead #2829

Merged
merged 1 commit into from
May 6, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 90 additions & 3 deletions ctrd/supervisord/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ const (
// stopTimeout used to send SIGTERM to containerd in limit time to shutdown containerd
// if the containerd is still alive, Stop action will send SIGKILL
stopTimeout = 15 * time.Second

// delayRetryTimeout is used to hold for a while if the restart
// containerd fails
delayRetryTimeout = 500 * time.Millisecond
)

// Opt is used to modify the daemon setting.
Expand All @@ -48,6 +52,8 @@ type Daemon struct {
rootDir string
stateDir string
logger *logrus.Entry
waitCh chan struct{}
stopCh chan struct{}
}

// Address returns containerd grpc address.
Expand All @@ -74,6 +80,7 @@ func Start(ctx context.Context, rootDir, stateDir string, opts ...Opt) (*Daemon,
rootDir: rootDir,
stateDir: stateDir,
logger: logrus.WithField("module", "ctrd-supervisord"),
stopCh: make(chan struct{}),
}

for _, opt := range opts {
Expand All @@ -96,11 +103,17 @@ func Start(ctx context.Context, rootDir, stateDir string, opts ...Opt) (*Daemon,
if err := d.healthPostCheck(); err != nil {
return nil, err
}
d.logger.WithField("containerd-pid", d.pid).Infof("success to start containerd")

go d.monitor()
return d, nil
}

// Stop stops the containerd in 15 seconds.
func (d *Daemon) Stop() error {
// stop the monitor
close(d.stopCh)

if d.pid != -1 {
syscall.Kill(d.pid, syscall.SIGTERM)

Expand Down Expand Up @@ -157,11 +170,12 @@ func (d *Daemon) healthPostCheck() error {
}

client.Close()
d.logger.WithField("containerd-pid", d.pid).Infof("success to start containerd")
return nil
}

if utils.IsProcessAlive(d.pid) {
if ok, err := d.isContainerdProcess(d.pid); err != nil {
d.logger.Warnf("failed to get containerd process status by pid %v: %v", d.pid, err)
} else if ok {
d.logger.WithField("pid", d.pid).Warnf("try to shutdown containerd because failed to connect containerd")
utils.KillProcess(d.pid)
}
Expand Down Expand Up @@ -239,10 +253,13 @@ func (d *Daemon) runContainerd() error {
}
}

pidChan := make(chan int)
d.waitCh = make(chan struct{})
// run and wait the containerd process
pidChan := make(chan int)
go func() {
runtime.LockOSThread()
defer close(d.waitCh)

if err := cmd.Start(); err != nil {
logrus.Errorf("containerd failed to start: %v", err)
pidChan <- -1
Expand All @@ -265,6 +282,76 @@ func (d *Daemon) runContainerd() error {
return nil
}

// monitor will try to restart containerd if containerd has been killed or
// panic.
//
// NOTE: if retry time is too much and restart still fails, the monitor
// will exit the whole process.
func (d *Daemon) monitor() {
var (
maxRetryCount = 10
count = 0
)

for {
select {
case <-d.stopCh:
d.logger.Info("receiving stop containerd action and stop monitor")
return
default:
}

if count > maxRetryCount {
d.logger.Warnf("failed to restart containerd in time and exit whole process")
os.Exit(1)
}

pid, err := d.getContainerdPid()
if err != nil {
d.logger.Warnf("failed to get containerd pid and will retry it again: %v", err)
count++
continue
}

if pid == -1 {
if d.waitCh != nil {
select {
case <-d.waitCh:
select {
case <-d.stopCh:
d.logger.Info("receiving stop containerd action and stop monitor")
return
default:
}
case <-d.stopCh:
d.logger.Info("receiving stop containerd action and stop monitor")
return
}
}

count++
if err := d.runContainerd(); err != nil {
d.logger.Warnf("failed to restart containerd and will retry it again: %v", err)
time.Sleep(delayRetryTimeout)
continue
}
}

if err := d.healthPostCheck(); err != nil {
d.logger.Warn("failed to do health check and will retry it again")
count++
time.Sleep(delayRetryTimeout)
continue
}

if count != 0 {
count = 0
d.logger.WithField("containerd-pid", d.pid).Infof("success to start containerd")
}
time.Sleep(delayRetryTimeout)
}
}

func (d *Daemon) setContainerdPid(pid int) error {
d.pid = pid
return ioutil.WriteFile(d.pidPath(), []byte(fmt.Sprintf("%d", d.pid)), 0660)
Expand Down