Skip to content

Commit

Permalink
ctrd: support to restart containerd if it is dead
Browse files Browse the repository at this point in the history
Signed-off-by: Wei Fu <fuweid89@gmail.com>
  • Loading branch information
fuweid authored and allencloud committed May 6, 2019
1 parent 921464b commit 461d7d9
Showing 1 changed file with 90 additions and 3 deletions.
93 changes: 90 additions & 3 deletions ctrd/supervisord/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ const (
// stopTimeout used to send SIGTERM to containerd in limit time to shutdown containerd
// if the containerd is still alive, Stop action will send SIGKILL
stopTimeout = 15 * time.Second

// delayRetryTimeout is used to hold for a while if the restart
// containerd fails
delayRetryTimeout = 500 * time.Millisecond
)

// Opt is used to modify the daemon setting.
Expand All @@ -48,6 +52,8 @@ type Daemon struct {
rootDir string
stateDir string
logger *logrus.Entry
waitCh chan struct{}
stopCh chan struct{}
}

// Address returns containerd grpc address.
Expand All @@ -74,6 +80,7 @@ func Start(ctx context.Context, rootDir, stateDir string, opts ...Opt) (*Daemon,
rootDir: rootDir,
stateDir: stateDir,
logger: logrus.WithField("module", "ctrd-supervisord"),
stopCh: make(chan struct{}),
}

for _, opt := range opts {
Expand All @@ -96,11 +103,17 @@ func Start(ctx context.Context, rootDir, stateDir string, opts ...Opt) (*Daemon,
if err := d.healthPostCheck(); err != nil {
return nil, err
}
d.logger.WithField("containerd-pid", d.pid).Infof("success to start containerd")

go d.monitor()
return d, nil
}

// Stop stops the containerd in 15 seconds.
func (d *Daemon) Stop() error {
// stop the monitor
close(d.stopCh)

if d.pid != -1 {
syscall.Kill(d.pid, syscall.SIGTERM)

Expand Down Expand Up @@ -157,11 +170,12 @@ func (d *Daemon) healthPostCheck() error {
}

client.Close()
d.logger.WithField("containerd-pid", d.pid).Infof("success to start containerd")
return nil
}

if utils.IsProcessAlive(d.pid) {
if ok, err := d.isContainerdProcess(d.pid); err != nil {
d.logger.Warnf("failed to get containerd process status by pid %v: %v", d.pid, err)
} else if ok {
d.logger.WithField("pid", d.pid).Warnf("try to shutdown containerd because failed to connect containerd")
utils.KillProcess(d.pid)
}
Expand Down Expand Up @@ -239,10 +253,13 @@ func (d *Daemon) runContainerd() error {
}
}

pidChan := make(chan int)
d.waitCh = make(chan struct{})
// run and wait the containerd process
pidChan := make(chan int)
go func() {
runtime.LockOSThread()
defer close(d.waitCh)

if err := cmd.Start(); err != nil {
logrus.Errorf("containerd failed to start: %v", err)
pidChan <- -1
Expand All @@ -265,6 +282,76 @@ func (d *Daemon) runContainerd() error {
return nil
}

// monitor will try to restart containerd if containerd has been killed or
// panic.
//
// NOTE: if retry time is too much and restart still fails, the monitor
// will exit the whole process.
func (d *Daemon) monitor() {
var (
maxRetryCount = 10
count = 0
)

for {
select {
case <-d.stopCh:
d.logger.Info("receiving stop containerd action and stop monitor")
return
default:
}

if count > maxRetryCount {
d.logger.Warnf("failed to restart containerd in time and exit whole process")
os.Exit(1)
}

pid, err := d.getContainerdPid()
if err != nil {
d.logger.Warnf("failed to get containerd pid and will retry it again: %v", err)
count++
continue
}

if pid == -1 {
if d.waitCh != nil {
select {
case <-d.waitCh:
select {
case <-d.stopCh:
d.logger.Info("receiving stop containerd action and stop monitor")
return
default:
}
case <-d.stopCh:
d.logger.Info("receiving stop containerd action and stop monitor")
return
}
}

count++
if err := d.runContainerd(); err != nil {
d.logger.Warnf("failed to restart containerd and will retry it again: %v", err)
time.Sleep(delayRetryTimeout)
continue
}
}

if err := d.healthPostCheck(); err != nil {
d.logger.Warn("failed to do health check and will retry it again")
count++
time.Sleep(delayRetryTimeout)
continue
}

if count != 0 {
count = 0
d.logger.WithField("containerd-pid", d.pid).Infof("success to start containerd")
}
time.Sleep(delayRetryTimeout)
}
}

func (d *Daemon) setContainerdPid(pid int) error {
d.pid = pid
return ioutil.WriteFile(d.pidPath(), []byte(fmt.Sprintf("%d", d.pid)), 0660)
Expand Down

0 comments on commit 461d7d9

Please sign in to comment.