From 48c666ebab5d49d985cd365f1a3ef43499cc3899 Mon Sep 17 00:00:00 2001
From: Gabriel Adrian Samfira
Date: Thu, 23 Dec 2021 13:19:57 +0200
Subject: [PATCH 1/2] Fix Range() iteration

The function passed into the Range function of sync.Map will stop the
iteration if false is returned. This commit makes sure we iterate
through all elements in the map.

Signed-off-by: Gabriel Adrian Samfira
(cherry picked from commit 76881a295eb81d774cc764ce33291b239d0ea731)
Signed-off-by: Hamza El-Saawy
---
 cmd/containerd-shim-runhcs-v1/pod.go      |  5 +++--
 cmd/containerd-shim-runhcs-v1/task_hcs.go | 20 ++++++++++++--------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/cmd/containerd-shim-runhcs-v1/pod.go b/cmd/containerd-shim-runhcs-v1/pod.go
index abd9cbc506..d34f104af1 100644
--- a/cmd/containerd-shim-runhcs-v1/pod.go
+++ b/cmd/containerd-shim-runhcs-v1/pod.go
@@ -379,8 +379,9 @@ func (p *pod) KillTask(ctx context.Context, tid, eid string, signal uint32, all
 				return wt.KillExec(ctx, eid, signal, all)
 			})
 
-			// iterate all
-			return false
+			// Iterate all. Returning false stops the iteration. See:
+			// https://pkg.go.dev/sync#Map.Range
+			return true
 		})
 	}
 	eg.Go(func() error {
diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs.go b/cmd/containerd-shim-runhcs-v1/task_hcs.go
index 98cab90a44..2fc4651cbf 100644
--- a/cmd/containerd-shim-runhcs-v1/task_hcs.go
+++ b/cmd/containerd-shim-runhcs-v1/task_hcs.go
@@ -535,8 +535,9 @@ func (ht *hcsTask) KillExec(ctx context.Context, eid string, signal uint32, all
 				}).Warn("failed to kill exec in task")
 			}
 
-			// iterate all
-			return false
+			// Iterate all. Returning false stops the iteration. See:
+			// https://pkg.go.dev/sync#Map.Range
+			return true
 		})
 	}
 	if signal == 0x9 && eid == "" && ht.host != nil {
@@ -577,8 +578,9 @@ func (ht *hcsTask) DeleteExec(ctx context.Context, eid string) (int, uint32, tim
 			ex.ForceExit(ctx, 1)
 		}
 
-		// iterate next
-		return false
+		// Iterate all. Returning false stops the iteration. See:
+		// https://pkg.go.dev/sync#Map.Range
+		return true
 	})
 	}
 	switch state := e.State(); state {
@@ -616,8 +618,9 @@ func (ht *hcsTask) Pids(ctx context.Context) ([]runhcsopts.ProcessDetails, error
 		ex := value.(shimExec)
 		pidMap[ex.Pid()] = ex.ID()
 
-		// Iterate all
-		return false
+		// Iterate all. Returning false stops the iteration. See:
+		// https://pkg.go.dev/sync#Map.Range
+		return true
 	})
 
 	pidMap[ht.init.Pid()] = ht.init.ID()
@@ -698,8 +701,9 @@ func (ht *hcsTask) waitForHostExit() {
 		ex := value.(shimExec)
 		ex.ForceExit(ctx, 1)
 
-		// iterate all
-		return false
+		// Iterate all. Returning false stops the iteration. See:
+		// https://pkg.go.dev/sync#Map.Range
+		return true
 	})
 	ht.init.ForceExit(ctx, 1)
 	ht.closeHost(ctx)
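For reference, sync.Map.Range keeps iterating only while the supplied callback returns true; returning false stops the walk after the current element, which is why the code above previously visited only a single exec. A minimal standalone sketch (not part of this patch series) showing the difference:

// Standalone illustration, not part of the patch series.
package main

import (
	"fmt"
	"sync"
)

func main() {
	var m sync.Map
	m.Store("a", 1)
	m.Store("b", 2)
	m.Store("c", 3)

	// Returning false stops the walk after the first element visited.
	visited := 0
	m.Range(func(key, value interface{}) bool {
		visited++
		return false
	})
	fmt.Println("callback returns false:", visited) // always 1

	// Returning true visits every element, which is what the fix relies on.
	visited = 0
	m.Range(func(key, value interface{}) bool {
		visited++
		return true
	})
	fmt.Println("callback returns true:", visited) // 3
}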
From c82af1459f0dd9b00e00083660c1ba900041519f Mon Sep 17 00:00:00 2001
From: Gabriel Adrian Samfira
Date: Mon, 20 Dec 2021 17:57:58 +0200
Subject: [PATCH 2/2] Wait for waitInitExit() to return

This change gives waitInitExit() a chance to clean up resources before
DeleteExec() returns. This should fix situations where the shim exits
before releasing container resources.

Signed-off-by: Gabriel Adrian Samfira
(cherry picked from commit a6edb2596b408a6476e8a0b5b1b1b830423b4a55)
Signed-off-by: Hamza El-Saawy
---
 cmd/containerd-shim-runhcs-v1/task_hcs.go | 35 +++++++++++++++++++
 .../task_hcs_test.go                      | 12 +++++++
 2 files changed, 47 insertions(+)

diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs.go b/cmd/containerd-shim-runhcs-v1/task_hcs.go
index 2fc4651cbf..5af5443cfa 100644
--- a/cmd/containerd-shim-runhcs-v1/task_hcs.go
+++ b/cmd/containerd-shim-runhcs-v1/task_hcs.go
@@ -589,6 +589,41 @@ func (ht *hcsTask) DeleteExec(ctx context.Context, eid string) (int, uint32, tim
 	case shimExecStateRunning:
 		return 0, 0, time.Time{}, newExecInvalidStateError(ht.id, eid, state, "delete")
 	}
+
+	if eid == "" {
+		// We are killing the init task, so we expect the container to be
+		// stopped after this.
+		//
+		// The task process may have already exited, and the status set to
+		// shimExecStateExited, but resources may still be in the process
+		// of being cleaned up. Wait for ht.closed to be closed. This signals
+		// that waitInitExit() has finished destroying container resources,
+		// and layers were umounted.
+		// If the shim exits before resources are cleaned up, those resources
+		// will remain locked and untracked, which leads to lingering sandboxes
+		// and container resources like base vhdx.
+		select {
+		case <-time.After(30 * time.Second):
+			log.G(ctx).Error("timed out waiting for resource cleanup")
+			return 0, 0, time.Time{}, errors.Wrap(hcs.ErrTimeout, "waiting for container resource cleanup")
+		case <-ht.closed:
+		}
+
+		// The init task has now exited. A ForceExit() has already been sent to
+		// execs. Cleanup execs and continue.
+		ht.execs.Range(func(key, value interface{}) bool {
+			if key == "" {
+				// Iterate next.
+				return true
+			}
+			ht.execs.Delete(key)
+
+			// Iterate all. Returning false stops the iteration. See:
+			// https://pkg.go.dev/sync#Map.Range
+			return true
+		})
+	}
+
 	status := e.Status()
 	if eid != "" {
 		ht.execs.Delete(eid)
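DeleteExec now blocks on the ht.closed channel, which waitInitExit() closes once container resources have been torn down, and falls back to a 30 second timeout so a stuck cleanup cannot hang the caller forever. A simplified, self-contained sketch of that close-or-timeout pattern (waitForCleanup and done are illustrative names, not the shim's identifiers):

// Standalone illustration of the close-or-timeout wait; names are hypothetical.
package main

import (
	"errors"
	"fmt"
	"time"
)

// waitForCleanup blocks until done is closed (cleanup finished) or the
// timeout elapses, mirroring the select added to DeleteExec above.
func waitForCleanup(done <-chan struct{}, timeout time.Duration) error {
	select {
	case <-time.After(timeout):
		return errors.New("timed out waiting for resource cleanup")
	case <-done:
		return nil
	}
}

func main() {
	closed := make(chan struct{})

	// Simulate waitInitExit() finishing its teardown and closing the channel.
	go func() {
		time.Sleep(100 * time.Millisecond)
		close(closed)
	}()

	if err := waitForCleanup(closed, 30*time.Second); err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println("cleanup finished; safe to release the init exec")
}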
diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs_test.go b/cmd/containerd-shim-runhcs-v1/task_hcs_test.go
index 798af38556..cbeeaabda7 100644
--- a/cmd/containerd-shim-runhcs-v1/task_hcs_test.go
+++ b/cmd/containerd-shim-runhcs-v1/task_hcs_test.go
@@ -161,6 +161,8 @@ func Test_hcsTask_DeleteExec_InitExecID_CreatedState_Success(t *testing.T) {
 	// remove the 2nd exec so we just check without it.
 	lt.execs.Delete(second.id)
 
+	// Simulate waitInitExit() closing the host
+	close(lt.closed)
 	// try to delete the init exec
 	pid, status, at, err := lt.DeleteExec(context.TODO(), "")
 
@@ -178,6 +180,8 @@ func Test_hcsTask_DeleteExec_InitExecID_RunningState_Error(t *testing.T) {
 	// Start the init exec
 	_ = init.Start(context.TODO())
 
+	// Simulate waitInitExit() closing the host
+	close(lt.closed)
 	// try to delete the init exec
 	pid, status, at, err := lt.DeleteExec(context.TODO(), "")
 
@@ -192,6 +196,8 @@ func Test_hcsTask_DeleteExec_InitExecID_ExitedState_Success(t *testing.T) {
 
 	_ = init.Kill(context.TODO(), 0xf)
 
+	// Simulate waitInitExit() closing the host
+	close(lt.closed)
 	// try to delete the init exec
 	pid, status, at, err := lt.DeleteExec(context.TODO(), "")
 
@@ -207,6 +213,8 @@ func Test_hcsTask_DeleteExec_InitExecID_2ndExec_CreatedState_Error(t *testing.T)
 	// start the init exec (required to have 2nd exec)
 	_ = init.Start(context.TODO())
 
+	// Simulate waitInitExit() closing the host
+	close(lt.closed)
 	// try to delete the init exec
 	pid, status, at, err := lt.DeleteExec(context.TODO(), "")
 
@@ -226,6 +234,8 @@ func Test_hcsTask_DeleteExec_InitExecID_2ndExec_RunningState_Error(t *testing.T)
 	// put the 2nd exec into the running state
 	_ = second.Start(context.TODO())
 
+	// Simulate waitInitExit() closing the host
+	close(lt.closed)
 	// try to delete the init exec
 	pid, status, at, err := lt.DeleteExec(context.TODO(), "")
 
@@ -244,6 +254,8 @@ func Test_hcsTask_DeleteExec_InitExecID_2ndExec_ExitedState_Success(t *testing.T)
 	// put the 2nd exec into the exited state
 	_ = second.Kill(context.TODO(), 0xf)
 
+	// Simulate waitInitExit() closing the host
+	close(lt.closed)
 	// try to delete the init exec
 	pid, status, at, err := lt.DeleteExec(context.TODO(), "")
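Because DeleteExec on the init exec now waits on lt.closed, every test that deletes the init exec must close that channel first; otherwise the call would sit in the select until the 30 second timeout. A tiny self-contained sketch of that test pattern (the test name and variables are illustrative, not the shim's real harness):

// Illustrative test of the close-before-call pattern; names are hypothetical.
package main

import (
	"errors"
	"testing"
	"time"
)

func TestWaitReturnsImmediatelyWhenAlreadyClosed(t *testing.T) {
	closed := make(chan struct{})
	// Simulate waitInitExit() having already finished, the same way the
	// shim tests call close(lt.closed) before invoking DeleteExec.
	close(closed)

	var err error
	select {
	case <-time.After(30 * time.Second):
		err = errors.New("timed out waiting for resource cleanup")
	case <-closed:
	}
	if err != nil {
		t.Fatalf("expected the wait to finish immediately, got: %v", err)
	}
}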