Skip to content

Commit

Permalink
bugfix: avoid the deadlock when failed to remove invalid sandbox
Browse files Browse the repository at this point in the history
Signed-off-by: YaoZengzeng <yaozengzeng@zju.edu.cn>
  • Loading branch information
YaoZengzeng committed Aug 13, 2018
1 parent 218807c commit fa7315a
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 2 deletions.
10 changes: 9 additions & 1 deletion cri/v1alpha1/cri.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,10 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
defer func() {
// If running sandbox failed, clean up the container.
if retErr != nil {
c.ContainerMgr.Remove(ctx, id, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
err := c.ContainerMgr.Remove(ctx, id, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
if err != nil {
logrus.Errorf("failed to remove the container when running sandbox failed: %v", err)
}
}
}()

Expand Down Expand Up @@ -462,6 +465,11 @@ func (c *CriManager) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandb
return nil, fmt.Errorf("failed to list sandbox: %v", err)
}

sandboxList, err = c.filterInvalidSandboxes(ctx, sandboxList)
if err != nil {
return nil, fmt.Errorf("failed to filter invalid sandboxes: %v", err)
}

sandboxes := make([]*runtime.PodSandbox, 0, len(sandboxList))
for _, s := range sandboxList {
sandbox, err := toCriSandbox(s)
Expand Down
37 changes: 37 additions & 0 deletions cri/v1alpha1/cri_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,43 @@ func toCriSandbox(c *mgr.Container) (*runtime.PodSandbox, error) {
}, nil
}

// It has the possibility that we failed to run the sandbox and it is not being cleaned up.
// Kubelet will use list to get the sandboxes, but will not get the status of the failed pod
// whose meta data has not been put into the Sandbox Store. And Kubelet will keep trying to
// get the status of the failed pod and won't create a new one to replace it. It's a DEAD LOCK.
// Actually Kubelet should not know the existence of invalid pod whose meta data won't be in the
// Sandbox Store. So we could avoid the DEAD LOCK mentioned above.
func (c *CriManager) filterInvalidSandboxes(ctx context.Context, sandboxes []*mgr.Container) ([]*mgr.Container, error) {
validSandboxes, err := c.SandboxStore.Keys()
if err != nil {
return nil, err
}

var result []*mgr.Container
for _, sandbox := range sandboxes {
exist := false
for _, id := range validSandboxes {
if sandbox.ID == id {
exist = true
break
}
}
if exist {
result = append(result, sandbox)
continue
}

status := sandbox.State.Status
// NOTE: what if the worst case that we failed to remove the sandbox and
// it is still running?
if status != apitypes.StatusRunning && status != apitypes.StatusCreated {
// Remove invalid sandbox.
c.ContainerMgr.Remove(ctx, sandbox.ID, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
}
}
return result, nil
}

func filterCRISandboxes(sandboxes []*runtime.PodSandbox, filter *runtime.PodSandboxFilter) []*runtime.PodSandbox {
if filter == nil {
return sandboxes
Expand Down
10 changes: 9 additions & 1 deletion cri/v1alpha2/cri.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,10 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
defer func() {
// If running sandbox failed, clean up the container.
if retErr != nil {
c.ContainerMgr.Remove(ctx, id, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
err := c.ContainerMgr.Remove(ctx, id, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
if err != nil {
logrus.Errorf("failed to remove the container when running sandbox failed: %v", err)
}
}
}()

Expand Down Expand Up @@ -468,6 +471,11 @@ func (c *CriManager) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandb
return nil, fmt.Errorf("failed to list sandbox: %v", err)
}

sandboxList, err = c.filterInvalidSandboxes(ctx, sandboxList)
if err != nil {
return nil, fmt.Errorf("failed to filter invalid sandboxes: %v", err)
}

sandboxes := make([]*runtime.PodSandbox, 0, len(sandboxList))
for _, s := range sandboxList {
sandbox, err := toCriSandbox(s)
Expand Down
36 changes: 36 additions & 0 deletions cri/v1alpha2/cri_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,42 @@ func toCriSandbox(c *mgr.Container) (*runtime.PodSandbox, error) {
}, nil
}

// It has the possibility that we failed to run the sandbox and it is not being cleaned up.
// Kubelet will use list to get the sandboxes, but will not get the status of the failed pod
// whose meta data has not been put into the Sandbox Store. And Kubelet will keep trying to
// get the status of the failed pod and won't create a new one to replace it. It's a DEAD LOCK.
// Actually Kubelet should not know the existence of invalid pod whose meta data won't be in the
// Sandbox Store. So we could avoid the DEAD LOCK mentioned above.
func (c *CriManager) filterInvalidSandboxes(ctx context.Context, sandboxes []*mgr.Container) ([]*mgr.Container, error) {
validSandboxes, err := c.SandboxStore.Keys()
if err != nil {
return nil, err
}

var result []*mgr.Container
for _, sandbox := range sandboxes {
exist := false
for _, id := range validSandboxes {
if sandbox.ID == id {
exist = true
break
}
}
if exist {
result = append(result, sandbox)
continue
}

status := sandbox.State.Status
// NOTE: what if the worst case that we failed to remove the sandbox and
// it is still running?
if status != apitypes.StatusRunning && status != apitypes.StatusCreated {
c.ContainerMgr.Remove(ctx, sandbox.ID, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true})
}
}
return result, nil
}

func filterCRISandboxes(sandboxes []*runtime.PodSandbox, filter *runtime.PodSandboxFilter) []*runtime.PodSandbox {
if filter == nil {
return sandboxes
Expand Down

0 comments on commit fa7315a

Please sign in to comment.