Skip to content

Commit

Permalink
Merge pull request #665 from Mirantis/jell/handling_cni
Browse files Browse the repository at this point in the history
Correct the way of CNI ADD handling
  • Loading branch information
lukaszo authored May 8, 2018
2 parents bf73b9f + 2f14e9d commit 5d34e2a
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 14 deletions.
24 changes: 16 additions & 8 deletions pkg/manager/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ func (v *VirtletRuntimeService) RunPodSandbox(ctx context.Context, in *kubeapi.R
if status.State == kubeapi.PodSandboxState_SANDBOX_READY {
return &kubeapi.RunPodSandboxResponse{
PodSandboxId: podID,
}, err
}, nil
}
}

Expand All @@ -149,14 +149,20 @@ func (v *VirtletRuntimeService) RunPodSandbox(ctx context.Context, in *kubeapi.R
fdPayload := &tapmanager.GetFDPayload{Description: pnd}
csnBytes, err := v.fdManager.AddFDs(podID, fdPayload)
if err != nil {
// this will cause kubelet to delete the pod sandbox and then retry
// its creation
state = kubeapi.PodSandboxState_SANDBOX_NOTREADY
glog.Errorf("Error adding pod %s (%s) to CNI network: %v", podName, podID, err)
// Try to clean up CNI netns (this may be necessary e.g. in case of multiple CNI plugins with CNI Genie)
if fdErr := v.fdManager.ReleaseFDs(podID); err != nil {
glog.Errorf("Error removing pod %s (%s) from CNI network: %v", podName, podID, fdErr)
}

return nil, fmt.Errorf("Error adding pod %s (%s) to CNI network: %v", podName, podID, err)
}

psi, err := metadata.NewPodSandboxInfo(config, csnBytes, state, v.clock)
if err != nil {
// cleanup cni if we could not add pod to metadata store to prevent resource leaking
if fdErr := v.fdManager.ReleaseFDs(podID); err != nil {
glog.Errorf("Error removing pod %s (%s) from CNI network: %v", podName, podID, fdErr)
}
return nil, err
}

Expand All @@ -166,14 +172,16 @@ func (v *VirtletRuntimeService) RunPodSandbox(ctx context.Context, in *kubeapi.R
return psi, nil
},
); storeErr != nil {
// cleanup cni if we could not add pod to metadata store to prevent resource leaking
if err := v.fdManager.ReleaseFDs(podID); err != nil {
glog.Errorf("Error removing pod %s (%s) from CNI network: %v", podName, podID, err)
}
return nil, storeErr
}

// If we don't return PodSandboxId upon RunPodSandbox, kubelet will not retry
// RunPodSandbox for this pod after CNI failure
return &kubeapi.RunPodSandboxResponse{
PodSandboxId: podID,
}, err
}, nil
}

// StopPodSandbox implements StopPodSandbox method of CRI.
Expand Down
36 changes: 30 additions & 6 deletions pkg/tapmanager/tapfdsource.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,27 @@ func (s *TapFDSource) GetFDs(key string, data []byte) ([]int, []byte, error) {
return nil, nil, fmt.Errorf("error creating new netns for pod %s (%s): %v", pnd.PodName, pnd.PodID, err)
}

gotError := false
podAddedToNetwork := false
defer func() {
if gotError {
if podAddedToNetwork {
if err := s.cniClient.RemoveSandboxFromNetwork(pnd.PodID, pnd.PodName, pnd.PodNs); err != nil {
glog.Errorf("Error removing a pod from the pod network after failed network setup: %v", err)
}
}
if err := cni.DestroyNetNS(pnd.PodID); err != nil {
glog.Errorf("Error removing netns after failed network setup: %v", err)
}
}
}()

netConfig, err := s.cniClient.AddSandboxToNetwork(pnd.PodID, pnd.PodName, pnd.PodNs)
if err != nil {
gotError = true
return nil, nil, fmt.Errorf("error adding pod %s (%s) to CNI network: %v", pnd.PodName, pnd.PodID, err)
}
podAddedToNetwork = true
glog.V(3).Infof("CNI configuration for pod %s (%s): %s", pnd.PodName, pnd.PodID, spew.Sdump(netConfig))

if netConfig == nil {
Expand All @@ -152,6 +169,7 @@ func (s *TapFDSource) GetFDs(key string, data []byte) ([]int, []byte, error) {
var csn *network.ContainerSideNetwork
if err := s.setupNetNS(key, pnd, func(netNSPath string, allLinks []netlink.Link) (*network.ContainerSideNetwork, error) {
if netConfig, err = nettools.ValidateAndFixCNIResult(netConfig, netNSPath, allLinks); err != nil {
gotError = true
return nil, fmt.Errorf("error fixing cni configuration: %v", err)
}
if err := nettools.FixCalicoNetworking(netConfig, s.getDummyNetwork); err != nil {
Expand All @@ -174,14 +192,14 @@ func (s *TapFDSource) GetFDs(key string, data []byte) ([]int, []byte, error) {
}
return csn, nil
}); err != nil {
// TODO: do some cleanup on configured interfaces if there is an error
gotError = true
return nil, nil, err
}

for _, iface := range csn.Interfaces {
if iface.Type == network.InterfaceTypeVF {
if err := nettools.SetMacOnVf(iface.PCIAddress, iface.HardwareAddr); err != nil {
// TODO: do some cleanup on configured interfaces if there is an error
gotError = true
return nil, nil, err
}
}
Expand All @@ -206,6 +224,16 @@ func (s *TapFDSource) Release(key string) error {
return fmt.Errorf("failed to open network namespace at %q: %v", netNSPath, err)
}

// Try to keep this function idempotent even if there are errors during the following calls.
// This can cause some resource leaks in multiple CNI case but makes it possible
// to call `RunPodSandbox` again after a failed attempt. Failing to do so would cause
// the next `RunPodSandbox` call to fail due to the netns already being present.
defer func() {
if err := cni.DestroyNetNS(pn.pnd.PodID); err != nil {
glog.Errorf("Error when removing network namespace for pod sandbox %q: %v", pn.pnd.PodID, err)
}
}()

if err := nettools.ReconstructVFs(pn.csn, vmNS); err != nil {
return fmt.Errorf("failed to reconstruct SR-IOV devices: %v", err)
}
Expand All @@ -224,10 +252,6 @@ func (s *TapFDSource) Release(key string) error {
return fmt.Errorf("error removing pod sandbox %q from CNI network: %v", pn.pnd.PodID, err)
}

if err := cni.DestroyNetNS(pn.pnd.PodID); err != nil {
return fmt.Errorf("error when removing network namespace for pod sandbox %q: %v", pn.pnd.PodID, err)
}

delete(s.fdMap, key)
return nil
}
Expand Down

0 comments on commit 5d34e2a

Please sign in to comment.