From d5aad8f8d244f137e5d63b4d41dbdcc91dd49126 Mon Sep 17 00:00:00 2001 From: Adam Harrison Date: Wed, 16 Dec 2015 11:51:28 +0000 Subject: [PATCH] Introduce limited retry on vxlan vport creation The kernel processes vport deletion asynchronously, meaning that we can sometimes fail to recreate vxlan vports on startup with an EADDRINUSE error. We compensate by retrying a small number of times with a short sleep on encountering this error, failing as before on a different error or exhaustion of retries. --- router/fastdp.go | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/router/fastdp.go b/router/fastdp.go index 0fee520533..c23f1a38bf 100644 --- a/router/fastdp.go +++ b/router/fastdp.go @@ -6,6 +6,7 @@ import ( "fmt" "net" "sync" + "syscall" "time" "github.com/weaveworks/go-odp/odp" @@ -110,6 +111,10 @@ func NewFastDatapath(config FastDatapathConfig) (*FastDatapath, error) { forwarders: make(map[mesh.PeerName]*fastDatapathForwarder), } + // This delete happens asynchronously in the kernel, meaning that + // we can sometimes fail to recreate the vxlan vport with EADDRINUSE - + // consequently we retry a small number of times in + // getVxlanVportIDHarder() to compensate. if err := fastdp.deleteVxlanVports(); err != nil { return nil, err } @@ -126,7 +131,7 @@ func NewFastDatapath(config FastDatapathConfig) (*FastDatapath, error) { // numbers to be independent, but working out how to specify // them on the connecting side. So we can wait to find out if // anyone wants that. 
-	fastdp.mainVxlanVportID, err = fastdp.getVxlanVportID(config.Port + 1)
+	fastdp.mainVxlanVportID, err = fastdp.getVxlanVportIDHarder(config.Port+1, 5, time.Millisecond*10)
 	if err != nil {
 		return nil, err
 	}
@@ -406,6 +411,32 @@ func (fastdp fastDatapathOverlay) StartConsumingPackets(localPeer *mesh.Peer, pe
 	return nil
 }
 
+// getVxlanVportIDHarder wraps getVxlanVportID, trying again when vport
+// creation fails with EADDRINUSE.  The kernel processes vport deletion
+// asynchronously, so recreating a vport shortly after deleting it can
+// fail with that error.
+//
+// retries is the maximum number of attempts; at least one attempt is
+// always made, even if retries is zero or negative.  duration is the
+// sleep between consecutive attempts.  Success, any other error, or
+// EADDRINUSE on the final attempt is returned to the caller.
+func (fastdp *FastDatapath) getVxlanVportIDHarder(udpPort int, retries int, duration time.Duration) (odp.VportID, error) {
+	for try := 1; ; try++ {
+		vxlanVportID, err := fastdp.getVxlanVportID(udpPort)
+		// Covers both success (nil) and every non-EADDRINUSE failure.
+		if err != odp.NetlinkError(syscall.EADDRINUSE) {
+			return vxlanVportID, err
+		}
+		if try >= retries {
+			// Out of attempts: fail without a misleading
+			// "retrying" warning or a pointless sleep.
+			return 0, err
+		}
+		log.Warning("Address already in use creating vxlan vport ", udpPort, " - retrying")
+		time.Sleep(duration)
+	}
+}
+
 func (fastdp *FastDatapath) getVxlanVportID(udpPort int) (odp.VportID, error) {
 	fastdp.lock.Lock()
 	defer fastdp.lock.Unlock()