Skip to content

Commit 6065e51

Browse files
committed
EgressIP: fix failover and restart stale SNAT/LRP; ensure status sync
Scenario: - Nodes: node-1, node-2, node-3 - Egress IPs: EIP-1 - Pods: pod1 on node-1, pod2 on node-3 (pods are created via deployment replicas) - Egress-assignable nodes: node-1, node-2 - EIP-1 assigned to node-1 During a simultaneous reboot of node-1 and node-2, EIP-1 failed over to node-2 and ovnkube-controller restarted at nearly the same time: 1) EIP-1 was reassigned to node-2 by the cluster manager. 2) The sync EIP happened for EIP1 with stale status, though it cleaned SNATs/LRPs referring to node-1 due to outdated pod IPs (this is because pods will be recreated due to node reboots). 3) pod1/pod2 Add events arrived while the informer cache still had the old EIP status, so new SNATs/LRPs were created pointing to node-1. 4) The EIP-1 Add event arrived with the new status; entries for node-2 were added/updated. 5) Result: stale SNATs and LRPs with stale nexthops for node-1 remained. Fix: - Populate pod EIP status during EgressIP sync so podAssignment has accurate egressStatuses. - Reconcile stale assignments using podAssignment (egressStatuses) when the informer cache is not up to date, ensuring SNAT/LRP for the previously assigned node are corrected. - Remove stale EIP SNAT entries for remote-zone pods accordingly. - Add coverage for simultaneous EIP failover and controller restart. Signed-off-by: Periyasamy Palanisamy <pepalani@redhat.com> (cherry picked from commit 1667a51)
1 parent 33e0c3e commit 6065e51

File tree

2 files changed

+435
-35
lines changed

2 files changed

+435
-35
lines changed

go-controller/pkg/ovn/egressip.go

Lines changed: 74 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -788,25 +788,7 @@ func (e *EgressIPController) addPodEgressIPAssignments(ni util.NetInfo, name str
788788
if len(statusAssignments) == 0 {
789789
return nil
790790
}
791-
// We need to proceed with add only under two conditions
792-
// 1) egressNode present in at least one status is local to this zone
793-
// (NOTE: The relation between egressIPName and nodeName is 1:1 i.e in the same object the given node will be present only in one status)
794-
// 2) the pod being added is local to this zone
795-
proceed := false
796-
for _, status := range statusAssignments {
797-
e.nodeZoneState.LockKey(status.Node)
798-
isLocalZoneEgressNode, loadedEgressNode := e.nodeZoneState.Load(status.Node)
799-
if loadedEgressNode && isLocalZoneEgressNode {
800-
proceed = true
801-
e.nodeZoneState.UnlockKey(status.Node)
802-
break
803-
}
804-
e.nodeZoneState.UnlockKey(status.Node)
805-
}
806-
if !proceed && !e.isPodScheduledinLocalZone(pod) {
807-
return nil // nothing to do if none of the status nodes are local to this master and pod is also remote
808-
}
809-
var remainingAssignments []egressipv1.EgressIPStatusItem
791+
var remainingAssignments, staleAssignments []egressipv1.EgressIPStatusItem
810792
nadName := ni.GetNetworkName()
811793
if ni.IsSecondary() {
812794
nadNames := ni.GetNADs()
@@ -842,9 +824,16 @@ func (e *EgressIPController) addPodEgressIPAssignments(ni util.NetInfo, name str
842824
// podState.egressIPName can be empty if no re-routes were found in
843825
// syncPodAssignmentCache for the existing pod, we will treat this case as a new add
844826
for _, status := range statusAssignments {
845-
if exists := podState.egressStatuses.contains(status); !exists {
827+
// Add the status if it's not already in the cache, or if it exists but is in pending state
828+
// (meaning it was populated during EIP sync and needs to be processed for the pod).
829+
if value, exists := podState.egressStatuses.statusMap[status]; !exists || value == egressStatusStatePending {
846830
remainingAssignments = append(remainingAssignments, status)
847831
}
832+
// Detect stale EIP status entries (same EgressIP reassigned to a different node)
833+
// and queue the outdated entry for cleanup.
834+
if staleStatus := podState.egressStatuses.hasStaleEIPStatus(status); staleStatus != nil {
835+
staleAssignments = append(staleAssignments, *staleStatus)
836+
}
848837
}
849838
podState.podIPs = podIPs
850839
podState.egressIPName = name
@@ -866,6 +855,32 @@ func (e *EgressIPController) addPodEgressIPAssignments(ni util.NetInfo, name str
866855
podState.standbyEgressIPNames.Insert(name)
867856
return nil
868857
}
858+
for _, staleStatus := range staleAssignments {
859+
klog.V(2).Infof("Deleting stale pod egress IP status: %v for EgressIP: %s and pod: %s/%s/%v", staleStatus, name, pod.Namespace, pod.Name, podIPNets)
860+
err = e.deletePodEgressIPAssignments(ni, name, []egressipv1.EgressIPStatusItem{staleStatus}, pod)
861+
if err != nil {
862+
klog.Warningf("Failed to delete stale EgressIP status %s/%v for pod %s: %v", name, staleStatus, podKey, err)
863+
}
864+
delete(podState.egressStatuses.statusMap, staleStatus)
865+
}
866+
// We need to proceed with add only under two conditions
867+
// 1) egressNode present in at least one status is local to this zone
868+
// (NOTE: The relation between egressIPName and nodeName is 1:1 i.e in the same object the given node will be present only in one status)
869+
// 2) the pod being added is local to this zone
870+
proceed := false
871+
for _, status := range statusAssignments {
872+
e.nodeZoneState.LockKey(status.Node)
873+
isLocalZoneEgressNode, loadedEgressNode := e.nodeZoneState.Load(status.Node)
874+
if loadedEgressNode && isLocalZoneEgressNode {
875+
proceed = true
876+
e.nodeZoneState.UnlockKey(status.Node)
877+
break
878+
}
879+
e.nodeZoneState.UnlockKey(status.Node)
880+
}
881+
if !proceed && !e.isPodScheduledinLocalZone(pod) {
882+
return nil // nothing to do if none of the status nodes are local to this master and pod is also remote
883+
}
869884
for _, status := range remainingAssignments {
870885
klog.V(2).Infof("Adding pod egress IP status: %v for EgressIP: %s and pod: %s/%s/%v", status, name, pod.Namespace, pod.Name, podIPNets)
871886
nodesToLock := []string{status.Node, pod.Spec.NodeName}
@@ -1155,6 +1170,8 @@ type egressIPCache struct {
11551170
egressLocalNodesCache sets.Set[string]
11561171
// egressIP IP -> assigned node name
11571172
egressIPIPToNodeCache map[string]string
1173+
// egressIP name -> egress IP -> assigned node name
1174+
egressIPToAssignedNodes map[string]map[string]string
11581175
// node name -> network name -> redirect IPs
11591176
egressNodeRedirectsCache nodeNetworkRedirects
11601177
// network name -> OVN cluster router name
@@ -1594,6 +1611,14 @@ func (e *EgressIPController) syncPodAssignmentCache(egressIPCache egressIPCache)
15941611
}
15951612
}
15961613

1614+
// populate podState.egressStatuses with assigned node for active egressIP IPs.
1615+
if podState.egressIPName == egressIPName {
1616+
for egressIPIP, nodeName := range egressIPCache.egressIPToAssignedNodes[egressIPName] {
1617+
podState.egressStatuses.statusMap[egressipv1.EgressIPStatusItem{
1618+
EgressIP: egressIPIP, Node: nodeName}] = egressStatusStatePending
1619+
}
1620+
}
1621+
15971622
e.podAssignment.Store(podKey, podState)
15981623
return nil
15991624
}); err != nil {
@@ -1975,6 +2000,9 @@ func (e *EgressIPController) generateCacheForEgressIP() (egressIPCache, error) {
19752000
// egressIP IP -> node name. Assigned node for EIP.
19762001
egressIPIPNodeCache := make(map[string]string, 0)
19772002
cache.egressIPIPToNodeCache = egressIPIPNodeCache
2003+
// egressIP name -> egressIP IP -> node name.
2004+
egressIPToAssignedNodes := make(map[string]map[string]string, 0)
2005+
cache.egressIPToAssignedNodes = egressIPToAssignedNodes
19782006
cache.markCache = make(map[string]string)
19792007
egressIPs, err := e.watchFactory.GetEgressIPs()
19802008
if err != nil {
@@ -1988,13 +2016,15 @@ func (e *EgressIPController) generateCacheForEgressIP() (egressIPCache, error) {
19882016
cache.markCache[egressIP.Name] = mark.String()
19892017
egressIPsCache[egressIP.Name] = make(map[string]selectedPods, 0)
19902018
egressIPNameNodesCache[egressIP.Name] = make([]string, 0, len(egressIP.Status.Items))
2019+
egressIPToAssignedNodes[egressIP.Name] = make(map[string]string, 0)
19912020
for _, status := range egressIP.Status.Items {
19922021
eipIP := net.ParseIP(status.EgressIP)
19932022
if eipIP == nil {
19942023
klog.Errorf("Failed to parse EgressIP %s IP %q from status", egressIP.Name, status.EgressIP)
19952024
continue
19962025
}
19972026
egressIPIPNodeCache[eipIP.String()] = status.Node
2027+
egressIPToAssignedNodes[egressIP.Name][eipIP.String()] = status.Node
19982028
if localZoneNodes.Has(status.Node) {
19992029
egressLocalNodesCache.Insert(status.Node)
20002030
}
@@ -2251,9 +2281,18 @@ func InitClusterEgressPolicies(nbClient libovsdbclient.Client, addressSetFactory
22512281
return nil
22522282
}
22532283

2284+
// egressStatusStatePending marks entries populated during EIP sync and
2285+
// indicates they must be reconciled again for the pod.
2286+
const egressStatusStatePending = "pending"
2287+
22542288
type statusMap map[egressipv1.EgressIPStatusItem]string
22552289

22562290
type egressStatuses struct {
2291+
// statusMap tracks per EIP status assignment for a pod.
2292+
// Key: egressipv1.EgressIPStatusItem {EgressIP, Node}
2293+
// Values:
2294+
// "" -> applied/reconciled
2295+
// egressStatusStatePending -> populated during EIP sync, pending reconcile.
22572296
statusMap
22582297
}
22592298

@@ -2265,6 +2304,21 @@ func (e egressStatuses) contains(potentialStatus egressipv1.EgressIPStatusItem)
22652304
return false
22662305
}
22672306

2307+
// hasStaleEIPStatus checks for stale EIP status entries already in cache.
2308+
// This addresses the race condition where an EIP is reassigned to a different node
2309+
// but the cache still contains the old assignment, leading to stale SNAT/LRP entries.
2310+
func (e egressStatuses) hasStaleEIPStatus(potentialStatus egressipv1.EgressIPStatusItem) *egressipv1.EgressIPStatusItem {
2311+
var staleStatus *egressipv1.EgressIPStatusItem
2312+
for status := range e.statusMap {
2313+
if status.EgressIP == potentialStatus.EgressIP &&
2314+
status.Node != potentialStatus.Node {
2315+
staleStatus = &egressipv1.EgressIPStatusItem{EgressIP: status.EgressIP, Node: status.Node}
2316+
break
2317+
}
2318+
}
2319+
return staleStatus
2320+
}
2321+
22682322
func (e egressStatuses) delete(deleteStatus egressipv1.EgressIPStatusItem) {
22692323
delete(e.statusMap, deleteStatus)
22702324
}

0 commit comments

Comments
 (0)