Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge https://github.com/k8snetworkplumbingwg/sriov-network-operator:master into master #1037

Merged
merged 11 commits into from
Nov 29, 2024
Merged
22 changes: 11 additions & 11 deletions api/v1/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,23 +261,23 @@ func NeedToUpdateSriov(ifaceSpec *Interface, ifaceStatus *InterfaceExt) bool {
if ifaceSpec.Mtu > 0 {
mtu := ifaceSpec.Mtu
if mtu > ifaceStatus.Mtu {
log.V(2).Info("NeedToUpdateSriov(): MTU needs update", "desired", mtu, "current", ifaceStatus.Mtu)
log.V(0).Info("NeedToUpdateSriov(): MTU needs update", "desired", mtu, "current", ifaceStatus.Mtu)
return true
}
}
currentEswitchMode := GetEswitchModeFromStatus(ifaceStatus)
desiredEswitchMode := GetEswitchModeFromSpec(ifaceSpec)
if currentEswitchMode != desiredEswitchMode {
log.V(2).Info("NeedToUpdateSriov(): EswitchMode needs update", "desired", desiredEswitchMode, "current", currentEswitchMode)
log.V(0).Info("NeedToUpdateSriov(): EswitchMode needs update", "desired", desiredEswitchMode, "current", currentEswitchMode)
return true
}
if ifaceSpec.NumVfs != ifaceStatus.NumVfs {
log.V(2).Info("NeedToUpdateSriov(): NumVfs needs update", "desired", ifaceSpec.NumVfs, "current", ifaceStatus.NumVfs)
log.V(0).Info("NeedToUpdateSriov(): NumVfs needs update", "desired", ifaceSpec.NumVfs, "current", ifaceStatus.NumVfs)
return true
}

if ifaceStatus.LinkAdminState == consts.LinkAdminStateDown {
log.V(2).Info("NeedToUpdateSriov(): PF link status needs update", "desired to include", "up", "current", ifaceStatus.LinkAdminState)
log.V(0).Info("NeedToUpdateSriov(): PF link status needs update", "desired to include", "up", "current", ifaceStatus.LinkAdminState)
return true
}

Expand All @@ -286,24 +286,24 @@ func NeedToUpdateSriov(ifaceSpec *Interface, ifaceStatus *InterfaceExt) bool {
for _, groupSpec := range ifaceSpec.VfGroups {
if IndexInRange(vfStatus.VfID, groupSpec.VfRange) {
if vfStatus.Driver == "" {
log.V(2).Info("NeedToUpdateSriov(): Driver needs update - has no driver",
log.V(0).Info("NeedToUpdateSriov(): Driver needs update - has no driver",
"desired", groupSpec.DeviceType)
return true
}
if groupSpec.DeviceType != "" && groupSpec.DeviceType != consts.DeviceTypeNetDevice {
if groupSpec.DeviceType != vfStatus.Driver {
log.V(2).Info("NeedToUpdateSriov(): Driver needs update",
log.V(0).Info("NeedToUpdateSriov(): Driver needs update",
"desired", groupSpec.DeviceType, "current", vfStatus.Driver)
return true
}
} else {
if StringInArray(vfStatus.Driver, vars.DpdkDrivers) {
log.V(2).Info("NeedToUpdateSriov(): Driver needs update",
log.V(0).Info("NeedToUpdateSriov(): Driver needs update",
"desired", groupSpec.DeviceType, "current", vfStatus.Driver)
return true
}
if vfStatus.Mtu != 0 && groupSpec.Mtu != 0 && vfStatus.Mtu != groupSpec.Mtu {
log.V(2).Info("NeedToUpdateSriov(): VF MTU needs update",
log.V(0).Info("NeedToUpdateSriov(): VF MTU needs update",
"vf", vfStatus.VfID, "desired", groupSpec.Mtu, "current", vfStatus.Mtu)
return true
}
Expand All @@ -313,20 +313,20 @@ func NeedToUpdateSriov(ifaceSpec *Interface, ifaceStatus *InterfaceExt) bool {
// Node GUID. We intentionally skip empty Node GUID in vfStatus because this may happen
// when the VF is allocated to a workload.
if vfStatus.GUID == consts.UninitializedNodeGUID {
log.V(2).Info("NeedToUpdateSriov(): VF GUID needs update",
log.V(0).Info("NeedToUpdateSriov(): VF GUID needs update",
"vf", vfStatus.VfID, "current", vfStatus.GUID)
return true
}
}
// this is needed to be sure the admin mac address is configured as expected
if ifaceSpec.ExternallyManaged {
log.V(2).Info("NeedToUpdateSriov(): need to update the device as it's externally manage",
log.V(0).Info("NeedToUpdateSriov(): need to update the device as it's externally manage",
"device", ifaceStatus.PciAddress)
return true
}
}
if groupSpec.VdpaType != vfStatus.VdpaType {
log.V(2).Info("NeedToUpdateSriov(): VF VdpaType mismatch",
log.V(0).Info("NeedToUpdateSriov(): VF VdpaType mismatch",
"desired", groupSpec.VdpaType, "current", vfStatus.VdpaType)
return true
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ metadata:
categories: Networking
certified: "false"
containerImage: quay.io/openshift/origin-sriov-network-operator:4.18
createdAt: "2024-11-21T23:48:37Z"
createdAt: "2024-11-28T23:49:58Z"
description: An operator for configuring SR-IOV components and initializing SRIOV
network devices in Openshift cluster.
features.operators.openshift.io/cnf: "false"
Expand Down
57 changes: 57 additions & 0 deletions cmd/sriov-network-config-daemon/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"os"
"time"

"github.com/go-logr/logr"
"github.com/spf13/cobra"
Expand All @@ -40,6 +41,12 @@ import (
const (
PhasePre = "pre"
PhasePost = "post"

// InitializationDeviceDiscoveryTimeoutSec constant defines the number of
// seconds to wait for devices to be registered in the system with the expected name.
InitializationDeviceDiscoveryTimeoutSec = 60
// InitializationDeviceUdevProcessingTimeoutSec constant defines the number of seconds to wait for udev rules to process
InitializationDeviceUdevProcessingTimeoutSec = 60
)

var (
Expand Down Expand Up @@ -104,6 +111,8 @@ func runServiceCmd(cmd *cobra.Command, args []string) error {
return updateSriovResultErr(setupLog, phaseArg, fmt.Errorf("failed to create hostHelpers: %v", err))
}

waitForDevicesInitialization(setupLog, sriovConf, hostHelpers)

if phaseArg == PhasePre {
err = phasePre(setupLog, sriovConf, hostHelpers)
} else {
Expand Down Expand Up @@ -303,3 +312,51 @@ func updateResult(setupLog logr.Logger, result, msg string) error {
setupLog.V(0).Info("result file updated", "SyncStatus", sriovResult.SyncStatus, "LastSyncError", msg)
return nil
}

// waitForDevicesInitialization should be executed in both the pre and post-networking stages.
// This function ensures that the network devices specified in the configuration are registered
// and handled by UDEV. Sometimes, the initialization of network devices might take a significant
// amount of time, and the sriov-config systemd service may start before the devices are fully
// processed, leading to failure.
//
// To address this, we not only check if the devices are registered with the correct name but also
// wait for the udev event queue to empty. This increases the likelihood that the service will start
// only when the devices are fully initialized. It is required to call this function in the
// "post-networking" phase as well because the OS network manager might change device configurations,
// and we need to ensure these changes are fully processed before starting the post-networking part.
//
// The timeouts used in this function are intentionally kept low to avoid blocking the OS loading
// process for too long in case of any issues.
//
// Note: Currently, this function handles only Baremetal clusters. We do not have evidence that
// this logic is required for virtual clusters.
func waitForDevicesInitialization(setupLog logr.Logger, conf *systemd.SriovConfig, hostHelpers helper.HostHelpersInterface) {
if conf.PlatformType != consts.Baremetal {
// skip waiting on virtual cluster
return
}
// wait for devices from the spec to be registered in the system with expected names
devicesToWait := make(map[string]string, len(conf.Spec.Interfaces))
for _, d := range conf.Spec.Interfaces {
devicesToWait[d.PciAddress] = d.Name
}
deadline := time.Now().Add(time.Second * time.Duration(InitializationDeviceDiscoveryTimeoutSec))
for time.Now().Before(deadline) {
for pciAddr, name := range devicesToWait {
if hostHelpers.TryGetInterfaceName(pciAddr) == name {
delete(devicesToWait, pciAddr)
}
}
if len(devicesToWait) == 0 {
break
}
time.Sleep(time.Second)
}
if len(devicesToWait) != 0 {
setupLog.Info("WARNING: some devices were not initialized", "devices", devicesToWait, "timeout", InitializationDeviceDiscoveryTimeoutSec)
}
if err := hostHelpers.WaitUdevEventsProcessed(InitializationDeviceUdevProcessingTimeoutSec); err != nil {
setupLog.Info("WARNING: failed to wait for udev events processing", "reason", err.Error(),
"timeout", InitializationDeviceUdevProcessingTimeoutSec)
}
}
20 changes: 20 additions & 0 deletions cmd/sriov-network-config-daemon/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"fmt"

"github.com/go-logr/logr"
"github.com/golang/mock/gomock"
"github.com/spf13/cobra"
"gopkg.in/yaml.v3"
Expand Down Expand Up @@ -158,6 +159,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil)
hostHelpers.EXPECT().TryEnableTun().Return()
hostHelpers.EXPECT().TryEnableVhostNet().Return()
Expand Down Expand Up @@ -211,6 +214,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil)
hostHelpers.EXPECT().TryEnableTun().Return()
hostHelpers.EXPECT().TryEnableVhostNet().Return()
Expand All @@ -236,6 +241,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": getTestResultFileContent("InProgress", ""),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().DiscoverSriovDevices(hostHelpers).Return([]sriovnetworkv1.InterfaceExt{{
Name: "enp216s0f0np0",
}}, nil)
Expand Down Expand Up @@ -276,4 +283,17 @@ var _ = Describe("Service", func() {
testHelpers.GinkgoAssertFileContentsEquals("/etc/sriov-operator/sriov-interface-result.yaml",
string(getTestResultFileContent("Failed", "post: unexpected result of the pre phase: Failed, syncError: pretest")))
})
It("waitForDevicesInitialization", func() {
cfg := &systemd.SriovConfig{Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{
Interfaces: []sriovnetworkv1.Interface{
{Name: "name1", PciAddress: "0000:d8:00.0"},
{Name: "name2", PciAddress: "0000:d8:00.1"}}}}
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("other")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("name1")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("name2")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
waitForDevicesInitialization(logr.Discard(), cfg, hostHelpers)
})
})
Loading