Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Verify status changes on managed interfaces #530

Merged
merged 4 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 98 additions & 69 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,77 +313,60 @@ func (dn *Daemon) nodeStateSyncHandler() error {
latest := dn.desiredNodeState.GetGeneration()
log.Log.V(0).Info("nodeStateSyncHandler(): new generation", "generation", latest)

if dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() {
if vars.UsingSystemdMode {
serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host")
return err
}
postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host")
return err
}

// if the service doesn't exist we should continue to let the k8s plugin to create the service files
// this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply
// the system service and reboot the node the config-daemon doesn't need to do anything.
if !(serviceEnabled && postNetworkServiceEnabled) {
sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed,
LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+
"sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)}
} else {
sriovResult, err = systemd.ReadSriovResult()
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host")
return err
}
}
if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed {
log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError)

// add the error but don't requeue
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusFailed,
lastSyncError: sriovResult.LastSyncError,
}
<-dn.syncCh
return nil
}
}
log.Log.V(0).Info("nodeStateSyncHandler(): Interface not changed")
if dn.desiredNodeState.Status.LastSyncError != "" ||
dn.desiredNodeState.Status.SyncStatus != consts.SyncStatusSucceeded {
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusSucceeded,
lastSyncError: "",
}
// wait for writer to refresh the status
<-dn.syncCh
// load plugins if it has not loaded
if len(dn.loadedPlugins) == 0 {
dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins")
return err
}

return nil
}

if dn.desiredNodeState.GetGeneration() == 1 && len(dn.desiredNodeState.Spec.Interfaces) == 0 {
err = dn.HostHelpers.ClearPCIAddressFolder()
if vars.UsingSystemdMode && dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() {
serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath)
if err != nil {
log.Log.Error(err, "failed to clear the PCI address configuration")
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host")
return err
}
postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host")
return err
}

log.Log.V(0).Info(
"nodeStateSyncHandler(): interface policy spec not yet set by controller for sriovNetworkNodeState",
"name", dn.desiredNodeState.Name)
if dn.desiredNodeState.Status.SyncStatus != "Succeeded" {
// if the service doesn't exist we should continue to let the k8s plugin to create the service files
// this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply
// the system service and reboot the node the config-daemon doesn't need to do anything.
if !(serviceEnabled && postNetworkServiceEnabled) {
sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed,
LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+
"sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)}
} else {
sriovResult, err = systemd.ReadSriovResult()
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host")
return err
}
}
if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed {
log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError)

// add the error but don't requeue
dn.refreshCh <- Message{
syncStatus: "Succeeded",
lastSyncError: "",
syncStatus: consts.SyncStatusFailed,
lastSyncError: sriovResult.LastSyncError,
}
// wait for writer to refresh status
<-dn.syncCh
return nil
}
}

skip, err := dn.shouldSkipReconciliation(dn.desiredNodeState)
if err != nil {
return err
}
// Reconcile only when there are changes in the spec or status of managed VFs.
if skip {
return nil
}

Expand All @@ -404,15 +387,6 @@ func (dn *Daemon) nodeStateSyncHandler() error {
}
dn.desiredNodeState.Status = updatedState.Status

// load plugins if it has not loaded
if len(dn.loadedPlugins) == 0 {
dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins)
if err != nil {
log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins")
return err
}
}

reqReboot := false
reqDrain := false

Expand Down Expand Up @@ -564,6 +538,61 @@ func (dn *Daemon) nodeStateSyncHandler() error {
return nil
}

func (dn *Daemon) shouldSkipReconciliation(latestState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
mlguerrero12 marked this conversation as resolved.
Show resolved Hide resolved
var err error

// Skip when SriovNetworkNodeState object has just been created.
if latestState.GetGeneration() == 1 && len(latestState.Spec.Interfaces) == 0 {
err = dn.HostHelpers.ClearPCIAddressFolder()
if err != nil {
log.Log.Error(err, "failed to clear the PCI address configuration")
return false, err
}

log.Log.V(0).Info(
"shouldSkipReconciliation(): interface policy spec not yet set by controller for sriovNetworkNodeState",
"name", latestState.Name)
if latestState.Status.SyncStatus != consts.SyncStatusSucceeded {
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusSucceeded,
lastSyncError: "",
}
// wait for writer to refresh status
<-dn.syncCh
}
return true, nil
}

// Verify changes in the spec of the SriovNetworkNodeState CR.
if dn.currentNodeState.GetGeneration() == latestState.GetGeneration() && !dn.isDrainCompleted() {
for _, p := range dn.loadedPlugins {
// Verify changes in the status of the SriovNetworkNodeState CR.
changed, err := p.CheckStatusChanges(latestState)
mlguerrero12 marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return false, err
}
if changed {
return false, nil
}
}

log.Log.V(0).Info("shouldSkipReconciliation(): Interface not changed")
if latestState.Status.LastSyncError != "" ||
latestState.Status.SyncStatus != consts.SyncStatusSucceeded {
dn.refreshCh <- Message{
syncStatus: consts.SyncStatusSucceeded,
lastSyncError: "",
}
// wait for writer to refresh the status
<-dn.syncCh
}

return true, nil
}

return false, nil
}

// handleDrain: adds the right annotation to the node and nodeState object
// returns true if we need to finish the reconcile loop and wait for a new object
func (dn *Daemon) handleDrain(reqReboot bool) (bool, error) {
Expand Down
4 changes: 4 additions & 0 deletions pkg/plugins/fake/fake_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ func (f *FakePlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState
return false, false, nil
}

func (f *FakePlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
return false, nil
}

func (f *FakePlugin) Apply() error {
return nil
}
128 changes: 84 additions & 44 deletions pkg/plugins/generic/generic_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"bytes"
"errors"
"os/exec"
"reflect"
"strconv"
"strings"
"syscall"
Expand Down Expand Up @@ -52,7 +51,6 @@ type GenericPlugin struct {
PluginName string
SpecVersion string
DesireState *sriovnetworkv1.SriovNetworkNodeState
LastState *sriovnetworkv1.SriovNetworkNodeState
DriverStateMap DriverStateMapType
DesiredKernelArgs map[string]bool
helpers helper.HostHelpersInterface
Expand Down Expand Up @@ -141,6 +139,37 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt
return
}

// CheckStatusChanges verify whether SriovNetworkNodeState CR status present changes on configured VFs.
func (p *GenericPlugin) CheckStatusChanges(current *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
log.Log.Info("generic-plugin CheckStatusChanges()")

for _, iface := range current.Spec.Interfaces {
found := false
for _, ifaceStatus := range current.Status.Interfaces {
// TODO: remove the check for ExternallyManaged - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/632
if iface.PciAddress == ifaceStatus.PciAddress && !iface.ExternallyManaged {
mlguerrero12 marked this conversation as resolved.
Show resolved Hide resolved
found = true
if sriovnetworkv1.NeedToUpdateSriov(&iface, &ifaceStatus) {
log.Log.Info("CheckStatusChanges(): status changed for interface", "address", iface.PciAddress)
return true, nil
}
break
}
}
if !found {
log.Log.Info("CheckStatusChanges(): no status found for interface", "address", iface.PciAddress)
mlguerrero12 marked this conversation as resolved.
Show resolved Hide resolved
mlguerrero12 marked this conversation as resolved.
Show resolved Hide resolved
}
}

missingKernelArgs, err := p.getMissingKernelArgs()
if err != nil {
log.Log.Error(err, "generic-plugin CheckStatusChanges(): failed to verify missing kernel arguments")
return false, err
}

return len(missingKernelArgs) != 0, nil
}

func (p *GenericPlugin) syncDriverState() error {
for _, driverState := range p.DriverStateMap {
if !driverState.DriverLoaded && driverState.NeedDriverFunc(p.DesireState, driverState) {
Expand All @@ -159,14 +188,6 @@ func (p *GenericPlugin) syncDriverState() error {
func (p *GenericPlugin) Apply() error {
log.Log.Info("generic plugin Apply()", "desiredState", p.DesireState.Spec)

if p.LastState != nil {
log.Log.Info("generic plugin Apply()", "lastState", p.LastState.Spec)
if reflect.DeepEqual(p.LastState.Spec.Interfaces, p.DesireState.Spec.Interfaces) {
log.Log.Info("generic plugin Apply(): desired and latest state are the same, nothing to apply")
return nil
}
}

if err := p.syncDriverState(); err != nil {
return err
}
Expand All @@ -188,8 +209,7 @@ func (p *GenericPlugin) Apply() error {
}
return err
}
p.LastState = &sriovnetworkv1.SriovNetworkNodeState{}
*p.LastState = *p.DesireState

return nil
}

Expand Down Expand Up @@ -252,38 +272,49 @@ func (p *GenericPlugin) addToDesiredKernelArgs(karg string) {
}
}

// syncDesiredKernelArgs Should be called to set all the kernel arguments. Returns bool if node update is needed.
func (p *GenericPlugin) syncDesiredKernelArgs() (bool, error) {
needReboot := false
// getMissingKernelArgs gets Kernel arguments that have not been set.
func (p *GenericPlugin) getMissingKernelArgs() ([]string, error) {
missingArgs := make([]string, 0, len(p.DesiredKernelArgs))
if len(p.DesiredKernelArgs) == 0 {
return false, nil
return nil, nil
}

kargs, err := p.helpers.GetCurrentKernelArgs()
if err != nil {
return false, err
return nil, err
}
for desiredKarg, attempted := range p.DesiredKernelArgs {
set := p.helpers.IsKernelArgsSet(kargs, desiredKarg)
if !set {
if attempted {
log.Log.V(2).Info("generic plugin syncDesiredKernelArgs(): previously attempted to set kernel arg",
"karg", desiredKarg)
}
// There is a case when we try to set the kernel argument here, the daemon could decide to not reboot because
// the daemon encountered a potentially one-time error. However we always want to make sure that the kernel
// argument is set once the daemon goes through node state sync again.
update, err := setKernelArg(desiredKarg)
if err != nil {
log.Log.Error(err, "generic plugin syncDesiredKernelArgs(): fail to set kernel arg", "karg", desiredKarg)
return false, err
}
if update {
needReboot = true
log.Log.V(2).Info("generic plugin syncDesiredKernelArgs(): need reboot for setting kernel arg", "karg", desiredKarg)
}
p.DesiredKernelArgs[desiredKarg] = true

for desiredKarg := range p.DesiredKernelArgs {
if !p.helpers.IsKernelArgsSet(kargs, desiredKarg) {
missingArgs = append(missingArgs, desiredKarg)
}
}
return missingArgs, nil
}

// syncDesiredKernelArgs should be called to set all the kernel arguments. Returns bool if node update is needed.
func (p *GenericPlugin) syncDesiredKernelArgs(kargs []string) (bool, error) {
needReboot := false

for _, karg := range kargs {
if p.DesiredKernelArgs[karg] {
log.Log.V(2).Info("generic-plugin syncDesiredKernelArgs(): previously attempted to set kernel arg",
"karg", karg)
}
// There is a case when we try to set the kernel argument here, the daemon could decide to not reboot because
// the daemon encountered a potentially one-time error. However we always want to make sure that the kernel
// argument is set once the daemon goes through node state sync again.
update, err := setKernelArg(karg)
if err != nil {
log.Log.Error(err, "generic-plugin syncDesiredKernelArgs(): fail to set kernel arg", "karg", karg)
return false, err
}
if update {
needReboot = true
log.Log.V(2).Info("generic-plugin syncDesiredKernelArgs(): need reboot for setting kernel arg", "karg", karg)
}
p.DesiredKernelArgs[karg] = true
}
return needReboot, nil
}

Expand Down Expand Up @@ -351,19 +382,28 @@ func (p *GenericPlugin) addVfioDesiredKernelArg(state *sriovnetworkv1.SriovNetwo
}
}

func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (needReboot bool, err error) {
needReboot = false
func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (bool, error) {
needReboot := false

p.addVfioDesiredKernelArg(state)

updateNode, err := p.syncDesiredKernelArgs()
missingKernelArgs, err := p.getMissingKernelArgs()
if err != nil {
log.Log.Error(err, "generic plugin needRebootNode(): failed to set the desired kernel arguments")
log.Log.Error(err, "generic-plugin needRebootNode(): failed to verify missing kernel arguments")
return false, err
}
if updateNode {
log.Log.V(2).Info("generic plugin needRebootNode(): need reboot for updating kernel arguments")
needReboot = true

if len(missingKernelArgs) != 0 {
needReboot, err = p.syncDesiredKernelArgs(missingKernelArgs)
if err != nil {
log.Log.Error(err, "generic-plugin needRebootNode(): failed to set the desired kernel arguments")
return false, err
}
if needReboot {
log.Log.V(2).Info("generic-plugin needRebootNode(): need reboot for updating kernel arguments")
}
}

return needReboot, nil
}

Expand Down
Loading
Loading