Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable automated Prune via CRD NodeFeatureDiscoveries #188

Merged
merged 1 commit into from
Mar 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions api/v1/nodefeaturediscovery_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,13 @@ type NodeFeatureDiscoverySpec struct {
// worker.
// +optional
WorkerConfig ConfigMap `json:"workerConfig"`

// PruneOnDelete defines whether the NFD-master prune should be
// enabled or not. If enabled, the Operator will deploy an NFD-Master prune
// job that will remove all NFD labels (and other NFD-managed assets such
// as annotations, extended resources and taints) from the cluster nodes.
// +optional
PruneOnDelete bool `json:"prunerOnDelete"`
}

// OperandSpec describes configuration options for the operand
Expand Down
4 changes: 4 additions & 0 deletions build/assets/prune/0100_service_account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# ServiceAccount named nfd-prune. The ClusterRoleBinding of the same name in
# this directory lists this account as its subject.
# No namespace is set in this manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
name: nfd-prune
14 changes: 14 additions & 0 deletions build/assets/prune/0200_clusterrole.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# ClusterRole for the nfd-prune Job.
#
# The prune run removes NFD labels, annotations and taints (stored on the Node
# object itself) as well as NFD-managed extended resources. Extended resources
# are part of the Node *status*, and Kubernetes authorizes writes to them via
# the separate "nodes/status" subresource, so both resources are listed here.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nfd-prune
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/status
    verbs:
      - get
      - patch
      - update
      - list
13 changes: 13 additions & 0 deletions build/assets/prune/0300_clusterole_binding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Binds the nfd-prune ClusterRole to the nfd-prune ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: nfd-prune
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: nfd-prune
subjects:
- kind: ServiceAccount
name: nfd-prune
# NOTE(review): the subject namespace is hard-coded here; presumably the
# operator overrides it with the CR's namespace at deploy time - confirm.
namespace: node-feature-discovery-operator

61 changes: 61 additions & 0 deletions build/assets/prune/0400_prune_job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# One-shot Job that runs "nfd-master -prune" to strip NFD-managed labels,
# annotations, extended resources and taints from the cluster nodes.
apiVersion: batch/v1
kind: Job
metadata:
  labels:
    app: nfd
  name: nfd-prune
spec:
  completions: 1
  template:
    metadata:
      labels:
        app: nfd-prune
    spec:
      # Prefer (but do not require) scheduling on control-plane nodes; both
      # the legacy "master" and the current "control-plane" role labels are
      # matched.
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - preference:
                matchExpressions:
                  - key: node-role.kubernetes.io/master
                    operator: In
                    values:
                      - ""
              weight: 1
            - preference:
                matchExpressions:
                  - key: node-role.kubernetes.io/control-plane
                    operator: In
                    values:
                      - ""
              weight: 1
      containers:
        - args:
            - -prune
          command:
            - nfd-master
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          # Placeholder; the operator replaces the image with the operand
          # image from the NodeFeatureDiscovery CR before creating the Job.
          image: $(NODE_FEATURE_DISCOVERY_IMAGE)
          imagePullPolicy: Always
          name: nfd-master
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
            runAsNonRoot: true
      # Never restart the container in place; retries are left to the Job
      # controller.
      restartPolicy: Never
      # Must be the dedicated prune account: the nfd-prune ClusterRoleBinding
      # grants node access to "nfd-prune", and the prune Job is deployed from
      # the finalizer after the other operand components have been removed.
      serviceAccount: nfd-prune
      tolerations:
        - effect: NoSchedule
          key: node-role.kubernetes.io/master
          operator: Equal
          value: ""
        - effect: NoSchedule
          key: node-role.kubernetes.io/control-plane
          operator: Equal
          value: ""
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.8.0
api-approved.kubernetes.io: unapproved, experimental-only
creationTimestamp: null
name: nodefeaturediscoveries.nfd.kubernetes.io
spec:
Expand Down Expand Up @@ -71,6 +72,13 @@ spec:
listens for incoming requests.
type: integer
type: object
prunerOnDelete:
description: PruneOnDelete defines whether the NFD-master prune should
be enabled or not. If enabled, the Operator will deploy an NFD-Master
prune job that will remove all NFD labels (and other NFD-managed
assets such as annotations, extended resources and taints) from
the cluster nodes.
type: boolean
resourceLabels:
description: ResourceLabels defines the list of features to be advertised
as extended resources instead of labels.
Expand Down
62 changes: 62 additions & 0 deletions controllers/nodefeaturediscovery_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"strings"

appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -528,6 +529,67 @@ func Deployment(n NFD) (ResourceStatus, error) {
return Ready, nil
}

// Job is the control function for a Job resource. It creates the Job when it
// does not exist yet and otherwise reports its status.
//
// Returns:
//   - Ready, nil when the Job was just created, or when an existing Job has
//     neither active nor failed pods;
//   - NotReady, nil while an existing Job still has active pods;
//   - NotReady, err when the owner reference cannot be set, the lookup or the
//     creation fails, or an existing Job reports failed pods.
func Job(n NFD) (ResourceStatus, error) {
	// The control index selects the matching decoded resource. The caller is
	// expected to have verified that this entry really is a Job.
	job := n.resources[n.idx].Job

	// Point the first container at the operand image and, when the CR sets
	// one, apply its image pull policy.
	container := &job.Spec.Template.Spec.Containers[0]
	container.Image = n.ins.Spec.Operand.ImagePath()
	if policy := n.ins.Spec.Operand.ImagePullPolicy; policy != "" {
		container.ImagePullPolicy = n.ins.Spec.Operand.ImagePolicy(policy)
	}

	// The Job lives in the same namespace as the NFD instance.
	job.SetNamespace(n.ins.GetNamespace())

	klog.InfoS("Looking for Job", "name", job.Name, "namespace", job.Namespace)

	// Make the NFD instance the controlling owner so the Job is garbage
	// collected with it and changes to the Job trigger a reconcile.
	if err := controllerutil.SetControllerReference(n.ins, &job, n.rec.Scheme); err != nil {
		return NotReady, err
	}

	key := types.NamespacedName{Namespace: job.Namespace, Name: job.Name}
	existing := &batchv1.Job{}
	if err := n.rec.Client.Get(context.TODO(), key, existing); err != nil {
		// Anything other than "not found" is a genuine lookup failure.
		if !errors.IsNotFound(err) {
			return NotReady, err
		}

		klog.InfoS("Job not found, creating", "name", job.Name, "namespace", job.Namespace)
		if err := n.rec.Client.Create(context.TODO(), &job); err != nil {
			klog.ErrorS(err, "Couldn't create Job", "name", job.Name, "namespace", job.Namespace)
			return NotReady, err
		}
		return Ready, nil
	}

	// The Job already exists: still running -> NotReady, failed -> error,
	// otherwise it is considered done.
	switch {
	case existing.Status.Active > 0:
		return NotReady, nil
	case existing.Status.Failed > 0:
		return NotReady, fmt.Errorf("prune Job failed")
	}

	return Ready, nil
}

// Service checks if a Service exists and creates one if it doesn't exist
func Service(n NFD) (ResourceStatus, error) {
// state represents the resource's 'control' function index
Expand Down
104 changes: 104 additions & 0 deletions controllers/nodefeaturediscovery_finalizers.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ func (r *NodeFeatureDiscoveryReconciler) finalizeNFDOperand(ctx context.Context,
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
}

if instance.Spec.PruneOnDelete {
klog.Info("Deleting NFD labels and NodeFeature CRs from cluster")
if err := deployPrune(ctx, r, instance); err != nil {
klog.Error(err, "Failed to delete NFD labels and NodeFeature CRs from cluster")
return ctrl.Result{}, err
}
} else {
klog.Warning("PruneOnDelete is disabled, NFD labels and NodeFeature CRs will not be deleted from cluster")
}

// If all components are deleted, then remove the finalizer
klog.Info("Secondary check passed. Removing finalizer if it exists.")
if r.hasFinalizer(instance, finalizer) {
Expand Down Expand Up @@ -363,6 +373,100 @@ func (r *NodeFeatureDiscoveryReconciler) doComponentsExist(ctx context.Context,
return false
}

// deployPrune deploys nfd-master with the prune option to remove labels and
// NodeFeature CRs, waits for the prune Job to succeed, and then removes the
// Job together with its RBAC objects.
//
// The manifests are loaded from /opt/nfd/prune and applied through the usual
// control-function loop. The first error encountered (while applying,
// waiting, or cleaning up) is returned; nil means every step completed.
func deployPrune(ctx context.Context, r *NodeFeatureDiscoveryReconciler, instance *nfdv1.NodeFeatureDiscovery) error {
	pruneResources, pruneControls := addResourcesControls("/opt/nfd/prune")

	n := NFD{rec: r, ins: instance}
	n.controls = append(n.controls, pruneControls)
	n.resources = append(n.resources, pruneResources)

	// Execute the control functions one by one; the loop always runs at
	// least once and ends after the last control has been processed.
	for finished := false; !finished; finished = n.last() {
		if err := n.step(); err != nil {
			return err
		}
	}

	namespace := instance.ObjectMeta.Namespace

	// Block until the prune Job reports at least one successful completion.
	// A lookup error aborts the wait immediately.
	waitErr := wait.Poll(RetryInterval, time.Minute*3, func() (bool, error) {
		job, err := r.getJob(ctx, namespace, nfdPruneApp)
		if err != nil {
			return false, err
		}
		return job.Status.Succeeded > 0, nil
	})
	if waitErr != nil {
		return waitErr
	}

	// Remove the Job first, then its RBAC objects. Every entry is retried
	// with the same interval/timeout until the delete call succeeds.
	cleanups := []struct {
		errLabel string
		doneMsg  string
		remove   func() error
	}{
		{
			errLabel: "Prune Job",
			doneMsg:  "nfd-prune Job resource has been deleted.",
			remove:   func() error { return r.deleteJob(ctx, namespace, nfdPruneApp) },
		},
		{
			errLabel: "Prune ServiceAccount",
			doneMsg:  "nfd-prune ServiceAccount resource has been deleted.",
			remove:   func() error { return r.deleteServiceAccount(ctx, namespace, nfdPruneApp) },
		},
		{
			errLabel: "Prune ClusterRole",
			doneMsg:  "nfd-prune ClusterRole resource has been deleted.",
			remove:   func() error { return r.deleteClusterRole(ctx, namespace, nfdPruneApp) },
		},
		{
			errLabel: "Prune ClusterRoleBinding",
			doneMsg:  "nfd-prune ClusterRoleBinding resource has been deleted.",
			remove:   func() error { return r.deleteClusterRoleBinding(ctx, namespace, nfdPruneApp) },
		},
	}

	for _, step := range cleanups {
		step := step // the closure below runs synchronously, but keep a per-iteration copy
		pollErr := wait.Poll(RetryInterval, Timeout, func() (bool, error) {
			if err := step.remove(); err != nil {
				return false, interpretError(err, step.errLabel)
			}
			klog.Info(step.doneMsg)
			return true, nil
		})
		if pollErr != nil {
			return pollErr
		}
	}

	return nil
}

// interpretError determines if a resource has already been
// (successfully) deleted
func interpretError(err error, resourceName string) error {
Expand Down
29 changes: 29 additions & 0 deletions controllers/nodefeaturediscovery_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"strings"

appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -46,6 +47,7 @@ type Resources struct {
ClusterRoleBinding rbacv1.ClusterRoleBinding
ConfigMap corev1.ConfigMap
DaemonSet appsv1.DaemonSet
Job batchv1.Job
Deployment appsv1.Deployment
Pod corev1.Pod
Service corev1.Service
Expand Down Expand Up @@ -143,6 +145,10 @@ func addResourcesControls(path string) (Resources, controlFunc) {
_, _, err := s.Decode(m, nil, &res.Deployment)
panicIfError(err)
ctrl = append(ctrl, Deployment)
case "Job":
_, _, err := s.Decode(m, nil, &res.Job)
panicIfError(err)
ctrl = append(ctrl, Job)
case "Service":
_, _, err := s.Decode(m, nil, &res.Service)
panicIfError(err)
Expand Down Expand Up @@ -184,6 +190,13 @@ func (r *NodeFeatureDiscoveryReconciler) getDeployment(ctx context.Context, name
return d, err
}

// getJob fetches the Job identified by namespace and name.
//
// The returned pointer is never nil; when the lookup fails it refers to a
// Job that was not successfully populated and must be ignored in favour of
// the returned error.
func (r *NodeFeatureDiscoveryReconciler) getJob(ctx context.Context, namespace string, name string) (*batchv1.Job, error) {
	job := new(batchv1.Job)
	key := client.ObjectKey{Namespace: namespace, Name: name}
	return job, r.Get(ctx, key, job)
}

// getConfigMap gets one of the NFD Operand's ConfigMap
func (r *NodeFeatureDiscoveryReconciler) getConfigMap(ctx context.Context, namespace string, name string) (*corev1.ConfigMap, error) {
cm := &corev1.ConfigMap{}
Expand Down Expand Up @@ -290,6 +303,22 @@ func (r *NodeFeatureDiscoveryReconciler) deleteDeployment(ctx context.Context, n
return r.Delete(context.TODO(), d)
}

// deleteJob deletes the Operand Job identified by namespace and name.
//
// A Job that no longer exists is treated as already deleted and yields nil.
// Any other lookup error, and any error from the delete call itself, is
// returned to the caller.
func (r *NodeFeatureDiscoveryReconciler) deleteJob(ctx context.Context, namespace string, name string) error {
	j, err := r.getJob(ctx, namespace, name)

	// Do not return an error if the object has already been deleted
	if k8serrors.IsNotFound(err) {
		return nil
	}

	if err != nil {
		return err
	}

	// Use the caller's context rather than context.TODO() so that
	// cancellation and deadlines also apply to the delete request.
	return r.Delete(ctx, j)
}

// deleteService deletes the NFD Operand's Service
func (r *NodeFeatureDiscoveryReconciler) deleteService(ctx context.Context, namespace string, name string) error {
svc, err := r.getService(ctx, namespace, name)
Expand Down
1 change: 1 addition & 0 deletions controllers/nodefeaturediscovery_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ const (
nfdWorkerApp string = "nfd-worker"
nfdMasterApp string = "nfd-master"
nfdTopologyUpdaterApp string = "nfd-topology-updater"
nfdPruneApp string = "nfd-prune"
)

const (
Expand Down
Loading