Skip to content

Commit

Permalink
Enable automated Prune via CRD NodeFeatureDiscoveries
Browse files Browse the repository at this point in the history
Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
  • Loading branch information
ArangoGutierrez committed Mar 14, 2023
1 parent e736b48 commit 00b2e3c
Show file tree
Hide file tree
Showing 11 changed files with 317 additions and 1 deletion.
6 changes: 6 additions & 0 deletions api/v1/nodefeaturediscovery_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ type NodeFeatureDiscoverySpec struct {
// worker.
// +optional
WorkerConfig ConfigMap `json:"workerConfig"`

// PruneOnDelete defines whether the NFD-master prune should be
// enabled or not. If enabled, the Operator will deploy an NFD-Master prune
// job that will remove all NFD labels from the cluster nodes.
// +optional
PruneOnDelete bool `json:"prunerOnDelete"`
}

// OperandSpec describes configuration options for the operand
Expand Down
4 changes: 4 additions & 0 deletions build/assets/prune/0100_service_account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# ServiceAccount shipped with the prune assets.
# NOTE(review): presumably the identity the nfd-prune Job is meant to run
# under - confirm against the Job and ClusterRoleBinding manifests.
apiVersion: v1
kind: ServiceAccount
metadata:
name: nfd-prune
24 changes: 24 additions & 0 deletions build/assets/prune/0200_clusterrole.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# ClusterRole for the prune assets: read/modify access to Node objects and
# read-only access to the NFD custom resources.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nfd-prune
rules:
# Node access; patch/update are needed to change node objects.
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - patch
  - update
  - list
# Read-only access to the NFD API objects.
- apiGroups:
  - nfd.k8s-sigs.io
  resources:
  - nodefeatures
  - nodefeaturerules
  verbs:
  - get
  - list
  - watch
13 changes: 13 additions & 0 deletions build/assets/prune/0300_clusterole_binding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Binds the nfd-prune ClusterRole to the nfd-prune ServiceAccount so the
# permissions defined for pruning are granted to the prune identity rather
# than to nfd-master.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nfd-prune
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nfd-prune
subjects:
- kind: ServiceAccount
  name: nfd-prune
  namespace: node-feature-discovery-operator

61 changes: 61 additions & 0 deletions build/assets/prune/0400_prune_job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# One-shot Job that runs `nfd-master -prune`.
apiVersion: batch/v1
kind: Job
metadata:
  labels:
    app: nfd
  name: nfd-prune
spec:
  completions: 1
  template:
    metadata:
      labels:
        app: nfd-prune
    spec:
      # Prefer (but do not require) master / control-plane nodes.
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/master
                operator: In
                values:
                - ""
            weight: 1
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/control-plane
                operator: In
                values:
                - ""
            weight: 1
      containers:
      - args:
        - -prune
        command:
        - nfd-master
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        # Placeholder; the operator's Job control function overwrites the
        # image of the first container before creating the Job.
        image: $(NODE_FEATURE_DISCOVERY_IMAGE)
        imagePullPolicy: Always
        name: nfd-master
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
          readOnlyRootFilesystem: true
          runAsNonRoot: true
      restartPolicy: Never
      # Run under the ServiceAccount defined in this asset directory instead
      # of the nfd-master ServiceAccount.
      serviceAccount: nfd-prune
      # Allow scheduling onto tainted master / control-plane nodes.
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Equal
        value: ""
      - effect: NoSchedule
        key: node-role.kubernetes.io/control-plane
        operator: Equal
        value: ""
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.8.0
api-approved.kubernetes.io: unapproved, experimental-only
creationTimestamp: null
name: nodefeaturediscoveries.nfd.kubernetes.io
spec:
Expand Down Expand Up @@ -71,6 +72,11 @@ spec:
listens for incoming requests.
type: integer
type: object
prunerOnDelete:
description: PruneOnDelete defines whether the NFD-master prune should
be enabled or not. If enabled, the Operator will deploy an NFD-Master
prune job that will remove all NFD labels from the cluster nodes.
type: boolean
resourceLabels:
description: ResourceLabels defines the list of features to be advertised
as extended resources instead of labels.
Expand Down
62 changes: 62 additions & 0 deletions controllers/nodefeaturediscovery_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"strings"

appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -528,6 +529,67 @@ func Deployment(n NFD) (ResourceStatus, error) {
return Ready, nil
}

// Job is the control function for a batch/v1 Job resource. It stamps the
// operand image, the optional image pull policy, the namespace and a
// controller owner reference onto the Job held in n.resources[n.idx], then
// creates that Job if the cluster does not have it yet, or updates it if it
// does. It returns Ready on a successful create/update and NotReady together
// with the underlying error otherwise.
func Job(n NFD) (ResourceStatus, error) {
	// n.idx selects the resource this control function is responsible for.
	// The caller is expected to have verified that the entry is a Job.
	idx := n.idx
	job := n.resources[idx].Job

	// The first container always runs the NFD operand image.
	job.Spec.Template.Spec.Containers[0].Image = n.ins.Spec.Operand.ImagePath()

	// Only override the manifest's pull policy when one is configured.
	if policy := n.ins.Spec.Operand.ImagePullPolicy; policy != "" {
		job.Spec.Template.Spec.Containers[0].ImagePullPolicy = n.ins.Spec.Operand.ImagePolicy(policy)
	}

	// The Job lives in the same namespace as the NFD instance; the
	// namespace is assumed to have been determined by the caller.
	job.SetNamespace(n.ins.GetNamespace())

	klog.InfoS("Looking for Job", "name", job.Name, "namespace", job.Namespace)

	// Make the NFD instance the controlling owner of the Job so that the
	// Job is garbage collected with it and changes to the Job trigger a
	// reconcile of the owner. Without an owner the resource is NotReady.
	if ownerErr := controllerutil.SetControllerReference(n.ins, &job, n.rec.Scheme); ownerErr != nil {
		return NotReady, ownerErr
	}

	// Look the Job up in the cluster: a missing Job is created, any other
	// lookup failure is reported, and an existing Job is updated below.
	existing := &batchv1.Job{}
	key := types.NamespacedName{Namespace: job.Namespace, Name: job.Name}
	getErr := n.rec.Client.Get(context.TODO(), key, existing)
	switch {
	case getErr == nil:
		// The Job exists; continue with the update after the switch.
	case errors.IsNotFound(getErr):
		klog.InfoS("Job not found, creating", "name", job.Name, "namespace", job.Namespace)
		if createErr := n.rec.Client.Create(context.TODO(), &job); createErr != nil {
			klog.ErrorS(createErr, "Couldn't create Job", "name", job.Name, "namespace", job.Namespace)
			return NotReady, createErr
		}
		return Ready, nil
	default:
		return NotReady, getErr
	}

	// The Job already exists, so push the desired state to the cluster.
	klog.InfoS("Job found, updating", "name", job.Name, "namespace", job.Namespace)
	if updateErr := n.rec.Client.Update(context.TODO(), &job); updateErr != nil {
		return NotReady, updateErr
	}

	return Ready, nil
}

// Service checks if a Service exists and creates one if it doesn't exist
func Service(n NFD) (ResourceStatus, error) {
// state represents the resource's 'control' function index
Expand Down
105 changes: 105 additions & 0 deletions controllers/nodefeaturediscovery_finalizers.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ func (r *NodeFeatureDiscoveryReconciler) finalizeNFDOperand(ctx context.Context,
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
}

if instance.Spec.PruneOnDelete {
klog.Info("Deleting NFD labels and NodeFeature CRs from cluster")
if err := deployPrune(ctx, r, instance); err != nil {
klog.Error(err, "Failed to delete NFD labels and NodeFeature CRs from cluster")
return ctrl.Result{}, err
}
} else {
klog.Info("PruneOnDelete is disabled, NFD labels and NodeFeature CRs will not be deleted from cluster")
}

// If all components are deleted, then remove the finalizer
klog.Info("Secondary check passed. Removing finalizer if it exists.")
if r.hasFinalizer(instance, finalizer) {
Expand Down Expand Up @@ -363,6 +373,101 @@ func (r *NodeFeatureDiscoveryReconciler) doComponentsExist(ctx context.Context,
return false
}

// deployPrune deploys the resources found under /opt/nfd/prune (which run
// nfd-master with the prune option to remove labels and NodeFeature CRs),
// waits for the prune Job to report success, and then removes the prune
// ServiceAccount, ClusterRole, ClusterRoleBinding and Job again.
//
// It blocks for up to three minutes waiting for the Job to succeed and up
// to Timeout for each cleanup step. The first error encountered is
// returned; nil means the prune ran and all helper resources were removed.
func deployPrune(ctx context.Context, r *NodeFeatureDiscoveryReconciler, instance *nfdv1.NodeFeatureDiscovery) error {
	resources, controls := addResourcesControls("/opt/nfd/prune")
	n := NFD{rec: r, ins: instance, idx: 0}
	n.controls = append(n.controls, controls)
	n.resources = append(n.resources, resources)

	// Execute every control function in order; any failing step aborts.
	for finished := false; !finished; finished = n.last() {
		if stepErr := n.step(); stepErr != nil {
			return stepErr
		}
	}

	ns := instance.ObjectMeta.Namespace

	// Wait for the prune Job to report at least one succeeded pod. A
	// failure to fetch the Job ends the wait immediately.
	waitErr := wait.Poll(RetryInterval, 3*time.Minute, func() (bool, error) {
		job, getErr := r.getJob(ctx, ns, nfdPruneApp)
		if getErr != nil {
			return false, getErr
		}
		return job.Status.Succeeded > 0, nil
	})
	if waitErr != nil {
		return waitErr
	}

	// Cleanup steps, executed in this order. Each one is retried until the
	// delete call succeeds, interpretError reports a real failure, or
	// Timeout expires.
	cleanup := []struct {
		kind   string
		remove func() error
	}{
		{"ServiceAccount", func() error { return r.deleteServiceAccount(ctx, ns, nfdPruneApp) }},
		{"ClusterRole", func() error { return r.deleteClusterRole(ctx, ns, nfdPruneApp) }},
		{"ClusterRoleBinding", func() error { return r.deleteClusterRoleBinding(ctx, ns, nfdPruneApp) }},
		{"Job", func() error { return r.deleteJob(ctx, ns, nfdPruneApp) }},
	}
	for _, step := range cleanup {
		kind, remove := step.kind, step.remove
		pollErr := wait.Poll(RetryInterval, Timeout, func() (bool, error) {
			if delErr := remove(); delErr != nil {
				return false, interpretError(delErr, "Prune "+kind)
			}
			klog.Info("nfd-prune " + kind + " resource has been deleted.")
			return true, nil
		})
		if pollErr != nil {
			return pollErr
		}
	}

	return nil
}

// interpretError determines if a resource has already been
// (successfully) deleted
func interpretError(err error, resourceName string) error {
Expand Down
29 changes: 29 additions & 0 deletions controllers/nodefeaturediscovery_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"strings"

appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -46,6 +47,7 @@ type Resources struct {
ClusterRoleBinding rbacv1.ClusterRoleBinding
ConfigMap corev1.ConfigMap
DaemonSet appsv1.DaemonSet
Job batchv1.Job
Deployment appsv1.Deployment
Pod corev1.Pod
Service corev1.Service
Expand Down Expand Up @@ -143,6 +145,10 @@ func addResourcesControls(path string) (Resources, controlFunc) {
_, _, err := s.Decode(m, nil, &res.Deployment)
panicIfError(err)
ctrl = append(ctrl, Deployment)
case "Job":
_, _, err := s.Decode(m, nil, &res.Job)
panicIfError(err)
ctrl = append(ctrl, Job)
case "Service":
_, _, err := s.Decode(m, nil, &res.Service)
panicIfError(err)
Expand Down Expand Up @@ -184,6 +190,13 @@ func (r *NodeFeatureDiscoveryReconciler) getDeployment(ctx context.Context, name
return d, err
}

// getJob fetches the Job identified by namespace and name. The returned
// pointer is always non-nil; when the lookup fails it refers to a Job that
// was not populated and the error from the client is returned alongside it.
func (r *NodeFeatureDiscoveryReconciler) getJob(ctx context.Context, namespace string, name string) (*batchv1.Job, error) {
	key := client.ObjectKey{Namespace: namespace, Name: name}
	job := new(batchv1.Job)
	lookupErr := r.Get(ctx, key, job)
	return job, lookupErr
}

// getConfigMap gets one of the NFD Operand's ConfigMap
func (r *NodeFeatureDiscoveryReconciler) getConfigMap(ctx context.Context, namespace string, name string) (*corev1.ConfigMap, error) {
cm := &corev1.ConfigMap{}
Expand Down Expand Up @@ -290,6 +303,22 @@ func (r *NodeFeatureDiscoveryReconciler) deleteDeployment(ctx context.Context, n
return r.Delete(context.TODO(), d)
}

// deleteJob deletes the Operand Job identified by namespace and name.
//
// A Job that no longer exists is treated as already deleted and yields nil.
// Any other lookup error, or an error from the delete call itself, is
// returned to the caller. The caller's ctx is used for both the lookup and
// the delete so that cancellation and deadlines are honoured.
func (r *NodeFeatureDiscoveryReconciler) deleteJob(ctx context.Context, namespace string, name string) error {
	j, err := r.getJob(ctx, namespace, name)
	if err != nil {
		// Do not return an error if the object has already been deleted
		if k8serrors.IsNotFound(err) {
			return nil
		}
		return err
	}

	// batch/v1 Jobs default to orphaning their pods on delete; request
	// background propagation so the Job's pods are garbage collected too.
	return r.Delete(ctx, j, client.PropagationPolicy("Background"))
}

// deleteService deletes the NFD Operand's Service
func (r *NodeFeatureDiscoveryReconciler) deleteService(ctx context.Context, namespace string, name string) error {
svc, err := r.getService(ctx, namespace, name)
Expand Down
Loading

0 comments on commit 00b2e3c

Please sign in to comment.