Skip to content

Commit

Permalink
NO-JIRA: (refactor) job completion uses event instead polling (#888)
Browse files Browse the repository at this point in the history
* refactor: job completion uses event instead polling

* chore: add watch permission to batch

* fix: handle event job as nil

* fix: listen only modified event

* fix: missing job name log

* fix: clean up debug stuff

* fix: handle event job casting fail

* refactor: codereview feedback
  • Loading branch information
Ricardo Lüders authored Mar 5, 2024
1 parent 82cb56f commit 0b8a79c
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 18 deletions.
1 change: 1 addition & 0 deletions manifests/03-clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,7 @@ rules:
- get
- list
- delete
- watch
- apiGroups:
- apps
resources:
Expand Down
51 changes: 34 additions & 17 deletions pkg/controller/periodic/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@ package periodic
import (
"context"
"fmt"
"time"

batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes"
)

Expand Down Expand Up @@ -108,19 +106,38 @@ func (j *JobController) CreateGathererJob(ctx context.Context, dataGatherName, i
return j.kubeClient.BatchV1().Jobs(insightsNamespace).Create(ctx, gj, metav1.CreateOptions{})
}

// WaitForJobCompletion polls the Kubernetes API every 20 seconds and checks if the job finished.
func (j *JobController) WaitForJobCompletion(ctx context.Context, job *batchv1.Job) error {
return wait.PollUntilContextCancel(ctx, 20*time.Second, true, func(ctx context.Context) (done bool, err error) {
j, err := j.kubeClient.BatchV1().Jobs(insightsNamespace).Get(ctx, job.Name, metav1.GetOptions{})
if errors.IsNotFound(err) {
return false, err
}
if j.Status.Succeeded > 0 {
return true, nil
}
if j.Status.Failed > 0 {
return true, fmt.Errorf("job %s failed", job.Name)
// WaitForJobCompletion listen the Kubernetes events to check if job finished.
func (j *JobController) WaitForJobCompletion(ctx context.Context, jobName string) error {
watcher, err := j.kubeClient.BatchV1().Jobs(insightsNamespace).
Watch(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("metadata.name=%s", jobName)})
if err != nil {
return err
}
defer watcher.Stop()

for {
select {
case <-ctx.Done():
return ctx.Err()
case event, ok := <-watcher.ResultChan():
if !ok {
return fmt.Errorf("watcher channel was closed unexpectedly")
}

if event.Type != watch.Modified {
continue
}

job, ok := event.Object.(*batchv1.Job)
if !ok {
return fmt.Errorf("failed to cast job event: %v", event.Object)
}
if job.Status.Succeeded > 0 {
return nil
}
if job.Status.Failed > 0 {
return fmt.Errorf("job %s failed", job.Name)
}
}
return false, nil
})
}
}
2 changes: 1 addition & 1 deletion pkg/controller/periodic/periodic.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ func (c *Controller) runJobAndCheckResults(ctx context.Context, dataGather *insi
}

klog.Infof("Created new gathering job %v", gj.Name)
err = c.jobController.WaitForJobCompletion(ctx, gj)
err = c.jobController.WaitForJobCompletion(ctx, gj.Name)
if err != nil {
if errors.Is(err, context.DeadlineExceeded) {
klog.Errorf("Failed to read job status: %v", err)
Expand Down

0 comments on commit 0b8a79c

Please sign in to comment.