Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding metric unavailability to events #864

Merged
merged 3 commits into from
Oct 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions manifests/v1alpha3/katib-controller/crd-trial.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ metadata:
spec:
additionalPrinterColumns:
- JSONPath: .status.conditions[-1:].type
name: Type
type: string
- JSONPath: .status.conditions[-1:].status
name: Status
type: string
- JSONPath: .metadata.creationTimestamp
Expand Down
4 changes: 2 additions & 2 deletions pkg/apis/controller/trials/v1alpha3/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,12 @@ func (trial *Trial) MarkTrialStatusRunning(reason, message string) {
trial.setCondition(TrialRunning, v1.ConditionTrue, reason, message)
}

func (trial *Trial) MarkTrialStatusSucceeded(reason, message string) {
func (trial *Trial) MarkTrialStatusSucceeded(status v1.ConditionStatus, reason, message string) {
currentCond := getCondition(trial, TrialRunning)
if currentCond != nil {
trial.setCondition(TrialRunning, v1.ConditionFalse, currentCond.Reason, currentCond.Message)
}
trial.setCondition(TrialSucceeded, v1.ConditionTrue, reason, message)
trial.setCondition(TrialSucceeded, status, reason, message)

}

Expand Down
4 changes: 1 addition & 3 deletions pkg/controller.v1alpha3/trial/trial_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,8 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1alpha3.Trial) error {
// if the job has succeeded and the observation field is available,
// or if the job has failed.
// This ensures that the trial is marked complete only if metrics have been collected at least once.
if isTrialComplete(instance, jobCondition) {
r.UpdateTrialStatusCondition(instance, deployedJob, jobCondition)
r.UpdateTrialStatusCondition(instance, deployedJob, jobCondition)

}
}
return nil
}
Expand Down
20 changes: 11 additions & 9 deletions pkg/controller.v1alpha3/trial/trial_controller_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@ const (
DefaultJobKind = "Job"

// For trials
TrialCreatedReason = "TrialCreated"
TrialRunningReason = "TrialRunning"
TrialSucceededReason = "TrialSucceeded"
TrialFailedReason = "TrialFailed"
TrialKilledReason = "TrialKilled"
TrialCreatedReason = "TrialCreated"
TrialRunningReason = "TrialRunning"
TrialSucceededReason = "TrialSucceeded"
TrialMetricsUnavailableReason = "MetricsUnavailable"
TrialFailedReason = "TrialFailed"
TrialKilledReason = "TrialKilled"

// For Jobs
JobCreatedReason = "JobCreated"
JobDeletedReason = "JobDeleted"
JobSucceededReason = "JobSucceeded"
JobFailedReason = "JobFailed"
JobCreatedReason = "JobCreated"
JobDeletedReason = "JobDeleted"
JobSucceededReason = "JobSucceeded"
JobMetricsUnavailableReason = "MetricsUnavailable"
JobFailedReason = "JobFailed"
)
3 changes: 2 additions & 1 deletion pkg/controller.v1alpha3/trial/trial_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/golang/mock/gomock"
"github.com/onsi/gomega"
"golang.org/x/net/context"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
Expand Down Expand Up @@ -222,7 +223,7 @@ func TestReconcileCompletedTFJobTrial(t *testing.T) {
return c.Get(context.TODO(), expectedRequest.NamespacedName, instance)
}, timeout).
Should(gomega.Succeed())
instance.MarkTrialStatusSucceeded("", "")
instance.MarkTrialStatusSucceeded(corev1.ConditionTrue, "", "")
g.Expect(c.Status().Update(context.TODO(), instance)).NotTo(gomega.HaveOccurred())
g.Eventually(func() bool {
err := c.Get(context.TODO(), expectedRequest.NamespacedName, instance)
Expand Down
25 changes: 16 additions & 9 deletions pkg/controller.v1alpha3/trial/trial_controller_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,23 +91,30 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1alpha3.Tri
now := metav1.Now()
jobConditionType := (*jobCondition).Type
if jobConditionType == commonv1.JobSucceeded {
msg := "Trial has succeeded"
instance.MarkTrialStatusSucceeded(TrialSucceededReason, msg)
instance.Status.CompletionTime = &now
if isTrialObservationAvailable(instance) {
msg := "Trial has succeeded"
instance.MarkTrialStatusSucceeded(corev1.ConditionTrue, TrialSucceededReason, msg)
instance.Status.CompletionTime = &now

eventMsg := fmt.Sprintf("Job %s has succeeded", deployedJob.GetName())
r.recorder.Eventf(instance, corev1.EventTypeNormal, JobSucceededReason, eventMsg)
} else {
msg := "Metrics are not available"
instance.MarkTrialStatusSucceeded(corev1.ConditionFalse, TrialMetricsUnavailableReason, msg)

eventMsg := fmt.Sprintf("Metrics are not available for Job %s", deployedJob.GetName())
r.recorder.Eventf(instance, corev1.EventTypeWarning, JobMetricsUnavailableReason, eventMsg)
}
} else if jobConditionType == commonv1.JobFailed {
msg := "Trial has failed"
instance.MarkTrialStatusFailed(TrialFailedReason, msg)
instance.Status.CompletionTime = &now
}
//else nothing to do
if jobConditionType == commonv1.JobSucceeded {
eventMsg := fmt.Sprintf("Job %s has succeeded", deployedJob.GetName())
r.recorder.Eventf(instance, corev1.EventTypeNormal, JobSucceededReason, eventMsg)
} else if jobConditionType == commonv1.JobFailed {

jobConditionMessage := (*jobCondition).Message
eventMsg := fmt.Sprintf("Job %s has failed: %s", deployedJob.GetName(), jobConditionMessage)
r.recorder.Eventf(instance, corev1.EventTypeNormal, JobFailedReason, eventMsg)
}
//else nothing to do
return
}

Expand Down