Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update prometheus rules for server, scheduler and executor #329

Merged
merged 1 commit into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 26 additions & 26 deletions internal/controller/install/armadaserver_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,21 +212,21 @@ func generateArmadaServerInstallComponents(as *installv1alpha1.ArmadaServer, sch
return nil, err
}

pdb := createPodDisruptionBudget(as)
pdb := createServerPodDisruptionBudget(as)
if err := controllerutil.SetOwnerReference(as, pdb, scheme); err != nil {
return nil, err
}

var pr *monitoringv1.PrometheusRule
var sm *monitoringv1.ServiceMonitor
var prometheusRule *monitoringv1.PrometheusRule
var serviceMonitor *monitoringv1.ServiceMonitor
if as.Spec.Prometheus != nil && as.Spec.Prometheus.Enabled {
pr = createServerPrometheusRule(as.Name, as.Namespace, as.Spec.Prometheus.ScrapeInterval, as.Spec.Labels, as.Spec.Prometheus.Labels)
if err := controllerutil.SetOwnerReference(as, pr, scheme); err != nil {
prometheusRule = createServerPrometheusRule(as)
if err := controllerutil.SetOwnerReference(as, prometheusRule, scheme); err != nil {
return nil, err
}

sm = createServiceMonitor(as)
if err := controllerutil.SetOwnerReference(as, sm, scheme); err != nil {
serviceMonitor = createServerServiceMonitor(as)
if err := controllerutil.SetOwnerReference(as, serviceMonitor, scheme); err != nil {
return nil, err
}
}
Expand All @@ -252,8 +252,8 @@ func generateArmadaServerInstallComponents(as *installv1alpha1.ArmadaServer, sch
ServiceAccount: serviceAccount,
Secret: secret,
PodDisruptionBudget: pdb,
PrometheusRule: pr,
ServiceMonitor: sm,
PrometheusRule: prometheusRule,
ServiceMonitor: serviceMonitor,
Jobs: jobs,
}, nil

Expand Down Expand Up @@ -642,15 +642,15 @@ func createIngressHttp(as *installv1alpha1.ArmadaServer) (*networkingv1.Ingress,
return restIngress, nil
}

func createPodDisruptionBudget(as *installv1alpha1.ArmadaServer) *policyv1.PodDisruptionBudget {
func createServerPodDisruptionBudget(as *installv1alpha1.ArmadaServer) *policyv1.PodDisruptionBudget {
return &policyv1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: as.Name, Namespace: as.Namespace},
Spec: policyv1.PodDisruptionBudgetSpec{},
Status: policyv1.PodDisruptionBudgetStatus{},
}
}

func createServiceMonitor(as *installv1alpha1.ArmadaServer) *monitoringv1.ServiceMonitor {
func createServerServiceMonitor(as *installv1alpha1.ArmadaServer) *monitoringv1.ServiceMonitor {
var prometheusLabels map[string]string
if as.Spec.Prometheus != nil {
prometheusLabels = as.Spec.Prometheus.Labels
Expand Down Expand Up @@ -693,34 +693,34 @@ func (r *ArmadaServerReconciler) SetupWithManager(mgr ctrl.Manager) error {
}

// createServerPrometheusRule builds the PrometheusRule for the given ArmadaServer, using the server's configured Prometheus scrape interval or defaultPrometheusInterval when none is set.
func createServerPrometheusRule(name, namespace string, scrapeInterval *metav1.Duration, labels ...map[string]string) *monitoringv1.PrometheusRule {
if scrapeInterval == nil {
scrapeInterval = &metav1.Duration{Duration: defaultPrometheusInterval}
}
queueSize := `avg(sum(armada_queue_size) by (queueName, pod)) by (queueName) > 0`
func createServerPrometheusRule(server *installv1alpha1.ArmadaServer) *monitoringv1.PrometheusRule {
queueSize := `max(sum(armada_queue_size) by (queueName, pod)) by (queueName) > 0`
queuePriority := `avg(sum(armada_queue_priority) by (pool, queueName, pod)) by (pool, queueName)`
queueIdeal := `(sum(armada:queue:resource:queued{resourceType="cpu"} > bool 0) by (queueName, pool) * (1 / armada:queue:priority))
/ ignoring(queueName) group_left
sum(sum(armada:queue:resource:queued{resourceType="cpu"} > bool 0) by (queueName, pool) * (1 / armada:queue:priority)) by (pool)
* 100`
queueResourceQueued := `avg(armada_queue_resource_queued) by (pool, queueName, resourceType)`
queueResourceAllocated := `avg(armada_queue_resource_allocated) by (pool, cluster, queueName, resourceType, nodeType)`
queueResourceUsed := `avg(armada_queue_resource_used) by (pool, cluster, queueName, resourceType, nodeType)`
/ ignoring(queueName) group_left sum(sum(armada:queue:resource:queued{resourceType="cpu"} > bool 0) by (queueName, pool) * (1 / armada:queue:priority)) by (pool) * 100`

queueResourceQueued := `max(sum(armada_queue_resource_queued) by (pod, pool, queueName, resourceType)) by (pool, queueName, resourceType)`
queueResourceAllocated := `max(sum(armada_queue_resource_allocated) by (pod, pool, cluster, queueName, resourceType, nodeType)) by (pool, cluster, queueName, resourceType, nodeType)`
queueResourceUsed := `max(sum(armada_queue_resource_used) by (pod, pool, cluster, queueName, resourceType, nodeType)) by (pool, cluster, queueName, resourceType, nodeType)`
serverHist := `histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{grpc_type!="server_stream"}[2m])) by (grpc_method,grpc_service, le))`
serverRequestRate := `sum(rate(grpc_server_handled_total[2m])) by (grpc_method,grpc_service)`
logRate := `sum(rate(log_messages[2m])) by (level)`
availableCapacity := `avg(armada_cluster_available_capacity) by (pool, cluster, resourceType, nodeType)`
resourceCapacity := `avg(armada_cluster_capacity) by (pool, cluster, resourceType, nodeType)`
queuePodPhaseCount := `max(armada_queue_leased_pod_count) by (pool, cluster, queueName, phase, nodeType)`

scrapeInterval := &metav1.Duration{Duration: defaultPrometheusInterval}
if interval := server.Spec.Prometheus.ScrapeInterval; interval != nil {
scrapeInterval = &metav1.Duration{Duration: interval.Duration}
}
durationString := duration.ShortHumanDuration(scrapeInterval.Duration)
objectMetaName := "armada-" + name + "-metrics"
objectMetaName := "armada-" + server.Name + "-metrics"
return &monitoringv1.PrometheusRule{
TypeMeta: metav1.TypeMeta{},
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: namespace,
Labels: AllLabels(name, labels...),
Name: server.Name,
Namespace: server.Namespace,
Labels: AllLabels(server.Name, server.Labels, server.Spec.Prometheus.Labels),
},
Spec: monitoringv1.PrometheusRuleSpec{
Groups: []monitoringv1.RuleGroup{{
Expand Down
38 changes: 22 additions & 16 deletions internal/controller/install/executor_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,13 +205,15 @@ func (r *ExecutorReconciler) generateExecutorInstallComponents(executor *install
}

if executor.Spec.Prometheus != nil && executor.Spec.Prometheus.Enabled {
serviceMonitor := r.createServiceMonitor(executor)
if err = controllerutil.SetOwnerReference(executor, serviceMonitor, scheme); err != nil {
components.ServiceMonitor = r.createServiceMonitor(executor)
if err = controllerutil.SetOwnerReference(executor, components.ServiceMonitor, scheme); err != nil {
return nil, err
}
components.ServiceMonitor = serviceMonitor

components.PrometheusRule = createExecutorPrometheusRule(executor.Name, executor.Namespace, executor.Spec.Prometheus.ScrapeInterval, executor.Spec.Labels, executor.Spec.Prometheus.Labels)
components.PrometheusRule = createExecutorPrometheusRule(executor)
if err = controllerutil.SetOwnerReference(executor, components.PrometheusRule, scheme); err != nil {
return nil, err
}
}

return components, nil
Expand Down Expand Up @@ -455,33 +457,37 @@ func (r *ExecutorReconciler) deleteExternalResources(ctx context.Context, compon
}

// createExecutorPrometheusRule builds the PrometheusRule for the given Executor, using the executor's configured Prometheus scrape interval or defaultPrometheusInterval when none is set.
func createExecutorPrometheusRule(name, namespace string, scrapeInterval *metav1.Duration, labels ...map[string]string) *monitoringv1.PrometheusRule {
if scrapeInterval == nil {
scrapeInterval = &metav1.Duration{Duration: defaultPrometheusInterval}
}
func createExecutorPrometheusRule(executor *installv1alpha1.Executor) *monitoringv1.PrometheusRule {
// restRequestHistogram is the 95th-percentile REST client request duration for this executor's service; the expression is kept aligned with the Helm chart.
restRequestHistogram := `histogram_quantile(0.95, ` +
`sum(rate(rest_client_request_duration_seconds_bucket{service="` + name + `"}[2m])) by (endpoint, verb, url, le))`
`sum(rate(rest_client_request_duration_seconds_bucket{service="` + executor.Name + `"}[2m])) by (endpoint, verb, url, le))`
logRate := "sum(rate(log_messages[2m])) by (level)"

// Set the group name and duration string to match the Helm template
scrapeInterval := &metav1.Duration{Duration: defaultPrometheusInterval}
if interval := executor.Spec.Prometheus.ScrapeInterval; interval != nil {
scrapeInterval = &metav1.Duration{Duration: interval.Duration}
}
durationString := duration.ShortHumanDuration(scrapeInterval.Duration)
objectMetaName := "armada-" + name + "-metrics"
objectMetaName := "armada-" + executor.Name + "-metrics"

return &monitoringv1.PrometheusRule{
TypeMeta: metav1.TypeMeta{},
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: namespace,
Labels: AllLabels(name, labels...),
Name: executor.Name,
Namespace: executor.Namespace,
Labels: AllLabels(executor.Name, executor.Labels, executor.Spec.Prometheus.Labels),
},
Spec: monitoringv1.PrometheusRuleSpec{
Groups: []monitoringv1.RuleGroup{{
Name: objectMetaName,
Interval: ptr.To(monitoringv1.Duration(durationString)),
Rules: []monitoringv1.Rule{
{
Record: "armada:" + name + ":rest:request:histogram95",
Record: "armada:executor:rest:request:histogram95",
Expr: intstr.IntOrString{StrVal: restRequestHistogram},
},
{
Record: "armada:" + name + ":log:rate",
Record: "armada:executor:log:rate",
Expr: intstr.IntOrString{StrVal: logRate},
},
},
Expand Down
Loading
Loading