diff --git a/api/models/model_endpoint_alert.go b/api/models/model_endpoint_alert.go index 8575704dc..6a846be39 100644 --- a/api/models/model_endpoint_alert.go +++ b/api/models/model_endpoint_alert.go @@ -35,7 +35,7 @@ const ( const ( throughputSliExprFormat = "round(sum(rate(revision_request_count{cluster_name=\"%s\",namespace_name=\"%s\",revision_name=~\".*%s.*\"}[1m])), 0.001)" - latencySliExprFormat = "histogram_quantile(%f, sum by(le, revision_name) (rate(revision_request_latencies_bucket{cluster_name=\"%s\",namespace_name=\"%s\",revision_name=~\".*%s.*\"}[1m])))" + latencySliExprFormat = "histogram_quantile(%f, sum by (le, revision_name) (rate(revision_request_latencies_bucket{cluster_name=\"%s\",namespace_name=\"%s\",revision_name=~\".*%s.*\"}[1m])))" errorRateSliExprHTTPFormat = "(100 * sum(rate(istio_requests_total{cluster_name=\"%s\",destination_service_name=~\"%s.*\",destination_workload_namespace=\"%s\",response_code!=\"200\",request_protocol=\"http\"}[1m])) / sum(rate(istio_requests_total{cluster_name=\"%s\",destination_service_name=~\"%s.*\",destination_workload_namespace=\"%s\",request_protocol=\"http\"}[1m])))" errorRateSliExprGRPCFormat = "(100 * sum(rate(istio_requests_total{cluster_name=\"%s\",destination_service_name=~\"%s.*\",destination_workload_namespace=\"%s\",grpc_response_status!=\"0\",request_protocol=\"grpc\"}[1m])) / sum(rate(istio_requests_total{cluster_name=\"%s\",destination_service_name=~\"%s.*\",destination_workload_namespace=\"%s\",request_protocol=\"grpc\"}[1m])))" cpuSliExprFormat = "(100 * sum(rate(container_cpu_usage_seconds_total{cluster_name=\"%s\",namespace=\"%s\",pod=~\".*%s.*\",container!~\"|POD\"}[1m])) / sum(kube_pod_container_resource_requests{resource=\"cpu\",cluster_name=\"%s\",namespace=\"%s\",pod=~\".*%s.*\",container!~\"|POD\"}))" diff --git a/api/service/testdata/model_endpoint_alert.yaml b/api/service/testdata/model_endpoint_alert.yaml index 714f5af6f..21ed45b50 100644 --- a/api/service/testdata/model_endpoint_alert.yaml +++ b/api/service/testdata/model_endpoint_alert.yaml @@ -27,7 +27,7 @@ groups: summary: Throughput (RPM) of model-1 model in env-1 is less than 2.00. Current value is {{ $value }}. - alert: "[merlin] model-1: 99.00p Latency warning" expr: |- - histogram_quantile(0.99, sum by(le, revision_name) (rate(revision_request_latencies_bucket{cluster_name="cluster-1",namespace_name="project-1",revision_name=~".*model-1.*"}[1m]))) > 3 + histogram_quantile(0.99, sum by (le, revision_name) (rate(revision_request_latencies_bucket{cluster_name="cluster-1",namespace_name="project-1",revision_name=~".*model-1.*"}[1m]))) > 3 for: 5m labels: owner: team-1 @@ -39,7 +39,7 @@ groups: summary: 99.00p latency of model-1 model ({{ $labels.revision_name }}) in env-1 is higher than 3.00 ms. Current value is {{ $value }} ms. - alert: "[merlin] model-1: 99.00p Latency critical" expr: |- - histogram_quantile(0.99, sum by(le, revision_name) (rate(revision_request_latencies_bucket{cluster_name="cluster-1",namespace_name="project-1",revision_name=~".*model-1.*"}[1m]))) > 4 + histogram_quantile(0.99, sum by (le, revision_name) (rate(revision_request_latencies_bucket{cluster_name="cluster-1",namespace_name="project-1",revision_name=~".*model-1.*"}[1m]))) > 4 for: 5m labels: owner: team-1 @@ -51,7 +51,7 @@ groups: summary: 99.00p latency of model-1 model ({{ $labels.revision_name }}) in env-1 is higher than 4.00 ms. Current value is {{ $value }} ms. - alert: "[merlin] model-1: 95.00p Latency warning" expr: |- - histogram_quantile(0.95, sum by(le, revision_name) (rate(revision_request_latencies_bucket{cluster_name="cluster-1",namespace_name="project-1",revision_name=~".*model-1.*"}[1m]))) > 5 + histogram_quantile(0.95, sum by (le, revision_name) (rate(revision_request_latencies_bucket{cluster_name="cluster-1",namespace_name="project-1",revision_name=~".*model-1.*"}[1m]))) > 5 for: 5m labels: owner: team-1 @@ -63,7 +63,7 @@ groups: summary: 95.00p latency of model-1 model ({{ $labels.revision_name }}) in env-1 is higher than 5.00 ms. Current value is {{ $value }} ms. - alert: "[merlin] model-1: 95.00p Latency critical" expr: |- - histogram_quantile(0.95, sum by(le, revision_name) (rate(revision_request_latencies_bucket{cluster_name="cluster-1",namespace_name="project-1",revision_name=~".*model-1.*"}[1m]))) > 6 + histogram_quantile(0.95, sum by (le, revision_name) (rate(revision_request_latencies_bucket{cluster_name="cluster-1",namespace_name="project-1",revision_name=~".*model-1.*"}[1m]))) > 6 for: 5m labels: owner: team-1