-
Notifications
You must be signed in to change notification settings - Fork 137
Description
I have forked the repo, reviewed CONTRIBUTING.md, and am now looking at #361 as an example.
I'd like to add a ./pyrra az command that behaves like the generate command but outputs a .bicep file in a format compatible with Azure Managed Prometheus. The format would be similar to https://github.com/Azure/prometheus-collector/blob/main/AddonBicepTemplate/recommendedMetricAlerts.bicep or to the example .bicep below, which I generated with the following pipeline: sloth --> az-prom-rules-converter utility --> az bicep decompile --file $FILE.
If demand for this is low, I suggest that I simply maintain the az command in a fork of pyrra. Since Google Cloud Observability and Amazon CloudWatch both have managed solutions for creating and managing SLOs, it is quite likely that Azure is frantically trying to catch up and build its own managed solution. If a managed SLO solution appears in Azure, ./pyrra az would likely have no reason to exist; in other words, the need for it may be temporary.
Example Input
# Pyrra ServiceLevelObjective: ratio-based SLO over the NServiceBus messaging
# failure and total metrics in the "somedomain" namespace.
apiVersion: pyrra.dev/v1alpha1
kind: ServiceLevelObjective
metadata:
  name: "somedomain-sb-failures-uat"
  namespace: "somedomain"
  labels:
    prometheus: k8s
    role: alert-rules
    pyrra.dev/team: somedomain # Any labels prefixed with 'pyrra.dev/' will be propagated as Prometheus labels, while stripping the prefix.
spec:
  # We allow 15 failing requests out of every 100 requests (85% target).
  target: 85.0
  window: 4w
  description: "SLO based on failures for all SB services in the somedomain namespace."
  indicator:
    ratio:
      # Numerator: failed messages.
      errors:
        metric: nservicebus_messaging_failures{kubernetes_namespace="somedomain"}
      # Denominator: all messages.
      total:
        metric: nservicebus_messaging_total{kubernetes_namespace="somedomain"}
      # Labels the generated rules aggregate by.
      grouping:
        - app_kubernetes_io_name
        - nservicebus_message_type
Example Bicep Output
// Template parameters shared by the Prometheus rule groups declared below.
// location defaults to the location of the resource group being deployed to.
param location string = resourceGroup().location
// clusterName is passed through to each rule group's properties.clusterName.
@description('Cluster name')
param clusterName string
// actionGroupId is referenced by the actions of every alert rule.
@description('Action Group ResourceId')
param actionGroupId string
// azureMonitorWorkspace is the single entry in each rule group's scopes.
@description('ResourceId of Azure monitor workspace to associate to')
param azureMonitorWorkspace string
// Rule group with the 4-week increase recording rules and the absent-metric
// alerts for the 'somedomain-sb-failures-uat' SLO.
resource somedomain_sb_failures_uat_increase 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
  name: 'somedomain-sb-failures-uat-increase'
  location: location
  properties: {
    interval: 'PT2M30S'
    scopes: [
      azureMonitorWorkspace
    ]
    clusterName: clusterName
    rules: [
      // Recording rule: 4w increase of the total metric, summed by the grouping labels.
      {
        labels: {
          kubernetes_namespace: 'somedomain'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        record: 'nservicebus_messaging:increase4w'
        expression: 'sum by (app_kubernetes_io_name, nservicebus_message_type) (increase(nservicebus_messaging_total{kubernetes_namespace="somedomain"}[4w]))'
      }
      // Alert: the total metric is absent (held for 2 minutes).
      {
        severity: 3
        resolveConfiguration: {
          autoResolved: true
          timeToResolve: 'PT10M'
        }
        actions: [
          {
            actionGroupId: actionGroupId
          }
        ]
        alert: 'SLOMetricAbsent'
        for: 'PT2M'
        labels: {
          kubernetes_namespace: 'somedomain'
          severity: 'critical'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        expression: 'absent(nservicebus_messaging_total{kubernetes_namespace="somedomain"}) == 1'
      }
      // Recording rule: 4w increase of the failures metric, summed by the grouping labels.
      {
        labels: {
          kubernetes_namespace: 'somedomain'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        record: 'nservicebus_messaging_failures:increase4w'
        expression: 'sum by (app_kubernetes_io_name, nservicebus_message_type) (increase(nservicebus_messaging_failures{kubernetes_namespace="somedomain"}[4w]))'
      }
      // Alert: the failures metric is absent (held for 2 minutes).
      {
        severity: 3
        resolveConfiguration: {
          autoResolved: true
          timeToResolve: 'PT10M'
        }
        actions: [
          {
            actionGroupId: actionGroupId
          }
        ]
        alert: 'SLOMetricAbsent'
        for: 'PT2M'
        labels: {
          kubernetes_namespace: 'somedomain'
          severity: 'critical'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        expression: 'absent(nservicebus_messaging_failures{kubernetes_namespace="somedomain"}) == 1'
      }
    ]
  }
}
// Rule group with the burn-rate recording rules (failures rate / total rate over
// several windows) and the ErrorBudgetBurn alerts for the
// 'somedomain-sb-failures-uat' SLO.
resource somedomain_sb_failures_uat 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = {
  name: 'somedomain-sb-failures-uat'
  location: location
  properties: {
    interval: 'PT30S'
    scopes: [
      azureMonitorWorkspace
    ]
    clusterName: clusterName
    rules: [
      // Recording rule: burn rate over a 5m window.
      {
        labels: {
          kubernetes_namespace: 'somedomain'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        record: 'nservicebus_messaging:burnrate5m'
        expression: 'sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_failures{kubernetes_namespace="somedomain"}[5m])) / sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_total{kubernetes_namespace="somedomain"}[5m]))'
      }
      // Recording rule: burn rate over a 30m window.
      {
        labels: {
          kubernetes_namespace: 'somedomain'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        record: 'nservicebus_messaging:burnrate30m'
        expression: 'sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_failures{kubernetes_namespace="somedomain"}[30m])) / sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_total{kubernetes_namespace="somedomain"}[30m]))'
      }
      // Recording rule: burn rate over a 1h window.
      {
        labels: {
          kubernetes_namespace: 'somedomain'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        record: 'nservicebus_messaging:burnrate1h'
        expression: 'sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_failures{kubernetes_namespace="somedomain"}[1h])) / sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_total{kubernetes_namespace="somedomain"}[1h]))'
      }
      // Recording rule: burn rate over a 2h window.
      {
        labels: {
          kubernetes_namespace: 'somedomain'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        record: 'nservicebus_messaging:burnrate2h'
        expression: 'sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_failures{kubernetes_namespace="somedomain"}[2h])) / sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_total{kubernetes_namespace="somedomain"}[2h]))'
      }
      // Recording rule: burn rate over a 6h window.
      {
        labels: {
          kubernetes_namespace: 'somedomain'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        record: 'nservicebus_messaging:burnrate6h'
        expression: 'sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_failures{kubernetes_namespace="somedomain"}[6h])) / sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_total{kubernetes_namespace="somedomain"}[6h]))'
      }
      // Recording rule: burn rate over a 1d window.
      {
        labels: {
          kubernetes_namespace: 'somedomain'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        record: 'nservicebus_messaging:burnrate1d'
        expression: 'sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_failures{kubernetes_namespace="somedomain"}[1d])) / sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_total{kubernetes_namespace="somedomain"}[1d]))'
      }
      // Recording rule: burn rate over a 4d window.
      {
        labels: {
          kubernetes_namespace: 'somedomain'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        record: 'nservicebus_messaging:burnrate4d'
        expression: 'sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_failures{kubernetes_namespace="somedomain"}[4d])) / sum by (app_kubernetes_io_name, nservicebus_message_type) (rate(nservicebus_messaging_total{kubernetes_namespace="somedomain"}[4d]))'
      }
      // Alert: 5m and 1h burn rates both exceed 14 * (1-0.85), held for 2 minutes.
      {
        severity: 3
        resolveConfiguration: {
          autoResolved: true
          timeToResolve: 'PT10M'
        }
        actions: [
          {
            actionGroupId: actionGroupId
          }
        ]
        alert: 'ErrorBudgetBurn'
        for: 'PT2M'
        labels: {
          exhaustion: '2d'
          kubernetes_namespace: 'somedomain'
          long: '1h'
          severity: 'critical'
          short: '5m'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        expression: 'nservicebus_messaging:burnrate5m{kubernetes_namespace="somedomain",slo="somedomain-sb-failures-uat"} > (14 * (1-0.85)) and nservicebus_messaging:burnrate1h{kubernetes_namespace="somedomain",slo="somedomain-sb-failures-uat"} > (14 * (1-0.85))'
      }
      // Alert: 30m and 6h burn rates both exceed 7 * (1-0.85), held for 15 minutes.
      {
        severity: 3
        resolveConfiguration: {
          autoResolved: true
          timeToResolve: 'PT10M'
        }
        actions: [
          {
            actionGroupId: actionGroupId
          }
        ]
        alert: 'ErrorBudgetBurn'
        for: 'PT15M'
        labels: {
          exhaustion: '4d'
          kubernetes_namespace: 'somedomain'
          long: '6h'
          severity: 'critical'
          short: '30m'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        expression: 'nservicebus_messaging:burnrate30m{kubernetes_namespace="somedomain",slo="somedomain-sb-failures-uat"} > (7 * (1-0.85)) and nservicebus_messaging:burnrate6h{kubernetes_namespace="somedomain",slo="somedomain-sb-failures-uat"} > (7 * (1-0.85))'
      }
      // Alert: 2h and 1d burn rates both exceed 2 * (1-0.85), held for 1 hour.
      {
        severity: 3
        resolveConfiguration: {
          autoResolved: true
          timeToResolve: 'PT10M'
        }
        actions: [
          {
            actionGroupId: actionGroupId
          }
        ]
        alert: 'ErrorBudgetBurn'
        for: 'PT1H'
        labels: {
          exhaustion: '2w'
          kubernetes_namespace: 'somedomain'
          long: '1d'
          severity: 'warning'
          short: '2h'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        expression: 'nservicebus_messaging:burnrate2h{kubernetes_namespace="somedomain",slo="somedomain-sb-failures-uat"} > (2 * (1-0.85)) and nservicebus_messaging:burnrate1d{kubernetes_namespace="somedomain",slo="somedomain-sb-failures-uat"} > (2 * (1-0.85))'
      }
      // Alert: 6h and 4d burn rates both exceed 1 * (1-0.85), held for 3 hours.
      {
        severity: 3
        resolveConfiguration: {
          autoResolved: true
          timeToResolve: 'PT10M'
        }
        actions: [
          {
            actionGroupId: actionGroupId
          }
        ]
        alert: 'ErrorBudgetBurn'
        for: 'PT3H'
        labels: {
          exhaustion: '4w'
          kubernetes_namespace: 'somedomain'
          long: '4d'
          severity: 'warning'
          short: '6h'
          slo: 'somedomain-sb-failures-uat'
          team: 'somedomain'
        }
        expression: 'nservicebus_messaging:burnrate6h{kubernetes_namespace="somedomain",slo="somedomain-sb-failures-uat"} > (1 * (1-0.85)) and nservicebus_messaging:burnrate4d{kubernetes_namespace="somedomain",slo="somedomain-sb-failures-uat"} > (1 * (1-0.85))'
      }
    ]
  }
}