Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add for tenant silences limit #6605

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458
* [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526
* [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590
* [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add the PR id?

* [ENHANCEMENT] Add `compactor.auto-forget-delay` for compactor to auto forget compactors after X minutes without heartbeat. #6533
* [ENHANCEMENT] StoreGateway: Emit more histogram buckets on the `cortex_querier_storegateway_refetches_per_query` metric. #6570
* [ENHANCEMENT] Querier: Apply bytes limiter to LabelNames and LabelValuesForLabelNames. #6568
Expand Down
9 changes: 9 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -3751,6 +3751,15 @@ query_rejection:
# CLI flag: -alertmanager.max-alerts-size-bytes
[alertmanager_max_alerts_size_bytes: <int> | default = 0]

# Maximum number of silences that a single user can have, including expired
# silences. 0 = no limit.
# CLI flag: -alertmanager.max-silences-count
[alertmanager_max_silences_count: <int> | default = 0]

# Maximum size of individual silences that a single user can have. 0 = no limit.
# CLI flag: -alertmanager.max-silences-size-bytes
[alertmanager_max_silences_size_bytes: <int> | default = 0]

# list of rule groups to disable
[disabled_rule_groups: <list of DisabledRuleGroup> | default = []]
```
Expand Down
9 changes: 7 additions & 2 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,16 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
am.groupMarker = memMarker

silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot)

am.silences, err = silence.New(silence.Options{
SnapshotFile: silencesFile,
Retention: cfg.Retention,
Logger: util_log.GoKitLogToSlog(log.With(am.logger, "component", "silences")),
Metrics: am.registry,
Limits: silence.Limits{
MaxSilences: func() int { return cfg.Limits.AlertmanagerMaxSilencesCount(cfg.UserID) },
MaxSilenceSizeBytes: func() int { return cfg.Limits.AlertmanagerMaxSilenceSizeBytes(cfg.UserID) },
},
Logger: util_log.GoKitLogToSlog(log.With(am.logger, "component", "silences")),
Metrics: am.registry,
})
if err != nil {
return nil, fmt.Errorf("failed to create silences: %v", err)
Expand Down
70 changes: 70 additions & 0 deletions pkg/alertmanager/alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/go-kit/log"
"github.com/prometheus/alertmanager/config"
"github.com/prometheus/alertmanager/silence/silencepb"
"github.com/prometheus/alertmanager/types"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
Expand All @@ -19,6 +20,75 @@ import (
"github.com/cortexproject/cortex/pkg/util/test"
)

// TestSilencesLimits checks that the silence limits returned by Config.Limits
// are enforced by the Alertmanager's silences store:
//   - the maximum number of silences, which also counts expired silences that
//     have not been garbage-collected yet;
//   - the maximum size of a single silence.
func TestSilencesLimits(t *testing.T) {
	user := "test"

	reg := prometheus.NewPedanticRegistry()
	maxSilencesCount := 3
	maxSilencesSizeBytes := 500
	am, err := New(&Config{
		UserID:          user,
		Logger:          log.NewNopLogger(),
		Limits:          &mockAlertManagerLimits{maxSilencesCount: maxSilencesCount, maxSilencesSizeBytes: maxSilencesSizeBytes},
		TenantDataDir:   t.TempDir(),
		ExternalURL:     &url.URL{Path: "/am"},
		ShardingEnabled: false,
		GCInterval:      30 * time.Minute,
	}, reg)
	require.NoError(t, err)
	defer am.StopAndWait()

	t.Run("Test maxSilencesCount", func(t *testing.T) {
		// newSilence builds a small, valid silence that is active for 30 minutes.
		newSilence := func() *silencepb.Silence {
			now := time.Now()
			return &silencepb.Silence{
				Matchers: []*silencepb.Matcher{{Name: "name", Pattern: "pattern"}},
				StartsAt: now,
				EndsAt:   now.Add(30 * time.Minute),
			}
		}
		limitErr := fmt.Sprintf("exceeded maximum number of silences: %d (limit: %d)", maxSilencesCount, maxSilencesCount)

		// Create silences up to maxSilencesCount.
		for i := 0; i < maxSilencesCount; i++ {
			require.NoError(t, am.silences.Set(newSilence()))
		}

		// One more silence exceeds the limit.
		require.EqualError(t, am.silences.Set(newSilence()), limitErr)

		// Expire all existing silences.
		silences, _, err := am.silences.Query()
		require.NoError(t, err)
		for _, s := range silences {
			require.NoError(t, am.silences.Expire(s.Id))
		}

		// Expired silences still count towards maxSilencesCount.
		require.EqualError(t, am.silences.Set(newSilence()), limitErr)

		// GC is expected to remove every expired silence.
		n, err := am.silences.GC()
		require.NoError(t, err)
		require.Equal(t, maxSilencesCount, n)
	})

	t.Run("Test maxSilencesSizeBytes", func(t *testing.T) {
		// Matcher name and pattern together are longer than maxSilencesSizeBytes,
		// so the silence is over the size limit.
		now := time.Now()
		fieldLen := maxSilencesSizeBytes/2 + 1
		oversized := &silencepb.Silence{
			Matchers: []*silencepb.Matcher{{Name: strings.Repeat("a", fieldLen), Pattern: strings.Repeat("b", fieldLen)}},
			StartsAt: now,
			EndsAt:   now.Add(30 * time.Minute),
		}

		err := am.silences.Set(oversized)
		require.Error(t, err)
		require.Contains(t, err.Error(), "silence exceeded maximum size")
	})
}

func TestDispatcherGroupLimits(t *testing.T) {
for name, tc := range map[string]struct {
groups int
Expand Down
6 changes: 6 additions & 0 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,12 @@ type Limits interface {
// AlertmanagerMaxAlertsSizeBytes returns total max size of alerts that tenant can have active at the same time. 0 = no limit.
// Size of the alert is computed from alert labels, annotations and generator URL.
AlertmanagerMaxAlertsSizeBytes(tenant string) int

// AlertmanagerMaxSilencesCount returns max number of silences that tenant can have, including expired silences. 0 = no limit.
AlertmanagerMaxSilencesCount(tenant string) int

// AlertmanagerMaxSilenceSizeBytes returns the maximum size of an individual silence. 0 = no limit.
AlertmanagerMaxSilenceSizeBytes(tenant string) int
}

// A MultitenantAlertmanager manages Alertmanager instances for multiple
Expand Down
26 changes: 24 additions & 2 deletions pkg/alertmanager/multitenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1777,8 +1777,14 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) {
amConfig.ShardingEnabled = true
}

var limits validation.Limits
flagext.DefaultValues(&limits)

overrides, err := validation.NewOverrides(limits, nil)
require.NoError(t, err)

reg := prometheus.NewPedanticRegistry()
am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg)
am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, overrides, log.NewNopLogger(), reg)
require.NoError(t, err)
defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck

Expand Down Expand Up @@ -1969,8 +1975,14 @@ func TestAlertmanager_StateReplicationWithSharding_InitialSyncFromPeers(t *testi

amConfig.ShardingEnabled = true

var limits validation.Limits
flagext.DefaultValues(&limits)

overrides, err := validation.NewOverrides(limits, nil)
require.NoError(t, err)

reg := prometheus.NewPedanticRegistry()
am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg)
am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, overrides, log.NewNopLogger(), reg)
require.NoError(t, err)

clientPool.setServer(amConfig.ShardingRing.InstanceAddr+":0", am)
Expand Down Expand Up @@ -2285,6 +2297,8 @@ type mockAlertManagerLimits struct {
maxDispatcherAggregationGroups int
maxAlertsCount int
maxAlertsSizeBytes int
maxSilencesCount int
maxSilencesSizeBytes int
}

func (m *mockAlertManagerLimits) AlertmanagerMaxConfigSize(tenant string) int {
Expand Down Expand Up @@ -2326,3 +2340,11 @@ func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsCount(_ string) int {
func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsSizeBytes(_ string) int {
return m.maxAlertsSizeBytes
}

// AlertmanagerMaxSilencesCount returns the mock's configured maxSilencesCount.
// The tenant argument is ignored.
func (m *mockAlertManagerLimits) AlertmanagerMaxSilencesCount(_ string) int {
	count := m.maxSilencesCount
	return count
}

// AlertmanagerMaxSilenceSizeBytes returns the mock's configured
// maxSilencesSizeBytes. The tenant argument is ignored.
func (m *mockAlertManagerLimits) AlertmanagerMaxSilenceSizeBytes(_ string) int {
	sizeBytes := m.maxSilencesSizeBytes
	return sizeBytes
}
12 changes: 12 additions & 0 deletions pkg/util/validation/limits.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ type Limits struct {
AlertmanagerMaxDispatcherAggregationGroups int `yaml:"alertmanager_max_dispatcher_aggregation_groups" json:"alertmanager_max_dispatcher_aggregation_groups"`
AlertmanagerMaxAlertsCount int `yaml:"alertmanager_max_alerts_count" json:"alertmanager_max_alerts_count"`
AlertmanagerMaxAlertsSizeBytes int `yaml:"alertmanager_max_alerts_size_bytes" json:"alertmanager_max_alerts_size_bytes"`
AlertmanagerMaxSilencesCount int `yaml:"alertmanager_max_silences_count" json:"alertmanager_max_silences_count"`
AlertmanagerMaxSilencesSizeBytes int `yaml:"alertmanager_max_silences_size_bytes" json:"alertmanager_max_silences_size_bytes"`
DisabledRuleGroups DisabledRuleGroups `yaml:"disabled_rule_groups" json:"disabled_rule_groups" doc:"nocli|description=list of rule groups to disable"`
}

Expand Down Expand Up @@ -310,6 +312,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
f.IntVar(&l.AlertmanagerMaxDispatcherAggregationGroups, "alertmanager.max-dispatcher-aggregation-groups", 0, "Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxAlertsCount, "alertmanager.max-alerts-count", 0, "Maximum number of alerts that a single user can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxAlertsSizeBytes, "alertmanager.max-alerts-size-bytes", 0, "Maximum total size of alerts that a single user can have, alert size is the sum of the bytes of its labels, annotations and generatorURL. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxSilencesCount, "alertmanager.max-silences-count", 0, "Maximum number of silences that a single user can have, including expired silences. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxSilencesSizeBytes, "alertmanager.max-silences-size-bytes", 0, "Maximum size of individual silences that a single user can have. 0 = no limit.")
}

// Validate the limits config and returns an error if the validation
Expand Down Expand Up @@ -971,6 +975,14 @@ func (o *Overrides) AlertmanagerMaxAlertsSizeBytes(userID string) int {
return o.GetOverridesForUser(userID).AlertmanagerMaxAlertsSizeBytes
}

// AlertmanagerMaxSilencesCount returns the AlertmanagerMaxSilencesCount limit
// from the limits resolved for userID: the max number of silences a tenant can
// have, including expired silences. 0 = no limit.
func (o *Overrides) AlertmanagerMaxSilencesCount(userID string) int {
	limits := o.GetOverridesForUser(userID)
	return limits.AlertmanagerMaxSilencesCount
}

// AlertmanagerMaxSilenceSizeBytes returns the maximum size of an individual
// silence for userID. 0 = no limit.
//
// Note the naming: the method says "Silence" (singular) while the backing
// Limits field is AlertmanagerMaxSilencesSizeBytes (plural).
func (o *Overrides) AlertmanagerMaxSilenceSizeBytes(userID string) int {
	limits := o.GetOverridesForUser(userID)
	return limits.AlertmanagerMaxSilencesSizeBytes
}

func (o *Overrides) DisabledRuleGroups(userID string) DisabledRuleGroups {
if o.tenantLimits != nil {
l := o.tenantLimits.ByUserID(userID)
Expand Down
Loading