Skip to content

Commit

Permalink
POC: cost attribution proposal 1.2
Browse files Browse the repository at this point in the history
Signed-off-by: Ying WANG <ying.wang@grafana.com>
  • Loading branch information
ying-jeanne committed Oct 24, 2024
1 parent 5162b25 commit e7bf88c
Show file tree
Hide file tree
Showing 13 changed files with 726 additions and 67 deletions.
55 changes: 55 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -4292,6 +4292,28 @@
"fieldType": "int",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "cost_attribution_labels",
"required": false,
"desc": "List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "validation.cost-attribution-labels",
"fieldType": "string",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_cost_attribution_per_user",
"required": false,
"desc": "Maximum number of cost attribution labels allowed per user.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "validation.max-cost-attribution-per-user",
"fieldType": "int",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "ruler_evaluation_delay_duration",
Expand Down Expand Up @@ -18128,6 +18150,17 @@
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "custom_registry_path",
"required": false,
"desc": "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "custom-registry-path",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "timeseries_unmarshal_caching_optimization_enabled",
Expand All @@ -18138,6 +18171,28 @@
"fieldFlag": "timeseries-unmarshal-caching-optimization-enabled",
"fieldType": "boolean",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "cost_attribution_eviction_interval",
"required": false,
"desc": "Time interval at which inactive cost attributions will be evicted from the cache.",
"fieldValue": null,
"fieldDefaultValue": 1800000000000,
"fieldFlag": "cost-attribution-eviction-interval",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "cost_attribution_cool_down_duration",
"required": false,
"desc": "Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache.",
"fieldValue": null,
"fieldDefaultValue": 1200000000000,
"fieldFlag": "cost-attribution-cool-down-duration",
"fieldType": "duration",
"fieldCategory": "experimental"
}
],
"fieldValue": null,
Expand Down
10 changes: 10 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -1133,6 +1133,12 @@ Usage of ./cmd/mimir/mimir:
Expands ${var} or $var in config according to the values of the environment variables.
-config.file value
Configuration file to load.
-cost-attribution-cool-down-duration duration
[experimental] Duration during which any cost attribution for a user will be marked as __overflow__ after exceeding the specified limit, prior to resetting the cache. (default 20m0s)
-cost-attribution-eviction-interval duration
[experimental] Time interval at which inactive cost attributions will be evicted from the cache. (default 30m0s)
-custom-registry-path string
Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.
-debug.block-profile-rate int
Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable.
-debug.mutex-profile-fraction int
Expand Down Expand Up @@ -3059,10 +3065,14 @@ Usage of ./cmd/mimir/mimir:
Enable anonymous usage reporting. (default true)
-usage-stats.installation-mode string
Installation mode. Supported values: custom, helm, jsonnet. (default "custom")
-validation.cost-attribution-labels comma-separated-list-of-strings
[experimental] List of labels used to define the cost attribution. This label will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_received_samples_total, cortex_ingester_active_series and cortex_discarded_samples_attribution_total. Set to an empty string to disable cost attribution.
-validation.create-grace-period duration
Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m)
-validation.enforce-metadata-metric-name
Enforce every metadata has a metric name. (default true)
-validation.max-cost-attribution-per-user int
[experimental] Maximum number of cost attribution labels allowed per user.
-validation.max-label-names-per-series int
Maximum number of label names per series. (default 30)
-validation.max-length-label-name int
Expand Down
159 changes: 159 additions & 0 deletions pkg/costattribution/manager.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package costattribution

import (
"context"
"sort"
"sync"
"time"

"github.com/go-kit/log"
"github.com/grafana/dskit/services"
"github.com/prometheus/client_golang/prometheus"

"github.com/grafana/mimir/pkg/util/validation"
)

// Sentinel attribution values declared for the costattribution package.
const (
// missingValue is presumably the placeholder used when a tracked label is
// absent from a series — TODO confirm against the tracker; it is not used in
// the visible part of this file.
missingValue = "__missing__"
// overflowValue is presumably the placeholder used once a user exceeds its
// cost attribution limit — TODO confirm against the tracker; it is not used
// in the visible part of this file.
overflowValue = "__overflow__"
)

// Manager keeps one cost attribution Tracker per user ID and, through the
// embedded service created in NewManager, periodically purges inactive
// attributions. It also exposes the trackers' metrics via Collect/Describe.
type Manager struct {
services.Service
logger log.Logger
// inactiveTimeout is passed to purgeInactiveAttributions on every timer iteration.
inactiveTimeout time.Duration
// limits provides the per-user cost attribution labels and maximum cardinality.
limits *validation.Overrides
// cooldownTimeout is stored by NewManager; it is not read in the visible part of this file.
cooldownTimeout time.Duration

// tlock protects the trackersByUserID map
tlock sync.RWMutex
trackersByUserID map[string]*Tracker
}

// NewManager builds a cost attribution Manager and wraps it in a timer service
// named "cost attribution manager".
//
// Once the service is running, every cleanupInterval it invokes the manager's
// iteration callback, which purges attributions using inactiveTimeout as the
// inactivity threshold. cooldownTimeout, logger and limits are stored on the
// returned Manager as given. The tracker map starts empty and the mutex is left
// at its ready-to-use zero value.
func NewManager(cleanupInterval, inactiveTimeout time.Duration, cooldownTimeout time.Duration, logger log.Logger, limits *validation.Overrides) *Manager {
	m := &Manager{
		logger:           logger,
		inactiveTimeout:  inactiveTimeout,
		limits:           limits,
		cooldownTimeout:  cooldownTimeout,
		trackersByUserID: map[string]*Tracker{},
	}

	timer := services.NewTimerService(cleanupInterval, nil, m.iteration, nil)
	m.Service = timer.WithName("cost attribution manager")
	return m
}

// iteration is the callback run by the timer service on every tick. It purges
// attributions using the manager's configured inactiveTimeout and always
// returns nil; the context argument is ignored.
func (m *Manager) iteration(context.Context) error {
	timeout := m.inactiveTimeout
	m.purgeInactiveAttributions(timeout)
	return nil
}

// EnabledForUser reports whether cost attribution is enabled for userID, i.e.
// whether the limits configure at least one cost attribution label for it.
func (m *Manager) EnabledForUser(userID string) bool {
	labels := m.limits.CostAttributionLabel(userID)
	return len(labels) != 0
}

// TrackerForUser returns the cost attribution tracker for userID, creating and
// caching one on first use. It returns nil when cost attribution is not enabled
// for the user.
//
// NOTE(review): the second value returned by newTracker is discarded, so
// whatever tracker value newTracker hands back is cached and returned even if
// construction did not succeed — confirm against newTracker.
func (m *Manager) TrackerForUser(userID string) *Tracker {
	if enabled := m.EnabledForUser(userID); !enabled {
		return nil
	}

	m.tlock.Lock()
	defer m.tlock.Unlock()

	tracker, found := m.trackersByUserID[userID]
	if found {
		return tracker
	}

	// First request for this user: build a tracker from the current limits.
	tracker, _ = newTracker(m.limits.CostAttributionLabel(userID), m.limits.MaxCostAttributionPerUser(userID))
	m.trackersByUserID[userID] = tracker
	return tracker
}

// Collect implements prometheus.Collector by forwarding the output channel to
// every registered tracker. The tracker map is read-locked for the duration of
// the call.
func (m *Manager) Collect(out chan<- prometheus.Metric) {
	m.tlock.RLock()
	defer m.tlock.RUnlock()

	for _, t := range m.trackersByUserID {
		t.Collect(out)
	}
}

// Describe implements prometheus.Collector. It intentionally sends no
// descriptors: the manager is an unchecked collector.
func (m *Manager) Describe(_ chan<- *prometheus.Desc) {}

// deleteUserTracer removes the tracker registered for userID, first asking the
// tracker to clean up its metrics for that user. It is a no-op when the user
// has no tracker. Used when cost attribution is no longer enabled for the user.
func (m *Manager) deleteUserTracer(userID string) {
	m.tlock.Lock()
	defer m.tlock.Unlock()

	tracker, found := m.trackersByUserID[userID]
	if !found {
		return
	}

	// Clean up the tracker's metrics before dropping it from the map.
	tracker.cleanupTracker(userID)
	delete(m.trackersByUserID, userID)
}

// purgeInactiveAttributions walks every user that currently has a tracker and
// either drops the tracker (when cost attribution is no longer enabled for the
// user) or purges the observations that have been inactive for longer than
// inactiveTimeout and cleans up their attribution in the tracker.
func (m *Manager) purgeInactiveAttributions(inactiveTimeout time.Duration) {
	// Snapshot the user IDs so the lock is not held while each user is
	// processed; the per-user helpers below take the lock themselves.
	m.tlock.RLock()
	userIDs := make([]string, 0, len(m.trackersByUserID))
	for userID := range m.trackersByUserID {
		userIDs = append(userIDs, userID)
	}
	m.tlock.RUnlock()

	// A single deadline is used for all users of this run.
	deadline := time.Now().Add(-inactiveTimeout).UnixNano()
	for _, userID := range userIDs {
		// If cost attribution is not enabled for the user, delete the user tracker and continue.
		if len(m.limits.CostAttributionLabel(userID)) == 0 || m.limits.MaxCostAttributionPerUser(userID) <= 0 {
			m.deleteUserTracer(userID)
			continue
		}

		inactiveObs := m.purgeInactiveObservationsForUser(userID, deadline)
		if len(inactiveObs) == 0 {
			continue
		}

		// The map must only be read under the lock: other goroutines write to
		// it while holding tlock, and an unsynchronized map read is a data race.
		m.tlock.RLock()
		tracker := m.trackersByUserID[userID]
		m.tlock.RUnlock()
		if tracker == nil {
			// The tracker was removed (or never successfully created) in the
			// meantime; there is nothing left to clean up.
			continue
		}

		for _, ob := range inactiveObs {
			tracker.cleanupTrackerAttribution(ob.lvalues)
		}
	}
}

// compareStringSlice reports whether a and b have the same length and hold
// equal strings at every index. The comparison is positional, so callers that
// want order-insensitive equality must pass slices sorted the same way. A nil
// slice and an empty slice compare equal.
func compareStringSlice(a, b []string) bool {
	n := len(a)
	if n != len(b) {
		return false
	}
	for i := 0; i < n; i++ {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}

// purgeInactiveObservationsForUser brings the user's tracker in line with the
// current limits and then purges its inactive observations.
//
// If the configured cost attribution labels differ from the ones the tracker
// was built with, the tracker is replaced by a fresh one. Otherwise, if only
// the maximum cardinality changed, the existing tracker is updated in place.
// deadline is forwarded unchanged to the tracker's PurgeInactiveObservations
// (the caller passes a UnixNano timestamp). It returns nil when no tracker is
// available for the user.
func (m *Manager) purgeInactiveObservationsForUser(userID string, deadline int64) []*observation {
	cat := m.TrackerForUser(userID)
	if cat == nil {
		return nil
	}

	// sorted returns a sorted copy so that neither the tracker's slice nor the
	// slice owned by the limits is reordered by the comparison below.
	sorted := func(in []string) []string {
		out := append([]string(nil), in...)
		sort.Strings(out)
		return out
	}

	// Compare the label sets regardless of configuration order. Converting to
	// sort.StringSlice alone does not sort anything, so an order-sensitive
	// comparison against the raw configuration could report a change (and throw
	// away the tracker state) on every run.
	labels := m.limits.CostAttributionLabel(userID)
	if !compareStringSlice(sorted(cat.trackedLabels), sorted(labels)) {
		// The tracked labels changed: simply re-initialize the tracker.
		m.tlock.Lock()
		cat, _ = newTracker(labels, m.limits.MaxCostAttributionPerUser(userID))
		m.trackersByUserID[userID] = cat
		m.tlock.Unlock()

		// newTracker's second return value is discarded, so guard against it
		// not having produced a usable tracker.
		if cat == nil {
			return nil
		}
	} else if maxCardinality := m.limits.MaxCostAttributionPerUser(userID); cat.maxCardinality != maxCardinality {
		// Only the cardinality limit changed: update the tracker in place.
		cat.updateMaxCardinality(maxCardinality)
	}

	return cat.PurgeInactiveObservations(deadline)
}
Loading

0 comments on commit e7bf88c

Please sign in to comment.