diff --git a/CHANGELOG.md b/CHANGELOG.md index 32da81425fa..3309d6dff11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ * [ENHANCEMENT] Ingester: Add support for exporting native histogram cost attribution metrics (`cortex_ingester_attributed_active_native_histogram_series` and `cortex_ingester_attributed_active_native_histogram_buckets`) with labels specified by customers to a custom Prometheus registry. #10892 * [ENHANCEMENT] Store-gateway: Download sparse headers uploaded by compactors. Compactors have to be configured with `-compactor.upload-sparse-index-headers=true` option. #10879 * [ENHANCEMENT] Compactor: Upload block index file and multiple segment files concurrently. Concurrency scales linearly with block size up to `-compactor.max-per-block-upload-concurrency`. #10947 +* [ENHANCEMENT] Ingester: Add per-user `cortex_ingester_tsdb_wal_replay_unknown_refs_total` and `cortex_ingester_tsdb_wbl_replay_unknown_refs_total` metrics to track unknown series references during WAL/WBL replay. #10981 * [BUGFIX] OTLP: Fix response body and Content-Type header to align with spec. #10852 * [BUGFIX] Compactor: fix issue where block becomes permanently stuck when the Compactor's block cleanup job partially deletes a block. #10888 * [BUGFIX] Storage: fix intermittent failures in S3 upload retries. 
#10952 diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index fd8b67b67b1..f233dd4e2a2 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -524,6 +524,9 @@ type tsdbMetrics struct { memSeriesCreatedTotal *prometheus.Desc memSeriesRemovedTotal *prometheus.Desc + tsdbWalReplayUnknownRefsTotal *prometheus.Desc + tsdbWblReplayUnknownRefsTotal *prometheus.Desc + headPostingsForMatchersCacheMetrics *tsdb.PostingsForMatchersCacheMetrics blockPostingsForMatchersCacheMetrics *tsdb.PostingsForMatchersCacheMetrics @@ -707,6 +710,15 @@ func newTSDBMetrics(r prometheus.Registerer, logger log.Logger) *tsdbMetrics { "The total number of series that were removed per user.", []string{"user"}, nil), + tsdbWalReplayUnknownRefsTotal: prometheus.NewDesc( + "cortex_ingester_tsdb_wal_replay_unknown_refs_total", + "Total number of unknown series references encountered during WAL replay.", + []string{"user", "type"}, nil), + tsdbWblReplayUnknownRefsTotal: prometheus.NewDesc( + "cortex_ingester_tsdb_wbl_replay_unknown_refs_total", + "Total number of unknown series references encountered during WBL replay.", + []string{"user", "type"}, nil), + headPostingsForMatchersCacheMetrics: tsdb.NewPostingsForMatchersCacheMetrics(prometheus.WrapRegistererWithPrefix("cortex_ingester_tsdb_head_", r)), blockPostingsForMatchersCacheMetrics: tsdb.NewPostingsForMatchersCacheMetrics(prometheus.WrapRegistererWithPrefix("cortex_ingester_tsdb_block_", r)), } @@ -762,6 +774,9 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) { out <- sm.memSeries out <- sm.memSeriesCreatedTotal out <- sm.memSeriesRemovedTotal + + out <- sm.tsdbWalReplayUnknownRefsTotal + out <- sm.tsdbWblReplayUnknownRefsTotal } func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) { @@ -804,12 +819,12 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfGaugesPerTenant(out, sm.tsdbExemplarSeriesInStorage, "prometheus_tsdb_exemplar_series_with_exemplars_in_storage") 
data.SendSumOfGaugesPerTenant(out, sm.tsdbExemplarLastTs, "prometheus_tsdb_exemplar_last_exemplars_timestamp_seconds") data.SendSumOfCounters(out, sm.tsdbExemplarsOutOfOrder, "prometheus_tsdb_exemplar_out_of_order_exemplars_total") - data.SendSumOfCountersPerTenant(out, sm.tsdbOOOAppendedSamples, "prometheus_tsdb_head_out_of_order_samples_appended_total") - data.SendSumOfGauges(out, sm.memSeries, "prometheus_tsdb_head_series") data.SendSumOfCountersPerTenant(out, sm.memSeriesCreatedTotal, "prometheus_tsdb_head_series_created_total") data.SendSumOfCountersPerTenant(out, sm.memSeriesRemovedTotal, "prometheus_tsdb_head_series_removed_total") + data.SendSumOfCountersPerTenant(out, sm.tsdbWalReplayUnknownRefsTotal, "prometheus_tsdb_wal_replay_unknown_refs_total", dskit_metrics.WithLabels("type")) + data.SendSumOfCountersPerTenant(out, sm.tsdbWblReplayUnknownRefsTotal, "prometheus_tsdb_wbl_replay_unknown_refs_total", dskit_metrics.WithLabels("type")) } func (sm *tsdbMetrics) setRegistryForUser(userID string, registry *prometheus.Registry) { diff --git a/pkg/ingester/metrics_test.go b/pkg/ingester/metrics_test.go index 98364c2e4e4..6e79d07ed6d 100644 --- a/pkg/ingester/metrics_test.go +++ b/pkg/ingester/metrics_test.go @@ -217,7 +217,7 @@ func TestTSDBMetrics(t *testing.T) { # HELP cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total Total number of out-of-order exemplar ingestion failed attempts. # TYPE cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total counter cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total 9 - + # HELP cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage Number of TSDB series with exemplars currently in storage. 
# TYPE cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage gauge cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{user="user1"} 1 @@ -289,6 +289,21 @@ func TestTSDBMetrics(t *testing.T) { cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="max-items-reached"} 0 cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="ttl-expired"} 0 cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="unknown"} 0 + + # HELP cortex_ingester_tsdb_wal_replay_unknown_refs_total Total number of unknown series references encountered during WAL replay. + # TYPE cortex_ingester_tsdb_wal_replay_unknown_refs_total counter + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user1"} 12345 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user1"} 24690 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user2"} 85787 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user2"} 171574 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user3"} 999 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user3"} 1998 + + # HELP cortex_ingester_tsdb_wbl_replay_unknown_refs_total Total number of unknown series references encountered during WBL replay. + # TYPE cortex_ingester_tsdb_wbl_replay_unknown_refs_total counter + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user1"} 12345 + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user2"} 85787 + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user3"} 999 `)) require.NoError(t, err) } @@ -488,7 +503,7 @@ func TestTSDBMetricsWithRemoval(t *testing.T) { # HELP cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total Total number of out-of-order exemplar ingestion failed attempts. 
# TYPE cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total counter cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total 9 - + # HELP cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage Number of TSDB series with exemplars currently in storage. # TYPE cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage gauge cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{user="user1"} 1 @@ -557,6 +572,18 @@ func TestTSDBMetricsWithRemoval(t *testing.T) { cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="max-items-reached"} 0 cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="ttl-expired"} 0 cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="unknown"} 0 + + # HELP cortex_ingester_tsdb_wal_replay_unknown_refs_total Total number of unknown series references encountered during WAL replay. + # TYPE cortex_ingester_tsdb_wal_replay_unknown_refs_total counter + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user1"} 12345 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user1"} 24690 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user2"} 85787 + cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user2"} 171574 + + # HELP cortex_ingester_tsdb_wbl_replay_unknown_refs_total Total number of unknown series references encountered during WBL replay. 
+ # TYPE cortex_ingester_tsdb_wbl_replay_unknown_refs_total counter + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user1"} 12345 + cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user2"} 85787 `)) require.NoError(t, err) } @@ -815,5 +842,18 @@ func populateTSDBMetrics(base float64) *prometheus.Registry { }) chunksMmappedTotal.Add(30 * base) + tsdbWalReplayUnknownRefsTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_replay_unknown_refs_total", + Help: "Total number of unknown series references encountered during WAL replay.", + }, []string{"type"}) + tsdbWalReplayUnknownRefsTotal.WithLabelValues("series").Add(base) + tsdbWalReplayUnknownRefsTotal.WithLabelValues("samples").Add(base * 2) + + tsdbWblReplayUnknownRefsTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wbl_replay_unknown_refs_total", + Help: "Total number of unknown series references encountered during WBL replay.", + }, []string{"type"}) + tsdbWblReplayUnknownRefsTotal.WithLabelValues("exemplars").Add(base) + return r }