From 4c0d311a37906ea32f2b49e8a64ac482bd1ec50e Mon Sep 17 00:00:00 2001
From: Marco Pracucci
Date: Thu, 26 May 2022 11:48:19 +0200
Subject: [PATCH 1/3] Add max query length error to errors catalogue

Signed-off-by: Marco Pracucci
---
 integration/query_frontend_test.go         |  3 ++-
 operations/mimir-mixin/docs/playbooks.md   | 15 +++++++++++++++
 pkg/frontend/querymiddleware/limits.go     |  2 +-
 .../querymiddleware/querysharding_test.go   |  5 +++--
 pkg/querier/querier.go                      |  3 +--
 pkg/querier/querier_test.go                 |  4 ++--
 pkg/util/globalerror/errors.go              |  2 ++
 pkg/util/validation/errors.go               |  7 +++++++
 pkg/util/validation/errors_test.go          |  6 ++++++
 pkg/util/validation/limits.go               | 18 +++++++++---------
 pkg/util/validation/validate.go             |  3 ---
 11 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/integration/query_frontend_test.go b/integration/query_frontend_test.go
index f982b2ed4b5..f910d73266f 100644
--- a/integration/query_frontend_test.go
+++ b/integration/query_frontend_test.go
@@ -31,6 +31,7 @@ import (
 
 	"github.com/grafana/mimir/integration/ca"
 	"github.com/grafana/mimir/integration/e2emimir"
+	"github.com/grafana/mimir/pkg/util/validation"
 )
 
 type queryFrontendTestConfig struct {
@@ -492,7 +493,7 @@ overrides:
 				return c.QueryRangeRaw(`sum_over_time(metric[31d:1s])`, now.Add(-time.Minute), now, time.Minute)
 			},
 			expStatusCode: http.StatusUnprocessableEntity,
-			expBody:       `{"error":"expanding series: the query time range exceeds the limit (query length: 744h6m0s, limit: 720h0m0s)", "errorType":"execution", "status":"error"}`,
+			expBody:       fmt.Sprintf(`{"error":"expanding series: %s", "errorType":"execution", "status":"error"}`, validation.NewMaxQueryLengthError((744*time.Hour)+(6*time.Minute), 720*time.Hour)),
 		},
 		{
 			name: "execution error",
diff --git a/operations/mimir-mixin/docs/playbooks.md b/operations/mimir-mixin/docs/playbooks.md
index 586d67f139d..8bfb81cbf2b 100644
--- a/operations/mimir-mixin/docs/playbooks.md
+++ b/operations/mimir-mixin/docs/playbooks.md
@@ -1282,6 +1282,21 @@ How to **fix** it:
 - Consider reducing the time range and/or cardinality of the query. To reduce the cardinality of the query, you can add more label matchers to the query, restricting the set of matching series.
 - Consider increasing the per-tenant limit by using the `-querier.max-fetched-chunk-bytes-per-query` option (or `max_fetched_chunk_bytes_per_query` in the runtime configuration).
 
+### err-mimir-max-query-length
+
+This error occurs when the time range of a query exceeds the configured maximum length.
+
+Both PromQL instant and range queries can fetch metrics data over a period of time.
+A [range query](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) requires a `start` and `end` timestamp, so the difference of `end` minus `start` is the time range length of the query.
+An [instant query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries) requires a `time` parameter and the query is executed fetching samples at that point in time.
+However, even an instant query can fetch metrics data over a period of time using the [range vector selectors](https://prometheus.io/docs/prometheus/latest/querying/basics/#range-vector-selectors).
+For example, the instant query `sum(rate(http_requests_total{job="prometheus"}[1h]))` fetches metrics over a 1 hour period.
+This time period is what Grafana Mimir calls the "query time range length" (or simply "query length").
+
+Mimir has a limit on the query length.
+This limit is applied on the partial queries, after they've split by time by the query-frontend, and is used to protect the system’s stability from potential abuse or mistakes.
+You can configure the limit on a per-tenant basis by using the `-store.max-query-length` option (or `max_query_length` in the runtime configuration).
+
 ## Mimir routes by path
 
 **Write path**:
diff --git a/pkg/frontend/querymiddleware/limits.go b/pkg/frontend/querymiddleware/limits.go
index 3b29e0ede6b..14690efb730 100644
--- a/pkg/frontend/querymiddleware/limits.go
+++ b/pkg/frontend/querymiddleware/limits.go
@@ -112,7 +112,7 @@ func (l limitsMiddleware) Do(ctx context.Context, r Request) (Response, error) {
 	if maxQueryLength := validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, l.MaxQueryLength); maxQueryLength > 0 {
 		queryLen := timestamp.Time(r.GetEnd()).Sub(timestamp.Time(r.GetStart()))
 		if queryLen > maxQueryLength {
-			return nil, apierror.Newf(apierror.TypeBadData, validation.ErrQueryTooLong, queryLen, maxQueryLength)
+			return nil, apierror.New(apierror.TypeBadData, validation.NewMaxQueryLengthError(queryLen, maxQueryLength).Error())
 		}
 	}
 
diff --git a/pkg/frontend/querymiddleware/querysharding_test.go b/pkg/frontend/querymiddleware/querysharding_test.go
index cdafcb7a956..f8a16bf9184 100644
--- a/pkg/frontend/querymiddleware/querysharding_test.go
+++ b/pkg/frontend/querymiddleware/querysharding_test.go
@@ -40,6 +40,7 @@ import (
 	"github.com/grafana/mimir/pkg/mimirpb"
 	"github.com/grafana/mimir/pkg/storage/sharding"
 	"github.com/grafana/mimir/pkg/util"
+	"github.com/grafana/mimir/pkg/util/validation"
 )
 
 var (
@@ -1142,7 +1143,7 @@ func TestQuerySharding_ShouldReturnErrorInCorrectFormat(t *testing.T) {
 			return nil, httpgrpc.ErrorFromHTTPResponse(&httpgrpc.HTTPResponse{Code: http.StatusInternalServerError, Body: []byte("fatal queryable error")})
 		})
 		queryablePrometheusExecErr = storage.QueryableFunc(func(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
-			return nil, apierror.New(apierror.TypeExec, "expanding series: the query time range exceeds the limit (query length: 744h6m0s, limit: 720h0m0s")
+			return nil, apierror.Newf(apierror.TypeExec, "expanding series: %s", validation.NewMaxQueryLengthError(744*time.Hour, 720*time.Hour))
 		})
 		queryable = storageSeriesQueryable([]*promql.StorageSeries{
 			newSeries(labels.Labels{{Name: "__name__", Value: "bar1"}}, start.Add(-lookbackDelta), end, step, factor(5)),
@@ -1194,7 +1195,7 @@
 			engineDownstream: engine,
 			engineSharding:   engineSampleLimit,
 			queryable:        queryablePrometheusExecErr,
-			expError:         apierror.New(apierror.TypeExec, "expanding series: the query time range exceeds the limit (query length: 744h6m0s, limit: 720h0m0s"),
+			expError:         apierror.Newf(apierror.TypeExec, "expanding series: %s", validation.NewMaxQueryLengthError(744*time.Hour, 720*time.Hour)),
 		},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
diff --git a/pkg/querier/querier.go b/pkg/querier/querier.go
index d1ce26d5c6e..018f2f0f8a4 100644
--- a/pkg/querier/querier.go
+++ b/pkg/querier/querier.go
@@ -286,8 +286,7 @@ func (q querier) Select(_ bool, sp *storage.SelectHints, matchers ...*labels.Mat
 
 	// Validate query time range.
 	if maxQueryLength := q.limits.MaxQueryLength(userID); maxQueryLength > 0 && endTime.Sub(startTime) > maxQueryLength {
-		limitErr := validation.LimitError(fmt.Sprintf(validation.ErrQueryTooLong, endTime.Sub(startTime), maxQueryLength))
-		return storage.ErrSeriesSet(limitErr)
+		return storage.ErrSeriesSet(validation.NewMaxQueryLengthError(endTime.Sub(startTime), maxQueryLength))
 	}
 
 	if len(q.queriers) == 1 {
diff --git a/pkg/querier/querier_test.go b/pkg/querier/querier_test.go
index 2795342fbb8..85673a48392 100644
--- a/pkg/querier/querier_test.go
+++ b/pkg/querier/querier_test.go
@@ -471,13 +471,13 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLength(t *testing.T) {
 			query:          "rate(foo[31d])",
 			queryStartTime: time.Now().Add(-time.Hour),
 			queryEndTime:   time.Now(),
-			expected:       errors.New("expanding series: the query time range exceeds the limit (query length: 745h0m0s, limit: 720h0m0s)"),
+			expected:       errors.Errorf("expanding series: %s", validation.NewMaxQueryLengthError(745*time.Hour, 720*time.Hour)),
 		},
 		"should forbid query on large time range over the limit and short rate time window": {
 			query:          "rate(foo[1m])",
 			queryStartTime: time.Now().Add(-maxQueryLength).Add(-time.Hour),
 			queryEndTime:   time.Now(),
-			expected:       errors.New("expanding series: the query time range exceeds the limit (query length: 721h1m0s, limit: 720h0m0s)"),
+			expected:       errors.Errorf("expanding series: %s", validation.NewMaxQueryLengthError((721*time.Hour)+time.Minute, 720*time.Hour)),
 		},
 	}
diff --git a/pkg/util/globalerror/errors.go b/pkg/util/globalerror/errors.go
index 13a20834875..df6478833bc 100644
--- a/pkg/util/globalerror/errors.go
+++ b/pkg/util/globalerror/errors.go
@@ -43,6 +43,8 @@ const (
 	MetricMetadataMetricNameTooLong ID = "metric-name-too-long"
 	MetricMetadataHelpTooLong       ID = "help-too-long"
 	MetricMetadataUnitTooLong       ID = "unit-too-long"
+
+	MaxQueryLength ID = "max-query-length"
 )
 
 // Message returns the provided msg, appending the error id.
diff --git a/pkg/util/validation/errors.go b/pkg/util/validation/errors.go
index eb009a215a2..1f8e2926c4b 100644
--- a/pkg/util/validation/errors.go
+++ b/pkg/util/validation/errors.go
@@ -8,6 +8,7 @@ package validation
 import (
 	"fmt"
 	"strings"
+	"time"
 
 	"github.com/prometheus/common/model"
 
@@ -268,6 +269,12 @@ func newMetadataUnitTooLongError(metadata *mimirpb.MetricMetadata) ValidationErr
 	}
 }
 
+func NewMaxQueryLengthError(actualQueryLen, maxQueryLength time.Duration) LimitError {
+	return LimitError(globalerror.MaxQueryLength.MessageWithLimitConfig(
+		maxQueryLengthFlag,
+		fmt.Sprintf("the query time range exceeds the limit (query length: %s, limit: %s)", actualQueryLen, maxQueryLength)))
+}
+
 // formatLabelSet formats label adapters as a metric name with labels, while preserving
 // label order, and keeping duplicates. If there are multiple "__name__" labels, only
 // first one is used as metric name, other ones will be included as regular labels.
diff --git a/pkg/util/validation/errors_test.go b/pkg/util/validation/errors_test.go
index a32d897000e..7cf2879ca28 100644
--- a/pkg/util/validation/errors_test.go
+++ b/pkg/util/validation/errors_test.go
@@ -4,6 +4,7 @@ package validation
 
 import (
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/assert"
 
@@ -29,3 +30,8 @@ func TestNewMetadataUnitTooLongError(t *testing.T) {
 	err := newMetadataUnitTooLongError(&mimirpb.MetricMetadata{MetricFamilyName: "test_metric", Unit: "counter", Help: "This is a test metric."})
 	assert.Equal(t, "received a metric metadata whose unit name length exceeds the limit, unit: 'counter' metric name: 'test_metric' (err-mimir-unit-too-long). You can adjust the related per-tenant limit by configuring -validation.max-metadata-length, or by contacting your service administrator.", err.Error())
 }
+
+func TestNewMaxQueryLengthError(t *testing.T) {
+	err := NewMaxQueryLengthError(time.Hour, time.Minute)
+	assert.Equal(t, "the query time range exceeds the limit (query length: 1h0m0s, limit: 1m0s) (err-mimir-max-query-length). You can adjust the related per-tenant limit by configuring -store.max-query-length, or by contacting your service administrator.", err.Error())
+}
diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go
index e7135b051db..54b48d98cd5 100644
--- a/pkg/util/validation/limits.go
+++ b/pkg/util/validation/limits.go
@@ -22,19 +22,19 @@ import (
 )
 
 const (
-	MaxSeriesPerMetricFlag    = "ingester.max-global-series-per-metric"
-	MaxMetadataPerMetricFlag  = "ingester.max-global-metadata-per-metric"
-	MaxSeriesPerUserFlag      = "ingester.max-global-series-per-user"
-	MaxMetadataPerUserFlag    = "ingester.max-global-metadata-per-user"
-	MaxChunksPerQueryFlag     = "querier.max-fetched-chunks-per-query"
-	MaxChunkBytesPerQueryFlag = "querier.max-fetched-chunk-bytes-per-query"
-	MaxSeriesPerQueryFlag     = "querier.max-fetched-series-per-query"
-
+	MaxSeriesPerMetricFlag     = "ingester.max-global-series-per-metric"
+	MaxMetadataPerMetricFlag   = "ingester.max-global-metadata-per-metric"
+	MaxSeriesPerUserFlag       = "ingester.max-global-series-per-user"
+	MaxMetadataPerUserFlag     = "ingester.max-global-metadata-per-user"
+	MaxChunksPerQueryFlag      = "querier.max-fetched-chunks-per-query"
+	MaxChunkBytesPerQueryFlag  = "querier.max-fetched-chunk-bytes-per-query"
+	MaxSeriesPerQueryFlag      = "querier.max-fetched-series-per-query"
 	maxLabelNamesPerSeriesFlag = "validation.max-label-names-per-series"
 	maxLabelNameLengthFlag     = "validation.max-length-label-name"
 	maxLabelValueLengthFlag    = "validation.max-length-label-value"
 	maxMetadataLengthFlag      = "validation.max-metadata-length"
 	creationGracePeriodFlag    = "validation.create-grace-period"
+	maxQueryLengthFlag         = "store.max-query-length"
 )
 
 // LimitError are errors that do not comply with the limits specified.
@@ -171,7 +171,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and long-term storage. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.")
 	f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from each ingesters and storage. This limit is enforced in the querier and ruler. 0 to disable")
 	f.IntVar(&l.MaxFetchedChunkBytesPerQuery, MaxChunkBytesPerQueryFlag, 0, "The maximum size of all chunks in bytes that a query can fetch from each ingester and storage. This limit is enforced in the querier and ruler. 0 to disable.")
-	f.Var(&l.MaxQueryLength, "store.max-query-length", "Limit the query time range (end - start time). This limit is enforced in the query-frontend (on the received query), in the querier (on the query possibly split by the query-frontend) and ruler. 0 to disable.")
+	f.Var(&l.MaxQueryLength, maxQueryLengthFlag, "Limit the query time range (end - start time). This limit is enforced in the query-frontend (on the received query), in the querier (on the query possibly split by the query-frontend) and ruler. 0 to disable.")
 	f.Var(&l.MaxQueryLookback, "querier.max-query-lookback", "Limit how long back data (series and metadata) can be queried, up until duration ago. This limit is enforced in the query-frontend, querier and ruler. If the requested time range is outside the allowed range, the request will not fail but will be manipulated to only query data within the allowed time range. 0 to disable.")
 	f.IntVar(&l.MaxQueryParallelism, "querier.max-query-parallelism", 14, "Maximum number of split (by time) or partial (by shard) queries that will be scheduled in parallel by the query-frontend for a single input query. This limit is introduced to have a fairer query scheduling and avoid a single query over a large time range saturating all available queriers.")
 	f.Var(&l.MaxLabelsQueryLength, "store.max-labels-query-length", "Limit the time range (end - start time) of series, label names and values queries. This limit is enforced in the querier. If the requested time range is outside the allowed range, the request will not fail but will be manipulated to only query data within the allowed time range. 0 to disable.")
diff --git a/pkg/util/validation/validate.go b/pkg/util/validation/validate.go
index 33c91861526..49d700a1ee8 100644
--- a/pkg/util/validation/validate.go
+++ b/pkg/util/validation/validate.go
@@ -24,9 +24,6 @@ import (
 const (
 	discardReasonLabel = "reason"
 
-	// ErrQueryTooLong is used in chunk store, querier and query frontend.
-	ErrQueryTooLong = "the query time range exceeds the limit (query length: %s, limit: %s)"
-
 	// RateLimited is one of the values for the reason to discard samples.
 	// Declared here to avoid duplication in ingester and distributor.
 	RateLimited = "rate_limited"

From 2984f5a892786a40aa6bb913a4df688cbb966eda Mon Sep 17 00:00:00 2001
From: Marco Pracucci
Date: Thu, 26 May 2022 11:49:42 +0200
Subject: [PATCH 2/3] Added PR number to CHANGELOG entry

Signed-off-by: Marco Pracucci
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eeb86c71d60..20cf451b516 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,7 @@
   - `-querier.query-store-after`
 * [CHANGE] Config flag category overrides can be set dynamically at runtime. #1934
 * [ENHANCEMENT] Store-gateway: Add the experimental ability to run requests in a dedicated OS thread pool. This feature can be configured using `-store-gateway.thread-pool-size` and is disabled by default. Replaces the ability to run index header operations in a dedicated thread pool. #1660 #1812
-* [ENHANCEMENT] Improved error messages to make them easier to understand and referencing a unique global identifier that can be looked up in the runbooks. #1907 #1919 #1888
+* [ENHANCEMENT] Improved error messages to make them easier to understand and referencing a unique global identifier that can be looked up in the runbooks. #1907 #1919 #1888 #1939
 * [ENHANCEMENT] Memberlist KV: incoming messages are now processed on per-key goroutine. This may reduce loss of "maintanance" packets in busy memberlist installations, but use more CPU. New `memberlist_client_received_broadcasts_dropped_total` counter tracks number of dropped per-key messages. #1912
 * [ENHANCEMENT] Blocks Storage, Alertmanager, Ruler: add support a prefix to the bucket store (`*_storage.storage_prefix`). This enables using the same bucket for the three components. #1686
 * [BUGFIX] Fix regexp parsing panic for regexp label matchers with start/end quantifiers. #1883

From 228251f36577267b79a4105f0b52d7fd0a01352d Mon Sep 17 00:00:00 2001
From: Marco Pracucci
Date: Mon, 30 May 2022 12:49:45 +0200
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: Ursula Kallio
---
 CHANGELOG.md                             | 2 +-
 operations/mimir-mixin/docs/playbooks.md | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 20cf451b516..6d668321ffc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,7 @@
   - `-querier.query-store-after`
 * [CHANGE] Config flag category overrides can be set dynamically at runtime. #1934
 * [ENHANCEMENT] Store-gateway: Add the experimental ability to run requests in a dedicated OS thread pool. This feature can be configured using `-store-gateway.thread-pool-size` and is disabled by default. Replaces the ability to run index header operations in a dedicated thread pool. #1660 #1812
-* [ENHANCEMENT] Improved error messages to make them easier to understand and referencing a unique global identifier that can be looked up in the runbooks. #1907 #1919 #1888 #1939
+* [ENHANCEMENT] Improved error messages to make them easier to understand; each now has a unique global identifier that you can look up in the runbooks for more information. #1907 #1919 #1888 #1939
 * [ENHANCEMENT] Memberlist KV: incoming messages are now processed on per-key goroutine. This may reduce loss of "maintanance" packets in busy memberlist installations, but use more CPU. New `memberlist_client_received_broadcasts_dropped_total` counter tracks number of dropped per-key messages. #1912
 * [ENHANCEMENT] Blocks Storage, Alertmanager, Ruler: add support a prefix to the bucket store (`*_storage.storage_prefix`). This enables using the same bucket for the three components. #1686
 * [BUGFIX] Fix regexp parsing panic for regexp label matchers with start/end quantifiers. #1883
diff --git a/operations/mimir-mixin/docs/playbooks.md b/operations/mimir-mixin/docs/playbooks.md
index 8bfb81cbf2b..e253f9cfb34 100644
--- a/operations/mimir-mixin/docs/playbooks.md
+++ b/operations/mimir-mixin/docs/playbooks.md
@@ -1289,12 +1289,12 @@ This error occurs when the time range of a query exceeds the configured maximum
 Both PromQL instant and range queries can fetch metrics data over a period of time.
 A [range query](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) requires a `start` and `end` timestamp, so the difference of `end` minus `start` is the time range length of the query.
 An [instant query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries) requires a `time` parameter and the query is executed fetching samples at that point in time.
-However, even an instant query can fetch metrics data over a period of time using the [range vector selectors](https://prometheus.io/docs/prometheus/latest/querying/basics/#range-vector-selectors).
+However, even an instant query can fetch metrics data over a period of time by using the [range vector selectors](https://prometheus.io/docs/prometheus/latest/querying/basics/#range-vector-selectors).
 For example, the instant query `sum(rate(http_requests_total{job="prometheus"}[1h]))` fetches metrics over a 1 hour period.
-This time period is what Grafana Mimir calls the "query time range length" (or simply "query length").
+This time period is what Grafana Mimir calls the _query time range length_ (or _query length_).
 
 Mimir has a limit on the query length.
-This limit is applied on the partial queries, after they've split by time by the query-frontend, and is used to protect the system’s stability from potential abuse or mistakes.
+This limit is applied to partial queries, after the query-frontend has split them by time. This limit protects the system’s stability from potential abuse or mistakes.
 You can configure the limit on a per-tenant basis by using the `-store.max-query-length` option (or `max_query_length` in the runtime configuration).
 
 ## Mimir routes by path
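The playbook text added by this patch series points at two knobs for the limit: the `-store.max-query-length` flag and the `max_query_length` per-tenant override in the runtime configuration. A minimal sketch of such an override is shown below; the tenant ID `tenant-a` and the `1080h` value are illustrative assumptions, not part of the patch.

```yaml
# Illustrative runtime-configuration sketch: raise the query length limit for a
# single tenant. The tenant ID "tenant-a" and the 1080h (45 days) value are
# examples only; tenants not listed here keep the default configured via
# -store.max-query-length.
overrides:
  tenant-a:
    max_query_length: 1080h
```

Queries whose time range still exceeds the configured value are rejected with the `err-mimir-max-query-length` message exercised by `TestNewMaxQueryLengthError` in this patch.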