-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
89256: sql: output RU estimate for EXPLAIN ANALYZE on tenants r=DrewKimball a=DrewKimball **sql: surface query request units consumed by network egress** This commit adds a top-level field to the output of `EXPLAIN ANALYZE` that shows the estimated number of RUs that would be consumed due to network egress to the client. The estimate is obtained by measuring the in-memory size of the query result, and passing that to the tenant cost config's `PGWireEgressCost` method. **sql: surface query request units consumed due to cpu usage** This commit adds the ability for clients to estimate the number of RUs consumed by a query due to CPU usage. This is accomplished by keeping a moving average of the CPU usage for the entire tenant process, then using that to obtain an estimate for what the CPU usage *would* be if the query wasn't running. This is then compared against the actual measured CPU usage during the query's execution to get the estimate. For local flows this is done at the `connExecutor` level; for remote flows this is handled by the last outbox on the node (which gathers and sends the flow's metadata). The resulting RU estimate is added to the existing estimate from network egress and displayed in the output of `EXPLAIN ANALYZE`. **sql: surface query request units consumed by IO** This commit adds tracking for request units consumed by IO operations for all execution operators that perform KV operations. The corresponding RU count is recorded in the span and later aggregated with the RU consumption due to network egress and CPU usage. The resulting query RU consumption estimate is visible in the output of `EXPLAIN ANALYZE`. **multitenantccl: add sanity testing for ru estimation** This commit adds a sanity test for the RU estimates produced by running queries with `EXPLAIN ANALYZE` on a tenant. The test runs each test query several times with `EXPLAIN ANALYZE`, then runs all test queries without `EXPLAIN ANALYZE` and compares the resulting actual RU measurement to the aggregated estimates. Informs #74441 Release note (sql change): Added an estimate for the number of request units consumed by a query to the output of `EXPLAIN ANALYZE` for tenant sessions. Co-authored-by: Drew Kimball <drewk@cockroachlabs.com>
- Loading branch information
Showing
37 changed files
with
678 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
193 changes: 193 additions & 0 deletions
193
pkg/ccl/multitenantccl/tenantcostclient/query_ru_estimate_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
// Copyright 2022 The Cockroach Authors. | ||
// | ||
// Licensed as a CockroachDB Enterprise file under the Cockroach Community | ||
// License (the "License"); you may not use this file except in compliance with | ||
// the License. You may obtain a copy of the License at | ||
// | ||
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt | ||
|
||
package tenantcostclient_test | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"strconv" | ||
"strings" | ||
"testing" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/base" | ||
_ "github.com/cockroachdb/cockroach/pkg/ccl" // ccl init hooks | ||
_ "github.com/cockroachdb/cockroach/pkg/ccl/kvccl/kvtenantccl" | ||
"github.com/cockroachdb/cockroach/pkg/ccl/multitenantccl/tenantcostclient" | ||
_ "github.com/cockroachdb/cockroach/pkg/ccl/multitenantccl/tenantcostserver" | ||
"github.com/cockroachdb/cockroach/pkg/roachpb" | ||
"github.com/cockroachdb/cockroach/pkg/settings/cluster" | ||
"github.com/cockroachdb/cockroach/pkg/sql/stats" | ||
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils" | ||
"github.com/cockroachdb/cockroach/pkg/testutils/skip" | ||
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" | ||
"github.com/cockroachdb/cockroach/pkg/util/leaktest" | ||
"github.com/cockroachdb/cockroach/pkg/util/log" | ||
"github.com/cockroachdb/cockroach/pkg/util/protoutil" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
// TestEstimateQueryRUConsumption is a sanity check for the RU estimates | ||
// produced for queries that are run by a tenant under EXPLAIN ANALYZE. The RU | ||
// consumption of a query is not deterministic, since it depends on inexact | ||
// quantities like the (already estimated) CPU usage. Therefore, the test runs | ||
// each query multiple times and then checks that the total estimated RU | ||
// consumption is within reasonable distance from the actual measured RUs for | ||
// the tenant. | ||
func TestEstimateQueryRUConsumption(t *testing.T) { | ||
defer leaktest.AfterTest(t)() | ||
defer log.Scope(t).Close(t) | ||
|
||
// This test becomes flaky when the machine/cluster is under significant | ||
// background load, so it should only be run manually. | ||
skip.IgnoreLint(t, "intended to be manually run as a sanity test") | ||
|
||
ctx := context.Background() | ||
|
||
st := cluster.MakeTestingClusterSettings() | ||
stats.AutomaticStatisticsClusterMode.Override(ctx, &st.SV, false) | ||
stats.UseStatisticsOnSystemTables.Override(ctx, &st.SV, false) | ||
stats.AutomaticStatisticsOnSystemTables.Override(ctx, &st.SV, false) | ||
|
||
// Lower the target duration for reporting tenant usage so that it can be | ||
// measured accurately. Avoid decreasing too far, since doing so can add | ||
// measurable overhead. | ||
tenantcostclient.TargetPeriodSetting.Override(ctx, &st.SV, time.Millisecond*500) | ||
|
||
params := base.TestServerArgs{ | ||
Settings: st, | ||
DisableDefaultTestTenant: true, | ||
} | ||
|
||
s, mainDB, _ := serverutils.StartServer(t, params) | ||
defer s.Stopper().Stop(ctx) | ||
sysDB := sqlutils.MakeSQLRunner(mainDB) | ||
|
||
tenantID := serverutils.TestTenantID() | ||
tenant1, tenantDB1 := serverutils.StartTenant(t, s, base.TestTenantArgs{ | ||
TenantID: tenantID, | ||
Settings: st, | ||
}) | ||
defer tenant1.Stopper().Stop(ctx) | ||
defer tenantDB1.Close() | ||
tdb := sqlutils.MakeSQLRunner(tenantDB1) | ||
tdb.Exec(t, "SET CLUSTER SETTING sql.stats.automatic_collection.enabled=false") | ||
tdb.Exec(t, "CREATE TABLE abcd (a INT, b INT, c INT, d INT, INDEX (a, b, c))") | ||
|
||
type testCase struct { | ||
sql string | ||
count int | ||
} | ||
testCases := []testCase{ | ||
{ // Insert statement | ||
sql: "INSERT INTO abcd (SELECT t%2, t%3, t, -t FROM generate_series(1,50000) g(t))", | ||
count: 1, | ||
}, | ||
{ // Point query | ||
sql: "SELECT a FROM abcd WHERE (a, b) = (1, 1)", | ||
count: 10, | ||
}, | ||
{ // Range query | ||
sql: "SELECT a FROM abcd WHERE (a, b) = (1, 1) AND c > 0 AND c < 10000", | ||
count: 10, | ||
}, | ||
{ // Aggregate | ||
sql: "SELECT count(*) FROM abcd", | ||
count: 10, | ||
}, | ||
{ // Distinct | ||
sql: "SELECT DISTINCT ON (a, b) * FROM abcd", | ||
count: 10, | ||
}, | ||
{ // Full table scan | ||
sql: "SELECT a FROM abcd", | ||
count: 10, | ||
}, | ||
{ // Lookup join | ||
sql: "SELECT a FROM (VALUES (1, 1), (0, 2)) v(x, y) INNER LOOKUP JOIN abcd ON (a, b) = (x, y)", | ||
count: 10, | ||
}, | ||
{ // Index join | ||
sql: "SELECT * FROM abcd WHERE (a, b) = (0, 0)", | ||
count: 10, | ||
}, | ||
{ // No kv IO, lots of network egress. | ||
sql: "SELECT 'deadbeef' FROM generate_series(1, 50000)", | ||
count: 10, | ||
}, | ||
} | ||
|
||
var err error | ||
var tenantEstimatedRUs int | ||
for _, tc := range testCases { | ||
for i := 0; i < tc.count; i++ { | ||
output := tdb.QueryStr(t, "EXPLAIN ANALYZE "+tc.sql) | ||
var estimatedRU int | ||
for _, row := range output { | ||
if len(row) != 1 { | ||
t.Fatalf("expected one column") | ||
} | ||
val := row[0] | ||
if strings.Contains(val, "estimated RUs consumed") { | ||
substr := strings.Split(val, " ") | ||
require.Equalf(t, 4, len(substr), "expected RU consumption message to have four words") | ||
ruCountStr := strings.Replace(strings.TrimSpace(substr[3]), ",", "", -1) | ||
estimatedRU, err = strconv.Atoi(ruCountStr) | ||
require.NoError(t, err, "failed to retrieve estimated RUs") | ||
break | ||
} | ||
} | ||
tenantEstimatedRUs += estimatedRU | ||
} | ||
} | ||
|
||
getTenantRUs := func() float64 { | ||
// Sleep to ensure the measured RU consumption gets recorded in the | ||
// tenant_usage table. | ||
time.Sleep(time.Second) | ||
var consumptionBytes []byte | ||
var consumption roachpb.TenantConsumption | ||
var tenantRUs float64 | ||
rows := sysDB.Query(t, | ||
fmt.Sprintf( | ||
"SELECT total_consumption FROM system.tenant_usage WHERE tenant_id = %d AND instance_id = 0", | ||
tenantID.ToUint64(), | ||
), | ||
) | ||
for rows.Next() { | ||
require.NoError(t, rows.Scan(&consumptionBytes)) | ||
if len(consumptionBytes) == 0 { | ||
continue | ||
} | ||
require.NoError(t, protoutil.Unmarshal(consumptionBytes, &consumption)) | ||
tenantRUs += consumption.RU | ||
} | ||
return tenantRUs | ||
} | ||
tenantStartRUs := getTenantRUs() | ||
|
||
var tenantMeasuredRUs float64 | ||
for _, tc := range testCases { | ||
for i := 0; i < tc.count; i++ { | ||
tdb.QueryStr(t, tc.sql) | ||
} | ||
} | ||
|
||
// Check the estimated RU aggregate for all the queries against the actual | ||
// measured RU consumption for the tenant. | ||
tenantMeasuredRUs = getTenantRUs() - tenantStartRUs | ||
const deltaFraction = 0.25 | ||
allowedDelta := tenantMeasuredRUs * deltaFraction | ||
require.InDeltaf(t, tenantMeasuredRUs, tenantEstimatedRUs, allowedDelta, | ||
"estimated RUs (%d) were not within %f RUs of the expected value (%f)", | ||
tenantEstimatedRUs, | ||
allowedDelta, | ||
tenantMeasuredRUs, | ||
) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data") | ||
load("@io_bazel_rules_go//go:def.bzl", "go_library") | ||
|
||
go_library( | ||
name = "multitenantcpu", | ||
srcs = ["cpu_usage.go"], | ||
importpath = "github.com/cockroachdb/cockroach/pkg/multitenant/multitenantcpu", | ||
visibility = ["//visibility:public"], | ||
deps = [ | ||
"//pkg/multitenant", | ||
"//pkg/server/status", | ||
"//pkg/util/log", | ||
"//pkg/util/timeutil", | ||
], | ||
) | ||
|
||
get_x_data(name = "get_x_data") |
Oops, something went wrong.