-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
multitenantccl: add sanity testing for ru estimation
This commit adds a sanity test for the RU estimates produced by running queries with `EXPLAIN ANALYZE` on a tenant. The test runs each test query several times, ensuring that the variance of the estimates does not exceed 5% of the mean. It then runs all test queries without `EXPLAIN ANALYZE` and compares the resulting actual RU measurement to the aggregated estimates. Release note: None
- Loading branch information
1 parent
1543bb0
commit 29969e8
Showing
3 changed files
with
218 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
209 changes: 209 additions & 0 deletions
209
pkg/ccl/multitenantccl/tenantcostclient/query_ru_estimate_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
package tenantcostclient_test | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"math" | ||
"strconv" | ||
"strings" | ||
"testing" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/base" | ||
_ "github.com/cockroachdb/cockroach/pkg/ccl" // ccl init hooks | ||
_ "github.com/cockroachdb/cockroach/pkg/ccl/kvccl/kvtenantccl" | ||
"github.com/cockroachdb/cockroach/pkg/ccl/multitenantccl/tenantcostclient" | ||
_ "github.com/cockroachdb/cockroach/pkg/ccl/multitenantccl/tenantcostserver" | ||
"github.com/cockroachdb/cockroach/pkg/roachpb" | ||
"github.com/cockroachdb/cockroach/pkg/settings/cluster" | ||
"github.com/cockroachdb/cockroach/pkg/sql/stats" | ||
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils" | ||
"github.com/cockroachdb/cockroach/pkg/testutils/skip" | ||
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" | ||
"github.com/cockroachdb/cockroach/pkg/util/leaktest" | ||
"github.com/cockroachdb/cockroach/pkg/util/log" | ||
"github.com/cockroachdb/cockroach/pkg/util/protoutil" | ||
stats2 "github.com/montanaflynn/stats" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
// TestEstimateQueryRUConsumption is a sanity check for the RU estimates | ||
// produced for queries that are run by a tenant under EXPLAIN ANALYZE. The RU | ||
// consumption of a query is not deterministic, since it depends on inexact | ||
// quantities like the (already estimated) CPU usage and the memory size of the | ||
// results returned to the client. Therefore, the test runs each query multiple | ||
// times and ensures that the variance is within a limit, and then checks that | ||
// the total estimated RU consumption is within reasonable distance from the | ||
// actual measured RUs for the tenant. | ||
func TestEstimateQueryRUConsumption(t *testing.T) { | ||
defer leaktest.AfterTest(t)() | ||
defer log.Scope(t).Close(t) | ||
skip.UnderStress(t, "the test is sensitive to background activity and may fail under stress") | ||
skip.UnderShort(t) | ||
|
||
ctx := context.Background() | ||
|
||
st := cluster.MakeTestingClusterSettings() | ||
stats.AutomaticStatisticsClusterMode.Override(ctx, &st.SV, false) | ||
stats.UseStatisticsOnSystemTables.Override(ctx, &st.SV, false) | ||
stats.AutomaticStatisticsOnSystemTables.Override(ctx, &st.SV, false) | ||
|
||
// Lower the target duration for reporting tenant usage so that it can be | ||
// measured accurately. Avoid decreasing too far, since doing so can add | ||
// measurable overhead. | ||
tenantcostclient.TargetPeriodSetting.Override(ctx, &st.SV, time.Millisecond*500) | ||
|
||
params := base.TestServerArgs{ | ||
Settings: st, | ||
DisableDefaultTestTenant: true, | ||
} | ||
|
||
params.DisableDefaultTestTenant = true | ||
s, mainDB, _ := serverutils.StartServer(t, params) | ||
defer s.Stopper().Stop(ctx) | ||
sysDB := sqlutils.MakeSQLRunner(mainDB) | ||
|
||
tenantID := serverutils.TestTenantID() | ||
tenant1, tenantDB1 := serverutils.StartTenant(t, s, base.TestTenantArgs{ | ||
TenantID: tenantID, | ||
Settings: st, | ||
}) | ||
defer tenant1.Stopper().Stop(ctx) | ||
defer tenantDB1.Close() | ||
tdb := sqlutils.MakeSQLRunner(tenantDB1) | ||
tdb.Exec(t, "SET CLUSTER SETTING sql.stats.automatic_collection.enabled=false") | ||
tdb.Exec(t, "CREATE TABLE abcd (a INT, b INT, c INT, d INT, INDEX (a, b, c))") | ||
tdb.Exec(t, "INSERT INTO abcd (SELECT t%2, t%3, t, -t FROM generate_series(1,100000) g(t))") | ||
|
||
type testCase struct { | ||
sql string | ||
count int | ||
} | ||
testCases := []testCase{ | ||
{ // Point query | ||
sql: "SELECT a FROM abcd WHERE (a, b) = (1, 1)", | ||
count: 10, | ||
}, | ||
{ // Range query | ||
sql: "SELECT a FROM abcd WHERE (a, b) = (1, 1) AND c > 0 AND c < 10000", | ||
count: 10, | ||
}, | ||
{ // Aggregate | ||
sql: "SELECT count(*) FROM abcd", | ||
count: 10, | ||
}, | ||
{ // Distinct | ||
sql: "SELECT DISTINCT ON (a, b) * FROM abcd", | ||
count: 10, | ||
}, | ||
{ // Full table scan | ||
sql: "SELECT a FROM abcd", | ||
count: 10, | ||
}, | ||
{ // Lookup join | ||
sql: "SELECT a FROM (VALUES (1, 1), (0, 2)) v(x, y) INNER LOOKUP JOIN abcd ON (a, b) = (x, y)", | ||
count: 10, | ||
}, | ||
{ // Index join | ||
sql: "SELECT * FROM abcd WHERE (a, b) = (0, 0)", | ||
count: 10, | ||
}, | ||
{ // No kv IO, lots of network egress. | ||
sql: "SELECT 'deadbeef' FROM generate_series(1, 50000)", | ||
count: 10, | ||
}, | ||
} | ||
|
||
var err error | ||
var tenantEstimatedRUs int | ||
for tcNum, tc := range testCases { | ||
testCaseRUEstimates := make([]float64, tc.count) | ||
for i := 0; i < tc.count; i++ { | ||
output := tdb.QueryStr(t, "EXPLAIN ANALYZE "+tc.sql) | ||
var estimatedRU int | ||
for _, row := range output { | ||
if len(row) != 1 { | ||
t.Fatalf("expected one column") | ||
} | ||
val := row[0] | ||
if strings.Contains(val, "estimated RUs consumed") { | ||
substr := strings.Split(val, " ") | ||
if len(substr) == 4 { | ||
ruCountStr := strings.Replace(strings.TrimSpace(substr[3]), ",", "", -1) | ||
estimatedRU, err = strconv.Atoi(ruCountStr) | ||
require.NoError(t, err, "failed to retrieve estimated RUs") | ||
break | ||
} | ||
} | ||
} | ||
tenantEstimatedRUs += estimatedRU | ||
testCaseRUEstimates[i] = float64(estimatedRU) | ||
} | ||
var mean, variance float64 | ||
mean, err = stats2.Mean(testCaseRUEstimates) | ||
if mean == 0 { | ||
// Sufficiently cheap queries will return zero as the RU estimate. | ||
continue | ||
} | ||
require.NoError(t, err, "failed to calculate mean for test case %d", tcNum) | ||
variance, err = stats2.Variance(testCaseRUEstimates) | ||
require.NoError(t, err, "failed to calculate variance for test case %d", tcNum) | ||
stdDev := math.Sqrt(variance) | ||
const minAllowedStdDev = 10 | ||
const maxStdDevFraction = 0.05 | ||
maxAllowedStdDev := mean * maxStdDevFraction | ||
if maxAllowedStdDev < minAllowedStdDev { | ||
maxAllowedStdDev = minAllowedStdDev | ||
} | ||
require.Lessf(t, stdDev, maxAllowedStdDev, | ||
"standard deviation of RU estimates is %f%% of the mean RUs (%f) for test case %d", | ||
(stdDev/mean)*100, | ||
mean, | ||
tcNum, | ||
) | ||
} | ||
|
||
getTenantRUs := func() float64 { | ||
// Sleep to ensure the measured RU consumption gets recorded in the | ||
// tenant_usage table. | ||
time.Sleep(time.Second) | ||
var consumptionBytes []byte | ||
var consumption roachpb.TenantConsumption | ||
var tenantRUs float64 | ||
rows := sysDB.Query(t, | ||
fmt.Sprintf( | ||
"SELECT total_consumption FROM system.tenant_usage WHERE tenant_id = %d AND instance_id = 0", | ||
tenantID.ToUint64(), | ||
), | ||
) | ||
for rows.Next() { | ||
require.NoError(t, rows.Scan(&consumptionBytes)) | ||
if len(consumptionBytes) == 0 { | ||
continue | ||
} | ||
require.NoError(t, protoutil.Unmarshal(consumptionBytes, &consumption)) | ||
tenantRUs += consumption.RU | ||
} | ||
return tenantRUs | ||
} | ||
tenantStartRUs := getTenantRUs() | ||
|
||
var tenantMeasuredRUs float64 | ||
for _, tc := range testCases { | ||
for i := 0; i < tc.count; i++ { | ||
tdb.QueryStr(t, tc.sql) | ||
} | ||
} | ||
|
||
// Check the estimated RU aggregate for all the queries against the actual | ||
// measured RU consumption for the tenant. | ||
tenantMeasuredRUs = getTenantRUs() - tenantStartRUs | ||
const deltaFraction = 0.5 | ||
allowedDelta := tenantMeasuredRUs * deltaFraction | ||
require.InDeltaf(t, tenantMeasuredRUs, tenantEstimatedRUs, allowedDelta, | ||
"estimated RUs (%d) were not within %f RUs of the expected value (%f)", | ||
tenantEstimatedRUs, | ||
allowedDelta, | ||
tenantMeasuredRUs, | ||
) | ||
} |