From 93a1fdc1f4b988c93d1f28681c890165eda3a4b3 Mon Sep 17 00:00:00 2001 From: Raphael 'kena' Poss Date: Wed, 5 Jun 2019 09:31:31 +0200 Subject: [PATCH 1/2] sql: fix the retry counter in the statement statistics The statement statistics contain a column `max_retries` intended to show the maximum number of times a statement was retried. Due to a bug, this counter was previously always zero, *even for statements that were actually retried*. We hadn't noticed because there was no test to ensure that the value would increase. This patch fixes the bugs and verifies that the value does indeed increase. Release note (bug fix): The `max_retries` column in the statement statistics now properly displays the maximum number of times a statement was retried. --- pkg/sql/conn_executor.go | 3 +-- .../testdata/logic_test/crdb_internal | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index d3fba598ca42..fa039eacee4c 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -1024,8 +1024,6 @@ func (ex *connExecutor) resetExtraTxnState( ex.extraTxnState.tables.databaseCache = dbCacheHolder.getDatabaseCache() - ex.extraTxnState.autoRetryCounter = 0 - // Close all portals. for name, p := range ex.extraTxnState.prepStmtsNamespace.portals { p.decRef(ctx) @@ -1930,6 +1928,7 @@ func (ex *connExecutor) txnStateTransitionsApplyWrapper( switch advInfo.txnEvent { case noEvent: case txnStart: + ex.extraTxnState.autoRetryCounter = 0 case txnCommit: if res.Err() != nil { err := errorutil.UnexpectedWithIssueErrorf( diff --git a/pkg/sql/logictest/testdata/logic_test/crdb_internal b/pkg/sql/logictest/testdata/logic_test/crdb_internal index 784249ccdd5a..ae0047404ad2 100644 --- a/pkg/sql/logictest/testdata/logic_test/crdb_internal +++ b/pkg/sql/logictest/testdata/logic_test/crdb_internal @@ -413,3 +413,29 @@ query T SELECT crdb_internal.pretty_key(e'\\xa82a00918ed9':::BYTES, (-5096189069466142898):::INT8); ---- /Table/32/???/9/6/81 + +subtest max_retry_counter +# Verify that the max_retry counter in statement stats actually increases with retries. + +statement ok +SET application_name = 'test_max_retry' + +statement OK +SELECT crdb_internal.force_retry('2ms'::INTERVAL) + +statement OK +RESET application_name + +# Note: two rows of output are expected: one for the statement that +# failed with a retry error, and one for the final retry attempt that +# succeeded without an error. +# Wel also expect the RESET statement to not have any retry. +query TBB +SELECT key, (max_retries > 1), flags LIKE '!%' AS f + FROM crdb_internal.node_statement_statistics + WHERE application_name = 'test_max_retry' +ORDER BY key, f +---- +SELECT crdb_internal.force_retry(_) true false +SELECT crdb_internal.force_retry(_) true true +SET application_name = DEFAULT false false From c9810588bd076cc286b61d6d56c504c0c4b2225e Mon Sep 17 00:00:00 2001 From: Raphael 'kena' Poss Date: Wed, 5 Jun 2019 09:36:06 +0200 Subject: [PATCH 2/2] sql: display the nr of server-side auto retries in exec/audit logs As originally implemented, the logic that produces the statement execution logs and the audit logs produces one line of logging for every statement *executed by the server*. If a statement is retried automatically, there is one line of log per retry attempt. This level of detail is desirable for troubleshooting because it enables to analyze different things: - the details of the transaction during the retry failures (the text of the retry error contains useful troubleshooting details). - how long each individual attempt lasted (in the latency column); - if/when the number of rows affected changes (in the rows affected column; this is needed for auditing). However, this amount of detail introduces an element of surprise: a user who does not understand how to distinguish "work sent by the client" and "work executed by the server" (because, for example, traditional databases usually conflate the two), may be surprised by this behavior. For example, to count how many stastements were sent by a client, the user would need to be taught how to ignore those log entries that report a retry error, and possibly be taught about the internals of the execution machinery. This amount of effort is indesirable. To alleviate this effort, and generally increase the readability of the statement log files, this patch introduces an additional column at the end of each log line, containing the current value of the auto-retry counter. With this patch implemented, all the log lines with this value set to zero correspond to the work that was sent by a client, prior to any retry attempt. Release note (sql change): The logging format for statement execution (as enabled by `sql.trace.log_statement_execute`) and audit logs (as enabled by `ALTER TABLE .. EXPERIMENTAL AUDIT SET ...`) is extended with one column that reports internal execution retries performed automatically by the server. The value zero is logged for the first retry attempt and thus corresponds to statements directly receivd from a client. --- pkg/sql/conn_executor_exec.go | 4 +++- pkg/sql/exec_log.go | 34 +++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/pkg/sql/conn_executor_exec.go b/pkg/sql/conn_executor_exec.go index 70a252cbb6d7..6511bc91b4b7 100644 --- a/pkg/sql/conn_executor_exec.go +++ b/pkg/sql/conn_executor_exec.go @@ -668,7 +668,9 @@ func (ex *connExecutor) dispatchToExecutionEngine( } } - defer func() { planner.maybeLogStatement(ctx, "exec", res.RowsAffected(), res.Err()) }() + defer func() { + planner.maybeLogStatement(ctx, "exec", ex.extraTxnState.autoRetryCounter, res.RowsAffected(), res.Err()) + }() planner.statsCollector.PhaseTimes()[plannerEndLogicalPlan] = timeutil.Now() ex.sessionTracing.TracePlanEnd(ctx, err) diff --git a/pkg/sql/exec_log.go b/pkg/sql/exec_log.go index 4d2099264746..81c7cd89426b 100644 --- a/pkg/sql/exec_log.go +++ b/pkg/sql/exec_log.go @@ -31,15 +31,15 @@ import ( // The log format is currently as follows: // // Example audit log line: -// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT * FROM ab" {} 123.45 12 OK -// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT nonexistent FROM ab" {} 0.123 12 ERROR +// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT * FROM ab" {} 123.45 12 OK 0 +// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT nonexistent FROM ab" {} 0.123 12 ERROR 0 // Example execution log: -// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {} "SELECT * FROM ab" {} 123.45 12 OK -// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {} "SELECT nonexistent FROM ab" {} 0.123 0 "column \"nonexistent\" not found" +// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {} "SELECT * FROM ab" {} 123.45 12 OK 0 +// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {} "SELECT nonexistent FROM ab" {} 0.123 0 "column \"nonexistent\" not found" 0 // // Explanation of fields: -// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT nonexistent FROM ab" {} 0.123 12 ERROR -// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT nonexistent FROM ab" {} 0.123 12 ERROR 0 +// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ // \ .../ // '- prefix generated by CockroachDB's standard log package. Contains: // @@ -79,6 +79,8 @@ import ( // - the number of rows that were produced. For troubleshooting. // - the status of the query (OK for success, ERROR or full error // message upon error). Needed for auditing and troubleshooting. +// - the number of times the statement was retried automatically +// by the server so far. // logStatementsExecuteEnabled causes the Executor to log executed // statements and, if any, resulting errors. @@ -90,13 +92,15 @@ var logStatementsExecuteEnabled = settings.RegisterBoolSetting( // maybeLogStatement conditionally records the current statement // (p.curPlan) to the exec / audit logs. -func (p *planner) maybeLogStatement(ctx context.Context, lbl string, rows int, err error) { +func (p *planner) maybeLogStatement( + ctx context.Context, lbl string, numRetries, rows int, err error, +) { p.maybeLogStatementInternal( - ctx, lbl, rows, err, p.statsCollector.PhaseTimes()[sessionQueryReceived]) + ctx, lbl, numRetries, rows, err, p.statsCollector.PhaseTimes()[sessionQueryReceived]) } func (p *planner) maybeLogStatementInternal( - ctx context.Context, lbl string, rows int, err error, startTime time.Time, + ctx context.Context, lbl string, numRetries, rows int, err error, startTime time.Time, ) { // Note: if you find the code below crashing because p.execCfg == nil, // do not add a test "if p.execCfg == nil { do nothing }" ! @@ -152,18 +156,18 @@ func (p *planner) maybeLogStatementInternal( // Now log! if auditEventsDetected { logger := p.execCfg.AuditLogger - logger.Logf(ctx, "%s %q %s %q %s %.3f %d %s", - lbl, appName, logTrigger, stmtStr, plStr, age, rows, auditErrStr) + logger.Logf(ctx, "%s %q %s %q %s %.3f %d %s %d", + lbl, appName, logTrigger, stmtStr, plStr, age, rows, auditErrStr, numRetries) } if logExecuteEnabled { logger := p.execCfg.ExecLogger - logger.Logf(ctx, "%s %q %s %q %s %.3f %d %q", - lbl, appName, logTrigger, stmtStr, plStr, age, rows, execErrStr) + logger.Logf(ctx, "%s %q %s %q %s %.3f %d %q %d", + lbl, appName, logTrigger, stmtStr, plStr, age, rows, execErrStr, numRetries) } if logV { // Copy to the main log. - log.VEventf(ctx, 2, "%s %q %s %q %s %.3f %d %q", - lbl, appName, logTrigger, stmtStr, plStr, age, rows, execErrStr) + log.VEventf(ctx, 2, "%s %q %s %q %s %.3f %d %q %d", + lbl, appName, logTrigger, stmtStr, plStr, age, rows, execErrStr, numRetries) } }