Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
38028: sql: fix the auto-retry counter in stats + log it in statement/audit logs r=knz a=knz

### sql: fix the retry counter in the statement statistics

The statement statistics contain a column `max_retries` intended
to show the maximum number of times a statement was retried.

Due to a bug, this counter was previously always zero, *even for
statements that were actually retried*. We hadn't noticed because
there was no test to ensure that the value would increase.

This patch fixes the bugs and verifies that the value does indeed
increase.

Release note (bug fix): The `max_retries` column in the statement
statistics now properly displays the maximum number of times
a statement was retried.

### sql: display the nr of server-side auto retries in exec/audit logs

As originally implemented, the logic that produces the statement
execution logs and the audit logs produces one line of logging for
every statement *executed by the server*. If a statement is retried
automatically, there is one line of log per retry attempt.

This level of detail is desirable for troubleshooting because it
enables to analyze different things:

- the details of the transaction during the retry failures (the text
  of the retry error contains useful troubleshooting details).
- how long each individual attempt lasted (in the latency column);
- if/when the number of rows affected changes (in the rows affected
  column; this is needed for auditing).

However, this amount of detail introduces an element of surprise: a
user who does not understand how to distinguish "work sent by the
client" and "work executed by the server" (because, for example,
traditional databases usually conflate the two), may be surprised
by this behavior.

For example, to count how many stastements were sent by a client,
the user would need to be taught how to ignore those log entries
that report a retry error, and possibly be taught about the internals
of the execution machinery. This amount of effort is indesirable.

To alleviate this effort, and generally increase the readability of
the statement log files, this patch introduces an additional column at
the end of each log line, containing the current value of the
auto-retry counter.

With this patch implemented, all the log lines with this value set to
zero correspond to the work that was sent by a client, prior to any
retry attempt.

Release note (sql change): The logging format for statement
execution (as enabled by `sql.trace.log_statement_execute`) and audit
logs (as enabled by `ALTER TABLE .. EXPERIMENTAL AUDIT SET ...`) is
extended with one column that reports internal execution retries
performed automatically by the server. The value zero is logged for
the first retry attempt and thus corresponds to statements directly
receivd from a client.



Co-authored-by: Raphael 'kena' Poss <knz@cockroachlabs.com>
  • Loading branch information
craig[bot] and knz committed Jun 5, 2019
2 parents eb73c4c + c981058 commit ad21052
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 18 deletions.
3 changes: 1 addition & 2 deletions pkg/sql/conn_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -1024,8 +1024,6 @@ func (ex *connExecutor) resetExtraTxnState(

ex.extraTxnState.tables.databaseCache = dbCacheHolder.getDatabaseCache()

ex.extraTxnState.autoRetryCounter = 0

// Close all portals.
for name, p := range ex.extraTxnState.prepStmtsNamespace.portals {
p.decRef(ctx)
Expand Down Expand Up @@ -1930,6 +1928,7 @@ func (ex *connExecutor) txnStateTransitionsApplyWrapper(
switch advInfo.txnEvent {
case noEvent:
case txnStart:
ex.extraTxnState.autoRetryCounter = 0
case txnCommit:
if res.Err() != nil {
err := errorutil.UnexpectedWithIssueErrorf(
Expand Down
4 changes: 3 additions & 1 deletion pkg/sql/conn_executor_exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,9 @@ func (ex *connExecutor) dispatchToExecutionEngine(
}
}

defer func() { planner.maybeLogStatement(ctx, "exec", res.RowsAffected(), res.Err()) }()
defer func() {
planner.maybeLogStatement(ctx, "exec", ex.extraTxnState.autoRetryCounter, res.RowsAffected(), res.Err())
}()

planner.statsCollector.PhaseTimes()[plannerEndLogicalPlan] = timeutil.Now()
ex.sessionTracing.TracePlanEnd(ctx, err)
Expand Down
34 changes: 19 additions & 15 deletions pkg/sql/exec_log.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ import (
// The log format is currently as follows:
//
// Example audit log line:
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT * FROM ab" {} 123.45 12 OK
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT nonexistent FROM ab" {} 0.123 12 ERROR
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT * FROM ab" {} 123.45 12 OK 0
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT nonexistent FROM ab" {} 0.123 12 ERROR 0
// Example execution log:
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {} "SELECT * FROM ab" {} 123.45 12 OK
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {} "SELECT nonexistent FROM ab" {} 0.123 0 "column \"nonexistent\" not found"
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {} "SELECT * FROM ab" {} 123.45 12 OK 0
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {} "SELECT nonexistent FROM ab" {} 0.123 0 "column \"nonexistent\" not found" 0
//
// Explanation of fields:
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT nonexistent FROM ab" {} 0.123 12 ERROR
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// I180211 07:30:48.832004 317 sql/exec_log.go:90 [client=127.0.0.1:62503,user=root,n1] 13 exec "cockroach" {"ab"[53]:READ} "SELECT nonexistent FROM ab" {} 0.123 12 ERROR 0
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// \ .../
// '- prefix generated by CockroachDB's standard log package. Contains:
//
Expand Down Expand Up @@ -79,6 +79,8 @@ import (
// - the number of rows that were produced. For troubleshooting.
// - the status of the query (OK for success, ERROR or full error
// message upon error). Needed for auditing and troubleshooting.
// - the number of times the statement was retried automatically
// by the server so far.

// logStatementsExecuteEnabled causes the Executor to log executed
// statements and, if any, resulting errors.
Expand All @@ -90,13 +92,15 @@ var logStatementsExecuteEnabled = settings.RegisterBoolSetting(

// maybeLogStatement conditionally records the current statement
// (p.curPlan) to the exec / audit logs.
func (p *planner) maybeLogStatement(ctx context.Context, lbl string, rows int, err error) {
func (p *planner) maybeLogStatement(
ctx context.Context, lbl string, numRetries, rows int, err error,
) {
p.maybeLogStatementInternal(
ctx, lbl, rows, err, p.statsCollector.PhaseTimes()[sessionQueryReceived])
ctx, lbl, numRetries, rows, err, p.statsCollector.PhaseTimes()[sessionQueryReceived])
}

func (p *planner) maybeLogStatementInternal(
ctx context.Context, lbl string, rows int, err error, startTime time.Time,
ctx context.Context, lbl string, numRetries, rows int, err error, startTime time.Time,
) {
// Note: if you find the code below crashing because p.execCfg == nil,
// do not add a test "if p.execCfg == nil { do nothing }" !
Expand Down Expand Up @@ -152,18 +156,18 @@ func (p *planner) maybeLogStatementInternal(
// Now log!
if auditEventsDetected {
logger := p.execCfg.AuditLogger
logger.Logf(ctx, "%s %q %s %q %s %.3f %d %s",
lbl, appName, logTrigger, stmtStr, plStr, age, rows, auditErrStr)
logger.Logf(ctx, "%s %q %s %q %s %.3f %d %s %d",
lbl, appName, logTrigger, stmtStr, plStr, age, rows, auditErrStr, numRetries)
}
if logExecuteEnabled {
logger := p.execCfg.ExecLogger
logger.Logf(ctx, "%s %q %s %q %s %.3f %d %q",
lbl, appName, logTrigger, stmtStr, plStr, age, rows, execErrStr)
logger.Logf(ctx, "%s %q %s %q %s %.3f %d %q %d",
lbl, appName, logTrigger, stmtStr, plStr, age, rows, execErrStr, numRetries)
}
if logV {
// Copy to the main log.
log.VEventf(ctx, 2, "%s %q %s %q %s %.3f %d %q",
lbl, appName, logTrigger, stmtStr, plStr, age, rows, execErrStr)
log.VEventf(ctx, 2, "%s %q %s %q %s %.3f %d %q %d",
lbl, appName, logTrigger, stmtStr, plStr, age, rows, execErrStr, numRetries)
}
}

Expand Down
26 changes: 26 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/crdb_internal
Original file line number Diff line number Diff line change
Expand Up @@ -413,3 +413,29 @@ query T
SELECT crdb_internal.pretty_key(e'\\xa82a00918ed9':::BYTES, (-5096189069466142898):::INT8);
----
/Table/32/???/9/6/81

subtest max_retry_counter
# Verify that the max_retry counter in statement stats actually increases with retries.

statement ok
SET application_name = 'test_max_retry'

statement OK
SELECT crdb_internal.force_retry('2ms'::INTERVAL)

statement OK
RESET application_name

# Note: two rows of output are expected: one for the statement that
# failed with a retry error, and one for the final retry attempt that
# succeeded without an error.
# Wel also expect the RESET statement to not have any retry.
query TBB
SELECT key, (max_retries > 1), flags LIKE '!%' AS f
FROM crdb_internal.node_statement_statistics
WHERE application_name = 'test_max_retry'
ORDER BY key, f
----
SELECT crdb_internal.force_retry(_) true false
SELECT crdb_internal.force_retry(_) true true
SET application_name = DEFAULT false false

0 comments on commit ad21052

Please sign in to comment.