Skip to content

Commit

Permalink
sql: add infrastructure for tracing queries on demand
Browse files Browse the repository at this point in the history
This patch adds some infrastructure, to be hooked up to the AdminUI in
the future. The system.statement_diagnostics_requests and
system.statement_diagnostics tables are introduced. The idea is that all
nodes poll the former for fingerprints of requests that need information
collected and, when nodes then see a query with such a fingerprint, they
race to write the requested info (initially, a trace) into the latter.

system.statement_bundle_chunks is also introduced, to be used in the
future for gatherig more generic data about a query.

Release note: None
  • Loading branch information
RaduBerinde authored and andreimatei committed Feb 27, 2020
1 parent 55e4b23 commit 10d989f
Show file tree
Hide file tree
Showing 35 changed files with 3,283 additions and 798 deletions.
793 changes: 653 additions & 140 deletions c-deps/libroach/protos/util/tracing/recorded_span.pb.cc

Large diffs are not rendered by default.

786 changes: 610 additions & 176 deletions c-deps/libroach/protos/util/tracing/recorded_span.pb.h

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,6 @@
<tr><td><code>trace.debug.enable</code></td><td>boolean</td><td><code>false</code></td><td>if set, traces for recent requests can be seen in the /debug page</td></tr>
<tr><td><code>trace.lightstep.token</code></td><td>string</td><td><code></code></td><td>if set, traces go to Lightstep using this token</td></tr>
<tr><td><code>trace.zipkin.collector</code></td><td>string</td><td><code></code></td><td>if set, traces go to the given Zipkin instance (example: '127.0.0.1:9411'); ignored if trace.lightstep.token is set</td></tr>
<tr><td><code>version</code></td><td>custom validation</td><td><code>19.2-13</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
<tr><td><code>version</code></td><td>custom validation</td><td><code>19.2-14</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
</tbody>
</table>
30 changes: 24 additions & 6 deletions pkg/cli/zip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ requesting heap profile for node 1... writing: debug/nodes/1/heap.pprof
requesting heap files for node 1... 0 found
requesting goroutine files for node 1... 0 found
requesting log file ...
requesting ranges... 29 found
requesting ranges... 32 found
writing: debug/nodes/1/ranges/1.json
writing: debug/nodes/1/ranges/2.json
writing: debug/nodes/1/ranges/3.json
Expand Down Expand Up @@ -194,13 +194,16 @@ writing: debug/nodes/1/ranges/26.json
writing: debug/nodes/1/ranges/27.json
writing: debug/nodes/1/ranges/28.json
writing: debug/nodes/1/ranges/29.json
writing: debug/nodes/1/ranges/30.json
writing: debug/nodes/1/ranges/31.json
writing: debug/nodes/1/ranges/32.json
requesting list of SQL databases... 3 found
requesting database details for defaultdb... writing: debug/schema/defaultdb@details.json
0 tables found
requesting database details for postgres... writing: debug/schema/postgres@details.json
0 tables found
requesting database details for system... writing: debug/schema/system@details.json
23 tables found
26 tables found
requesting table details for system.comments... writing: debug/schema/system/comments.json
requesting table details for system.descriptor... writing: debug/schema/system/descriptor.json
requesting table details for system.eventlog... writing: debug/schema/system/eventlog.json
Expand All @@ -219,6 +222,9 @@ requesting table details for system.reports_meta... writing: debug/schema/system
requesting table details for system.role_members... writing: debug/schema/system/role_members.json
requesting table details for system.role_options... writing: debug/schema/system/role_options.json
requesting table details for system.settings... writing: debug/schema/system/settings.json
requesting table details for system.statement_bundle_chunks... writing: debug/schema/system/statement_bundle_chunks.json
requesting table details for system.statement_diagnostics... writing: debug/schema/system/statement_diagnostics.json
requesting table details for system.statement_diagnostics_requests... writing: debug/schema/system/statement_diagnostics_requests.json
requesting table details for system.table_statistics... writing: debug/schema/system/table_statistics.json
requesting table details for system.ui... writing: debug/schema/system/ui.json
requesting table details for system.users... writing: debug/schema/system/users.json
Expand Down Expand Up @@ -284,7 +290,7 @@ requesting table details for defaultdb.pg_catalog.pg_class... writing: debug/sch
requesting database details for postgres... writing: debug/schema/postgres@details.json
0 tables found
requesting database details for system... writing: debug/schema/system-1@details.json
23 tables found
26 tables found
requesting table details for system.comments... writing: debug/schema/system-1/comments.json
requesting table details for system.descriptor... writing: debug/schema/system-1/descriptor.json
requesting table details for system.eventlog... writing: debug/schema/system-1/eventlog.json
Expand All @@ -303,6 +309,9 @@ requesting table details for system.reports_meta... writing: debug/schema/system
requesting table details for system.role_members... writing: debug/schema/system-1/role_members.json
requesting table details for system.role_options... writing: debug/schema/system-1/role_options.json
requesting table details for system.settings... writing: debug/schema/system-1/settings.json
requesting table details for system.statement_bundle_chunks... writing: debug/schema/system-1/statement_bundle_chunks.json
requesting table details for system.statement_diagnostics... writing: debug/schema/system-1/statement_diagnostics.json
requesting table details for system.statement_diagnostics_requests... writing: debug/schema/system-1/statement_diagnostics_requests.json
requesting table details for system.table_statistics... writing: debug/schema/system-1/table_statistics.json
requesting table details for system.ui... writing: debug/schema/system-1/ui.json
requesting table details for system.users... writing: debug/schema/system-1/users.json
Expand Down Expand Up @@ -561,7 +570,7 @@ requesting heap profile for node 1... writing: debug/nodes/1/heap.pprof
requesting heap files for node 1... 0 found
requesting goroutine files for node 1... 0 found
requesting log file ...
requesting ranges... 29 found
requesting ranges... 32 found
writing: debug/nodes/1/ranges/1.json
writing: debug/nodes/1/ranges/2.json
writing: debug/nodes/1/ranges/3.json
Expand Down Expand Up @@ -591,6 +600,9 @@ writing: debug/nodes/1/ranges/26.json
writing: debug/nodes/1/ranges/27.json
writing: debug/nodes/1/ranges/28.json
writing: debug/nodes/1/ranges/29.json
writing: debug/nodes/1/ranges/30.json
writing: debug/nodes/1/ranges/31.json
writing: debug/nodes/1/ranges/32.json
writing: debug/nodes/2/status.json
using SQL connection URL for node 2: postgresql://...
retrieving SQL data for crdb_internal.feature_usage... writing: debug/nodes/2/crdb_internal.feature_usage.txt
Expand Down Expand Up @@ -660,7 +672,7 @@ requesting heap profile for node 3... writing: debug/nodes/3/heap.pprof
requesting heap files for node 3... 0 found
requesting goroutine files for node 3... 0 found
requesting log file ...
requesting ranges... 29 found
requesting ranges... 32 found
writing: debug/nodes/3/ranges/1.json
writing: debug/nodes/3/ranges/2.json
writing: debug/nodes/3/ranges/3.json
Expand Down Expand Up @@ -690,13 +702,16 @@ writing: debug/nodes/3/ranges/26.json
writing: debug/nodes/3/ranges/27.json
writing: debug/nodes/3/ranges/28.json
writing: debug/nodes/3/ranges/29.json
writing: debug/nodes/3/ranges/30.json
writing: debug/nodes/3/ranges/31.json
writing: debug/nodes/3/ranges/32.json
requesting list of SQL databases... 3 found
requesting database details for defaultdb... writing: debug/schema/defaultdb@details.json
0 tables found
requesting database details for postgres... writing: debug/schema/postgres@details.json
0 tables found
requesting database details for system... writing: debug/schema/system@details.json
23 tables found
26 tables found
requesting table details for system.comments... writing: debug/schema/system/comments.json
requesting table details for system.descriptor... writing: debug/schema/system/descriptor.json
requesting table details for system.eventlog... writing: debug/schema/system/eventlog.json
Expand All @@ -715,6 +730,9 @@ requesting table details for system.reports_meta... writing: debug/schema/system
requesting table details for system.role_members... writing: debug/schema/system/role_members.json
requesting table details for system.role_options... writing: debug/schema/system/role_options.json
requesting table details for system.settings... writing: debug/schema/system/settings.json
requesting table details for system.statement_bundle_chunks... writing: debug/schema/system/statement_bundle_chunks.json
requesting table details for system.statement_diagnostics... writing: debug/schema/system/statement_diagnostics.json
requesting table details for system.statement_diagnostics_requests... writing: debug/schema/system/statement_diagnostics_requests.json
requesting table details for system.table_statistics... writing: debug/schema/system/table_statistics.json
requesting table details for system.ui... writing: debug/schema/system/ui.json
requesting table details for system.users... writing: debug/schema/system/users.json
Expand Down
7 changes: 7 additions & 0 deletions pkg/gossip/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ const (
// client connections a node has open. This is used by other nodes in the
// cluster to build a map of the gossip network.
KeyGossipClientsPrefix = "gossip-clients"

// KeyGossipStatementDiagnosticsRequest is the gossip key for new statement
// diagnostics requests. The values is the id of the request that generated
// the notification, as a little-endian-encoded uint64.
// stmtDiagnosticsRequestRegistry listens for notifications and responds by
// polling for new requests.
KeyGossipStatementDiagnosticsRequest = "stmt-diag-req"
)

// MakeKey creates a canonical key under which to gossip a piece of
Expand Down
4 changes: 4 additions & 0 deletions pkg/keys/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,10 @@ const (

RoleOptionsTableID = 33

StatementBundleChunksTableID = 34
StatementDiagnosticsRequestsTableID = 35
StatementDiagnosticsTableID = 36

// CommentType is type for system.comments
DatabaseCommentType = 0
TableCommentType = 1
Expand Down
2 changes: 1 addition & 1 deletion pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -846,7 +846,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
)
s.internalExecutor = internalExecutor
execCfg.InternalExecutor = internalExecutor

s.status.stmtDiagnosticsRequester = execCfg.NewStmtDiagnosticsRequestRegistry()
s.execCfg = &execCfg

s.leaseMgr.SetInternalExecutor(execCfg.InternalExecutor)
Expand Down
27 changes: 14 additions & 13 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,20 @@ func propagateGatewayMetadata(ctx context.Context) context.Context {
type statusServer struct {
log.AmbientContext

st *cluster.Settings
cfg *base.Config
admin *adminServer
db *client.DB
gossip *gossip.Gossip
metricSource metricMarshaler
nodeLiveness *storage.NodeLiveness
storePool *storage.StorePool
rpcCtx *rpc.Context
stores *storage.Stores
stopper *stop.Stopper
sessionRegistry *sql.SessionRegistry
si systemInfoOnce
st *cluster.Settings
cfg *base.Config
admin *adminServer
db *client.DB
gossip *gossip.Gossip
metricSource metricMarshaler
nodeLiveness *storage.NodeLiveness
storePool *storage.StorePool
rpcCtx *rpc.Context
stores *storage.Stores
stopper *stop.Stopper
sessionRegistry *sql.SessionRegistry
si systemInfoOnce
stmtDiagnosticsRequester sql.StmtDiagnosticsRequester
}

// newStatusServer allocates and returns a statusServer.
Expand Down
5 changes: 5 additions & 0 deletions pkg/server/testserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,11 @@ func (ts *TestServer) DistSender() *kv.DistSender {
return ts.DistSenderI().(*kv.DistSender)
}

// SQLServer is part of TestServerInterface.
func (ts *TestServer) SQLServer() interface{} {
return ts.PGServer().SQLServer
}

// DistSQLServer is part of TestServerInterface.
func (ts *TestServer) DistSQLServer() interface{} {
return ts.distSQLServer
Expand Down
10 changes: 10 additions & 0 deletions pkg/settings/cluster/cockroach_versions.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ const (
VersionNoExplicitForeignKeyIndexIDs
VersionHashShardedIndexes
VersionCreateRolePrivilege
VersionStatementDiagnosticsSystemTables

// Add new versions here (step one of two).
)
Expand Down Expand Up @@ -417,6 +418,15 @@ var versionsSingleton = keyedVersions([]keyedVersion{
Key: VersionCreateRolePrivilege,
Version: roachpb.Version{Major: 19, Minor: 2, Unstable: 13},
},
{
// VersionStatementDiagnosticsSystemTables introduces the system tables for
// storing statement information (like traces, bundles).
// In this version and later the system.statement_diagnostics_requests,
// system.statement_diagnostics and system.statement_bundle_chunks tables
// are part of the system bootstap schema.
Key: VersionStatementDiagnosticsSystemTables,
Version: roachpb.Version{Major: 19, Minor: 2, Unstable: 14},
},
// Add new versions here (step two of two).

})
Expand Down
5 changes: 3 additions & 2 deletions pkg/settings/cluster/versionkey_string.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions pkg/sql/conn_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ func (s *Server) Start(ctx context.Context, stopper *stop.Stopper) {
s.PeriodicallyClearSQLStats(ctx, stopper, maxSQLStatReset, &s.reportedStats)
// Start a second loop to clear SQL stats at the requested interval.
s.PeriodicallyClearSQLStats(ctx, stopper, sqlStatReset, &s.sqlStats)

s.PeriodicallyPollForStatementInfoRequests(ctx, stopper)
}

// ResetSQLStats resets the executor's collected sql statistics.
Expand Down Expand Up @@ -599,6 +601,7 @@ func (s *Server) newConnExecutor(
ctxHolder: ctxHolder{connCtx: ctx},
executorType: executorTypeExec,
hasCreatedTemporarySchema: false,
stmtInfoRegistry: s.cfg.stmtInfoRequestRegistry,
}

ex.state.txnAbortCount = ex.metrics.EngineMetrics.TxnAbortCount
Expand Down Expand Up @@ -775,6 +778,30 @@ func (s *Server) PeriodicallyClearSQLStats(
})
}

// PeriodicallyPollForStatementInfoRequests runs a worker that periodically
// polls system.statement_diagnostics_requests.
func (s *Server) PeriodicallyPollForStatementInfoRequests(
ctx context.Context, stopper *stop.Stopper,
) {
pollingInterval := 10 * time.Second
stopper.RunWorker(ctx, func(ctx context.Context) {
ctx, _ = stopper.WithCancelOnQuiesce(ctx)
var timer timeutil.Timer
for {
if err := s.cfg.stmtInfoRequestRegistry.pollRequests(ctx); err != nil {
log.Warningf(ctx, "error polling for statement diagnostics requests: %s", err)
}
timer.Reset(pollingInterval)
select {
case <-stopper.ShouldQuiesce():
return
case <-timer.C:
timer.Read = true
}
}
})
}

type closeType int

const (
Expand Down Expand Up @@ -1067,6 +1094,10 @@ type connExecutor struct {
// hasCreatedTemporarySchema is set if the executor has created a
// temporary schema, which requires special cleanup on close.
hasCreatedTemporarySchema bool

// stmtInfoRequestRegistry is used to track which queries need to have
// information collected.
stmtInfoRegistry *stmtDiagnosticsRequestRegistry
}

// ctxHolder contains a connection's context and, while session tracing is
Expand Down
17 changes: 17 additions & 0 deletions pkg/sql/conn_executor_exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/errors"
"github.com/opentracing/opentracing-go"
)

// execStmt executes one statement by dispatching according to the current
Expand Down Expand Up @@ -179,6 +180,21 @@ func (ex *connExecutor) execStmtInOpenState(
}
}()

shouldCollectInfo, recordStmtInfoFn := ex.stmtInfoRegistry.shouldCollectDiagnostics(ctx, stmt.AST)
if shouldCollectInfo {
tr := ex.server.cfg.AmbientCtx.Tracer
origCtx := ctx
var sp opentracing.Span
ctx, sp = tracing.StartSnowballTrace(ctx, tr, "traced statement")
defer func() {
// Record the statement information that we've collected.
// Note that in case of implicit transactions, the trace contains the auto-commit too.
sp.Finish()
trace := tracing.GetRecording(sp)
recordStmtInfoFn(origCtx, trace)
}()
}

if ex.sessionData.StmtTimeout > 0 {
timeoutTicker = time.AfterFunc(
ex.sessionData.StmtTimeout-timeutil.Since(ex.phaseTimes[sessionQueryReceived]),
Expand Down Expand Up @@ -1146,6 +1162,7 @@ func (ex *connExecutor) handleAutoCommit(
) (fsm.Event, fsm.EventPayload) {
txn := ex.state.mu.txn
if txn.IsCommitted() {
log.Event(ctx, "statement execution committed the txn")
return eventTxnFinish{}, eventTxnFinishPayload{commit: true}
}

Expand Down
5 changes: 3 additions & 2 deletions pkg/sql/conn_executor_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,9 @@ func startConnExecutor(
dummyLivenessProvider{}, /* liveness */
nil, /* nodeDialer */
),
QueryCache: querycache.New(0),
TestingKnobs: ExecutorTestingKnobs{},
QueryCache: querycache.New(0),
TestingKnobs: ExecutorTestingKnobs{},
stmtInfoRequestRegistry: newStmtDiagnosticsRequestRegistry(nil, nil, nil, 0),
}
pool := mon.MakeUnlimitedMonitor(
context.Background(), "test", mon.MemoryResource,
Expand Down
19 changes: 16 additions & 3 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -569,11 +569,24 @@ type ExecutorConfig struct {

// ProtectedTimestampProvider encapsulates the protected timestamp subsystem.
ProtectedTimestampProvider protectedts.Provider

stmtInfoRequestRegistry *stmtDiagnosticsRequestRegistry
}

// Organization returns the value of cluster.organization.
func (ec *ExecutorConfig) Organization() string {
return ClusterOrganization.Get(&ec.Settings.SV)
func (cfg *ExecutorConfig) Organization() string {
return ClusterOrganization.Get(&cfg.Settings.SV)
}

// NewStmtDiagnosticsRequestRegistry initializes cfg.stmtInfoRequestRegistry and
// returns it as the publicly-accessible StmtDiagnosticsRequester.
func (cfg *ExecutorConfig) NewStmtDiagnosticsRequestRegistry() StmtDiagnosticsRequester {
if cfg.InternalExecutor == nil {
panic("cfg.InternalExecutor not initialized")
}
cfg.stmtInfoRequestRegistry = newStmtDiagnosticsRequestRegistry(
cfg.InternalExecutor, cfg.DB, cfg.Gossip, cfg.NodeID.Get())
return cfg.stmtInfoRequestRegistry
}

var _ base.ModuleTestingKnobs = &ExecutorTestingKnobs{}
Expand Down Expand Up @@ -1490,7 +1503,7 @@ func (st *SessionTracing) TraceExecEnd(ctx context.Context, err error, count int

// extractMsgFromRecord extracts the message of the event, which is either in an
// "event" or "error" field.
func extractMsgFromRecord(rec tracing.RecordedSpan_LogRecord) string {
func extractMsgFromRecord(rec tracing.LogRecord) string {
for _, f := range rec.Fields {
key := f.Key
if key == "event" {
Expand Down
Loading

0 comments on commit 10d989f

Please sign in to comment.