From 0f2cc719cdc6e57cbcf4e6de3ff42e67a74b03de Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 17 Feb 2021 12:45:47 +0100 Subject: [PATCH 01/15] sql: work around contention registry errors See https://github.com/cockroachdb/cockroach/issues/60669. Release note: None --- pkg/sql/distsql_running.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/sql/distsql_running.go b/pkg/sql/distsql_running.go index 119119cb7ba0..854ca435563b 100644 --- a/pkg/sql/distsql_running.go +++ b/pkg/sql/distsql_running.go @@ -670,7 +670,9 @@ func (r *DistSQLReceiver) Push( r.contendedQueryMetric = nil } if err := r.contentionRegistry.AddContentionEvent(ev); err != nil { - r.resultWriter.SetError(errors.Wrap(err, "unable to add contention event to registry")) + // TODO(asubiotto): see https://github.com/cockroachdb/cockroach/issues/60669 + // r.resultWriter.SetError(errors.Wrap(err, "unable to add contention event to registry")) + _ = err } }) } From cfcc36cb474c52b1c0429827d5d548a22ed2c729 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 15 Feb 2021 12:42:15 +0100 Subject: [PATCH 02/15] tracing: fix accidental noop in ImportRemoteSpans There was vestigial code that would drop a received recording on the floor if the receiving Span wasn't verbose. We now want to ingest Recordings independently of that setting. Release note: None --- pkg/util/tracing/span.go | 3 --- pkg/util/tracing/span_test.go | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pkg/util/tracing/span.go b/pkg/util/tracing/span.go index 1ef06781e234..a6a31afae64d 100644 --- a/pkg/util/tracing/span.go +++ b/pkg/util/tracing/span.go @@ -156,9 +156,6 @@ func (s *Span) GetRecording() Recording { // these spans will be part of the result of GetRecording. Used to import // recorded traces from other nodes. func (s *Span) ImportRemoteSpans(remoteSpans []tracingpb.RecordedSpan) error { - if s.tracer.mode() == modeLegacy && s.crdb.recordingType() == RecordingOff { - return nil - } return s.crdb.importRemoteSpans(remoteSpans) } diff --git a/pkg/util/tracing/span_test.go b/pkg/util/tracing/span_test.go index 0866092f8f24..c0ab81007351 100644 --- a/pkg/util/tracing/span_test.go +++ b/pkg/util/tracing/span_test.go @@ -183,6 +183,26 @@ Span grandchild: require.Equal(t, exp, recToStrippedString(childRec)) } +func TestSpan_ImportRemoteSpans(t *testing.T) { + // Verify that GetRecording propagates the recording even when the + // receiving Span isn't verbose. + tr := NewTracer() + sp := tr.StartSpan("root", WithForceRealSpan()) + ch := tr.StartSpan("child", WithParentAndManualCollection(sp.Meta())) + ch.SetVerbose(true) + ch.Record("foo") + ch.SetVerbose(false) + ch.Finish() + require.NoError(t, sp.ImportRemoteSpans(ch.GetRecording())) + sp.Finish() + + require.NoError(t, TestingCheckRecordedSpans(sp.GetRecording(), ` +Span root: +Span child: + event: foo +`)) +} + func TestSpanRecordStructured(t *testing.T) { tr := NewTracer() tr._mode = int32(modeBackground) From 34d19cbb1753750ef010bddc47d2940a7e4d278c Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 17 Feb 2021 17:11:59 +0100 Subject: [PATCH 03/15] kvserver: disable a leaking trace span See https://github.com/cockroachdb/cockroach/issues/60677. The leak only becomes evident in the follow-up commit. Release note: None --- pkg/kv/kvserver/replica_raft.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 09d5d86bb504..ca0b62679e25 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -129,6 +129,13 @@ func (r *Replica) evalAndPropose( // Fork the proposal's context span so that the proposal's context // can outlive the original proposer's context. proposal.ctx, proposal.sp = tracing.ForkSpan(ctx, "async consensus") + { + // This span sometimes leaks. Disable it for the time being. + // + // Tracked in: https://github.com/cockroachdb/cockroach/issues/60677 + proposal.sp.Finish() + proposal.sp = nil + } // Signal the proposal's response channel immediately. reply := *proposal.Local.Reply From cd7f204762c6233bb4246912fee038324aef102c Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Feb 2021 10:19:31 +0100 Subject: [PATCH 04/15] tracing: propagate non-recording spans across rpc boundaries We were previously propagating RPCs across boundaries only when they were verbose. We now propagate any span (except the noop span) regardless of current verbosity. This ensures that SQL (which always creates real, non-verbose spans by default) can trace the entirety of its operations. This enables the background collection of contention metadata in 21.1 and more generally, the idea of always-on tracing. Note that we had previously considered always-on tracing to mean avoiding the noop span altogether; due to memory retention issues[1][2] encountered we are taking a step back from that idea for the 21.1 release. Somehow creating additional trace spans seems to have fouled up some tracing-based tests. I can't boil that ocean here, so I filed a follow-up issue: #60672 [1]: https://github.com/cockroachdb/cockroach/pull/59370 [2]: https://github.com/cockroachdb/cockroach/pull/59431 Release note: None --- .../testdata/autocommit_nonmetamorphic | 28 ++++++++ pkg/util/tracing/crdbspan.go | 11 +--- pkg/util/tracing/grpc_interceptor.go | 2 +- pkg/util/tracing/span.go | 7 -- pkg/util/tracing/span_test.go | 1 - pkg/util/tracing/tracer.go | 16 ++--- pkg/util/tracing/tracer_test.go | 64 ++++++++++++++++--- 7 files changed, 93 insertions(+), 36 deletions(-) diff --git a/pkg/sql/opt/exec/execbuilder/testdata/autocommit_nonmetamorphic b/pkg/sql/opt/exec/execbuilder/testdata/autocommit_nonmetamorphic index 7191149f250a..0a016b360620 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/autocommit_nonmetamorphic +++ b/pkg/sql/opt/exec/execbuilder/testdata/autocommit_nonmetamorphic @@ -110,6 +110,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 CPut, 1 EndTxn to (n1,s1):1 # TODO(radu): allow non-side-effecting projections. @@ -133,6 +134,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 2 CPut to (n1,s1):1 +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Insert with RETURNING statement with side-effects should not auto-commit. @@ -157,6 +159,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 2 CPut to (n1,s1):1 +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Another way to test the scenario above: generate an error and ensure that the @@ -192,6 +195,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 Put, 1 EndTxn to (n1,s1):1 # Multi-row upsert should auto-commit. @@ -213,6 +217,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1 # No auto-commit inside a transaction. @@ -261,6 +266,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1 # TODO(radu): allow non-side-effecting projections. @@ -284,6 +290,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Upsert with RETURNING statement with side-effects should not auto-commit. @@ -308,6 +315,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Another way to test the scenario above: generate an error and ensure that the @@ -343,6 +351,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1 @@ -394,6 +403,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' ---- dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Put, 1 EndTxn to (n1,s1):1 # TODO(radu): allow non-side-effecting projections. @@ -418,6 +429,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' ---- dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Update with RETURNING statement with side-effects should not auto-commit. @@ -443,6 +456,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' ---- dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Put to (n1,s1):1 +dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 # Another way to test the scenario above: generate an error and ensure that the @@ -478,6 +493,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 DelRng, 1 EndTxn to (n1,s1):1 # Multi-row delete should auto-commit. @@ -499,6 +516,8 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 1 Scan to (n1,s1):1 +dist sender send r36: sending batch 2 Put to (n1,s1):1 dist sender send r36: sending batch 1 DelRng, 1 EndTxn to (n1,s1):1 # No auto-commit inside a transaction. @@ -547,6 +566,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%PushTxn%' AND message NOT LIKE '%QueryTxn%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Del, 1 EndTxn to (n1,s1):1 @@ -570,6 +590,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Del to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 @@ -595,6 +616,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 2 Del to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 @@ -644,6 +666,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 Scan to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 @@ -667,6 +690,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 1 Put to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 @@ -692,6 +716,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 1 Del to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 @@ -720,6 +745,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND operation NOT LIKE '%async%' ---- dist sender send r36: sending batch 1 DelRng to (n1,s1):1 +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 1 Scan to (n1,s1):1 dist sender send r36: sending batch 1 Del, 1 EndTxn to (n1,s1):1 @@ -749,6 +775,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 @@ -774,6 +801,7 @@ WHERE message LIKE '%r$rangeid: sending batch%' AND message NOT LIKE '%QueryTxn%' AND operation NOT LIKE '%async%' ---- +dist sender send r36: sending batch 1 DelRng to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 2 CPut to (n1,s1):1 dist sender send r36: sending batch 1 EndTxn to (n1,s1):1 diff --git a/pkg/util/tracing/crdbspan.go b/pkg/util/tracing/crdbspan.go index 9be388dc021b..7fcd5aa4cf19 100644 --- a/pkg/util/tracing/crdbspan.go +++ b/pkg/util/tracing/crdbspan.go @@ -137,16 +137,9 @@ func (s *crdbSpan) disableRecording() { func (s *crdbSpan) getRecording(m mode) Recording { if s == nil { - return nil - } - if m == modeLegacy && s.recordingType() == RecordingOff { - // In legacy tracing (pre always-on), we avoid allocations when the - // Span is not actively recording. - // - // TODO(tbg): we could consider doing the same when background tracing - // is on but the current span contains "nothing of interest". - return nil + return nil // noop span } + s.mu.Lock() // The capacity here is approximate since we don't know how many grandchildren // there are. diff --git a/pkg/util/tracing/grpc_interceptor.go b/pkg/util/tracing/grpc_interceptor.go index e25393b59e74..3164a12e0ff5 100644 --- a/pkg/util/tracing/grpc_interceptor.go +++ b/pkg/util/tracing/grpc_interceptor.go @@ -73,7 +73,7 @@ func spanInclusionFuncForServer(t *Tracer, spanMeta *SpanMeta) bool { // configured to always trace, return true. The second part is particularly // useful for calls coming through the HTTP->RPC gateway (i.e. the AdminUI), // where client is never tracing. - return !spanMeta.isNilOrNoop() || t.AlwaysTrace() + return spanMeta != nil || t.AlwaysTrace() } // setSpanTags sets one or more tags on the given span according to the diff --git a/pkg/util/tracing/span.go b/pkg/util/tracing/span.go index a6a31afae64d..aa14bf8d252a 100644 --- a/pkg/util/tracing/span.go +++ b/pkg/util/tracing/span.go @@ -171,13 +171,6 @@ func (s *Span) IsBlackHole() bool { return s.crdb.recordingType() == RecordingOff && s.netTr == nil && s.ot == (otSpan{}) } -// isNilOrNoop returns true if the Span context is either nil -// or corresponds to a "no-op" Span. If this is true, any Span -// derived from this context will be a "black hole Span". -func (sm *SpanMeta) isNilOrNoop() bool { - return sm == nil || (sm.recordingType == RecordingOff && sm.shadowTracerType == "") -} - // SpanStats are stats that can be added to a Span. type SpanStats interface { protoutil.Message diff --git a/pkg/util/tracing/span_test.go b/pkg/util/tracing/span_test.go index c0ab81007351..bd7fea5c03cb 100644 --- a/pkg/util/tracing/span_test.go +++ b/pkg/util/tracing/span_test.go @@ -205,7 +205,6 @@ Span child: func TestSpanRecordStructured(t *testing.T) { tr := NewTracer() - tr._mode = int32(modeBackground) sp := tr.StartSpan("root", WithForceRealSpan()) defer sp.Finish() diff --git a/pkg/util/tracing/tracer.go b/pkg/util/tracing/tracer.go index e0e6ec4238a1..14ca50794d14 100644 --- a/pkg/util/tracing/tracer.go +++ b/pkg/util/tracing/tracer.go @@ -293,14 +293,12 @@ func (t *Tracer) startSpanGeneric( opts.LogTags = logtags.FromContext(ctx) } - // Avoid creating a real span when possible. If tracing is globally - // enabled, we always need to create spans. If the incoming - // span is recording (which implies that there is a parent) then - // we also have to create a real child. Additionally, if the - // caller explicitly asked for a real span they need to get one. - // In all other cases, a noop span will do. + // Avoid creating a real span when possible. If tracing is globally enabled, + // we always need to create spans. If the incoming span has a parent, then we + // also create a real child. Additionally, if the caller explicitly asked for + // a real span they need to get one. In all other cases, a noop span will do. if !t.AlwaysTrace() && - opts.recordingType() == RecordingOff && + opts.parentTraceID() == 0 && !opts.ForceRealSpan { return maybeWrapCtx(ctx, nil /* octx */, t.noopSpan) } @@ -532,7 +530,7 @@ func (fn textMapWriterFn) Set(key, val string) { // Carrier. This, alongside ExtractMetaFrom, can be used to carry span metadata // across process boundaries. See serializationFormat for more details. func (t *Tracer) InjectMetaInto(sm *SpanMeta, carrier Carrier) error { - if sm.isNilOrNoop() { + if sm == nil { // Fast path when tracing is disabled. ExtractMetaFrom will accept an // empty map as a noop context. return nil @@ -575,7 +573,7 @@ func (t *Tracer) InjectMetaInto(sm *SpanMeta, carrier Carrier) error { return nil } -var noopSpanMeta = &SpanMeta{} +var noopSpanMeta = (*SpanMeta)(nil) // ExtractMetaFrom is used to deserialize a span metadata (if any) from the // given Carrier. This, alongside InjectMetaFrom, can be used to carry span diff --git a/pkg/util/tracing/tracer_test.go b/pkg/util/tracing/tracer_test.go index 8bc59728aeb2..c0e319c581f8 100644 --- a/pkg/util/tracing/tracer_test.go +++ b/pkg/util/tracing/tracer_test.go @@ -46,6 +46,9 @@ func TestTracerRecording(t *testing.T) { } noop1.Record("hello") + // Noop span returns empty recording. + require.Equal(t, Recording(nil), noop1.GetRecording()) + noop2 := tr.StartSpan("noop2", WithParentAndManualCollection(noop1.Meta())) if !noop2.isNoop() { t.Error("expected noop child Span") @@ -61,12 +64,20 @@ func TestTracerRecording(t *testing.T) { t.Error("WithForceRealSpan Span should be black hole") } - // Unless recording is actually started, child spans are still noop. - noop3 := tr.StartSpan("noop3", WithParentAndManualCollection(s1.Meta())) - if !noop3.isNoop() { - t.Error("expected noop child Span") + // Initial recording of this fresh (real) span. + if err := TestingCheckRecordedSpans(s1.GetRecording(), ` + Span a: + tags: _unfinished=1 + `); err != nil { + t.Fatal(err) } - noop3.Finish() + + // Real parent --> real child. + real3 := tr.StartSpan("noop3", WithParentAndManualCollection(s1.Meta())) + if real3.isNoop() { + t.Error("expected real child Span") + } + real3.Finish() s1.Recordf("x=%d", 1) s1.SetVerbose(true) @@ -116,7 +127,7 @@ func TestTracerRecording(t *testing.T) { t.Fatal(err) } s3.Finish() - if err := TestingCheckRecordedSpans(s1.GetRecording(), ` + expS1v := ` Span a: tags: _unfinished=1 _verbose=1 event: x=2 @@ -126,12 +137,26 @@ func TestTracerRecording(t *testing.T) { Span c: tags: _verbose=1 tag=val event: x=4 - `); err != nil { + ` + if err := TestingCheckRecordedSpans(s1.GetRecording(), expS1v); err != nil { t.Fatal(err) } + // When we turn off verbosity and add more verbose logs, they get + // dropped, i.e. the recording is unchanged mod the _verbose flag. + expS1n := ` + Span a: + tags: _unfinished=1 + event: x=2 + Span b: + tags: _verbose=1 + event: x=3 + Span c: + tags: _verbose=1 tag=val + event: x=4 + ` s1.SetVerbose(false) s1.Recordf("x=%d", 100) - if err := TestingCheckRecordedSpans(s1.GetRecording(), ``); err != nil { + if err := TestingCheckRecordedSpans(s1.GetRecording(), expS1n); err != nil { t.Fatal(err) } @@ -223,7 +248,7 @@ func TestTracerInjectExtract(t *testing.T) { if err != nil { t.Fatal(err) } - if !wireSpanMeta.isNilOrNoop() { + if wireSpanMeta != nil { t.Errorf("expected noop context: %v", wireSpanMeta) } noop2 := tr2.StartSpan("remote op", WithParentAndManualCollection(wireSpanMeta)) @@ -292,6 +317,27 @@ func TestTracerInjectExtract(t *testing.T) { } } +func TestTracer_PropagateNonRecordingRealSpanAcrossRPCBoundaries(t *testing.T) { + // Verify that when a span is put on the wire on one end, and is checked + // against the span inclusion functions both on the client and server, a real + // span results in a real span. + tr1 := NewTracer() + sp1 := tr1.StartSpan("tr1.root", WithForceRealSpan()) + defer sp1.Finish() + carrier := metadataCarrier{MD: metadata.MD{}} + require.True(t, spanInclusionFuncForClient(sp1)) + require.NoError(t, tr1.InjectMetaInto(sp1.Meta(), carrier)) + require.Equal(t, 2, carrier.Len(), "%+v", carrier) // trace id and span id + + tr2 := NewTracer() + meta, err := tr2.ExtractMetaFrom(carrier) + require.NoError(t, err) + require.True(t, spanInclusionFuncForServer(tr2, meta)) + sp2 := tr2.StartSpan("tr2.child", WithParentAndManualCollection(meta)) + defer sp2.Finish() + require.NotZero(t, sp2.crdb.spanID) +} + func TestLightstepContext(t *testing.T) { tr := NewTracer() lsTr := lightstep.NewTracer(lightstep.Options{ From ac7b6da346fbf75a90dd933a8c930f0d998e5845 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Feb 2021 10:20:18 +0100 Subject: [PATCH 05/15] logictest: improve contention_event Switch to the 5node family of clusters to test against (more realistic), and also make the query that collects the event touch multiple ranges, which make it "easier" to miss an event. cc @irfansharif - this could help with the discussion about DistSender in [1]. [1]: https://github.com/cockroachdb/cockroach/pull/59815 Release note: None --- .../testdata/logic_test/contention_event | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/pkg/sql/logictest/testdata/logic_test/contention_event b/pkg/sql/logictest/testdata/logic_test/contention_event index 9b46cec3953f..6141044540b9 100644 --- a/pkg/sql/logictest/testdata/logic_test/contention_event +++ b/pkg/sql/logictest/testdata/logic_test/contention_event @@ -1,3 +1,5 @@ +# LogicTest: 5node +# # Verify that ContentionEvents are emitted. This is mostly a sanity check - look # for the datadriven tests in `pkg/kv/kvserver/concurrency` for the actual events # that do get emitted in various contention scenarios. @@ -6,7 +8,8 @@ statement ok GRANT ADMIN TO testuser statement ok -CREATE TABLE kv (k VARCHAR PRIMARY KEY, v VARCHAR) +CREATE TABLE kv (k VARCHAR PRIMARY KEY, v VARCHAR); +ALTER TABLE kv SPLIT AT VALUES ('b'), ('d'), ('q'), ('z'); query TT @@ -26,22 +29,30 @@ user root statement ok SET tracing=on +# Scan all ranges of the table (note that we have intentionally +# split it into at least six ranges). This is better than a point +# lookup in that it gives tracing more of a chance to get things +# wrong due to DistSender parallelism. statement ok SET TRACING=on; BEGIN; SET TRANSACTION PRIORITY HIGH; -INSERT INTO kv VALUES('k', 'my v'); +SELECT * FROM kv; COMMIT; SET TRACING=off; # Check for the message emitted by (roachpb.ContentionEvent).String() as proof that a ContentionEvent # payload was added to the trace. # +# NB: it looks like there can be multiple (probably when there's an internal restart +# or something like that). Unfortunately, it is difficult in logic tests to log something +# that is not part of the scripted output. +# # NB: if the matcher here ever gets updated, make sure to update throughout this file. -query I -SELECT count(message) FROM [ SHOW TRACE FOR SESSION ] WHERE message LIKE '%conflicted with % on % for %' +query B +SELECT count(message) > 0 FROM [ SHOW TRACE FOR SESSION ] WHERE message LIKE '%conflicted with % on % for %' ---- -1 +true # Clean up. user testuser From 8b1b00ecb2b70ae5353c0dc7283911a7c2954a0f Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 15 Feb 2021 14:18:00 +0100 Subject: [PATCH 06/15] sql: small but essential tracing fixes 1. Require admin role for `crdb_internal.node_inflight_trace_spans` 2. Add num_structured column to that table (this can be seen as temporary for testing; we can remove it once #55733 is fully addressed and payloads can be introspected separately). 3. Add workaround for failure to finish sql txn spans (#59315) for use in logic tests, which together with 2.) and 4.) enables a better `contention_event` test. 4. Add `crdb_internal.trace_id` builtin. Release note: None --- docs/generated/sql/functions.md | 2 + pkg/sql/conn_executor.go | 7 +-- pkg/sql/crdb_internal.go | 18 +++++++ pkg/sql/exec_util.go | 6 +++ pkg/sql/logictest/logic.go | 11 ++++- .../testdata/logic_test/contention_event | 48 +++++++------------ .../testdata/logic_test/crdb_internal | 7 ++- .../testdata/logic_test/crdb_internal_tenant | 4 +- pkg/sql/sem/builtins/BUILD.bazel | 1 + pkg/sql/sem/builtins/builtins.go | 20 ++++++++ pkg/sql/txn_state.go | 13 +++-- 11 files changed, 94 insertions(+), 43 deletions(-) diff --git a/docs/generated/sql/functions.md b/docs/generated/sql/functions.md index 93497b21bd88..fd830a090539 100644 --- a/docs/generated/sql/functions.md +++ b/docs/generated/sql/functions.md @@ -2648,6 +2648,8 @@ SELECT * FROM crdb_internal.check_consistency(true, ‘\x02’, ‘\x04’)

crdb_internal.set_vmodule(vmodule_string: string) → int

Set the equivalent of the --vmodule flag on the gateway node processing this request; it affords control over the logging verbosity of different files. Example syntax: crdb_internal.set_vmodule('recordio=2,file=1,gfs*=3'). Reset with: crdb_internal.set_vmodule(''). Raising the verbosity can severely affect performance.

+crdb_internal.trace_id() → int

Returns the current trace ID or an error if no trace is open.

+
current_database() → string

Returns the current database.

current_schema() → string

Returns the current schema.

diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index b8ae839749dc..85c50e07cebc 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -592,9 +592,10 @@ func (s *Server) newConnExecutor( nodeIDOrZero: nodeIDOrZero, clock: s.cfg.Clock, // Future transaction's monitors will inherits from sessionRootMon. - connMon: sessionRootMon, - tracer: s.cfg.AmbientCtx.Tracer, - settings: s.cfg.Settings, + connMon: sessionRootMon, + tracer: s.cfg.AmbientCtx.Tracer, + settings: s.cfg.Settings, + execTestingKnobs: s.GetExecutorConfig().TestingKnobs, }, memMetrics: memMetrics, planner: planner{execCfg: s.cfg, alloc: &rowenc.DatumAlloc{}}, diff --git a/pkg/sql/crdb_internal.go b/pkg/sql/crdb_internal.go index a55f39acf1fe..040584710734 100644 --- a/pkg/sql/crdb_internal.go +++ b/pkg/sql/crdb_internal.go @@ -66,6 +66,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tracing" "github.com/cockroachdb/errors" + ptypes "github.com/gogo/protobuf/types" ) // CrdbInternalName is the name of the crdb_internal schema. @@ -1148,9 +1149,18 @@ CREATE TABLE crdb_internal.node_inflight_trace_spans ( duration INTERVAL, -- The span's duration, measured from start to Finish(). -- A span whose recording is collected before it's finished will -- have the duration set as the "time of collection - start time". + num_payloads INT NOT NULL, -- The number of structured payloads in this span. operation STRING NULL -- The span's operation. )`, populate: func(ctx context.Context, p *planner, _ *dbdesc.Immutable, addRow func(...tree.Datum) error) error { + hasAdmin, err := p.HasAdminRole(ctx) + if err != nil { + return err + } + if !hasAdmin { + return pgerror.Newf(pgcode.InsufficientPrivilege, + "only users with the admin role are allowed to read crdb_internal.node_inflight_trace_spans") + } return p.ExecCfg().Settings.Tracer.VisitSpans(func(span *tracing.Span) error { for _, rec := range span.GetRecording() { traceID := rec.TraceID @@ -1167,7 +1177,14 @@ CREATE TABLE crdb_internal.node_inflight_trace_spans ( spanDuration := rec.Duration operation := rec.Operation + var numStructured int + rec.Structured(func(any *ptypes.Any) { + numStructured++ + }) + if err := addRow( + // TODO(angelapwen): we're casting uint64s to int64 here, + // is that ok? tree.NewDInt(tree.DInt(traceID)), tree.NewDInt(tree.DInt(parentSpanID)), tree.NewDInt(tree.DInt(spanID)), @@ -1178,6 +1195,7 @@ CREATE TABLE crdb_internal.node_inflight_trace_spans ( duration.MakeDuration(spanDuration.Nanoseconds(), 0, 0), types.DefaultIntervalTypeMetadata, ), + tree.NewDInt(tree.DInt(numStructured)), tree.NewDString(operation), ); err != nil { return err diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index 0dfb85233cfa..ccbc30b4b0bb 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -921,6 +921,12 @@ type ExecutorTestingKnobs struct { // DeterministicExplainAnalyze, if set, will result in overriding fields in // EXPLAIN ANALYZE (PLAN) that can vary between runs (like elapsed times). DeterministicExplainAnalyze bool + + // Pretend59315IsFixed pretends that this issue is fixed: + // https://github.com/cockroachdb/cockroach/issues/59315 + // which means that we don't need the WithBypassRegistry option + // in resetForNewSQLTxn. + Pretend59315IsFixed bool } // PGWireTestingKnobs contains knobs for the pgwire module. diff --git a/pkg/sql/logictest/logic.go b/pkg/sql/logictest/logic.go index 4a0ef22f08fe..c4f175876e28 100644 --- a/pkg/sql/logictest/logic.go +++ b/pkg/sql/logictest/logic.go @@ -465,7 +465,8 @@ type testClusterConfig struct { isCCLConfig bool // localities is set if nodes should be set to a particular locality. // Nodes are 1-indexed. - localities map[int]roachpb.Locality + localities map[int]roachpb.Locality + pretend59315IsFixed bool } const threeNodeTenantConfigName = "3node-tenant" @@ -562,6 +563,13 @@ var logicTestConfigs = []testClusterConfig{ overrideDistSQLMode: "on", overrideAutoStats: "false", }, + { + name: "5node-pretend59315IsFixed", + numNodes: 5, + overrideDistSQLMode: "on", + overrideAutoStats: "false", + pretend59315IsFixed: true, + }, { name: "5node-metadata", numNodes: 5, @@ -1332,6 +1340,7 @@ func (t *logicTest) newCluster(serverArgs TestServerArgs) { }, SQLExecutor: &sql.ExecutorTestingKnobs{ DeterministicExplainAnalyze: true, + Pretend59315IsFixed: t.cfg.pretend59315IsFixed, }, }, ClusterName: "testclustername", diff --git a/pkg/sql/logictest/testdata/logic_test/contention_event b/pkg/sql/logictest/testdata/logic_test/contention_event index 6141044540b9..ede473134a10 100644 --- a/pkg/sql/logictest/testdata/logic_test/contention_event +++ b/pkg/sql/logictest/testdata/logic_test/contention_event @@ -1,4 +1,4 @@ -# LogicTest: 5node +# LogicTest: 5node-pretend59315IsFixed # # Verify that ContentionEvents are emitted. This is mostly a sanity check - look # for the datadriven tests in `pkg/kv/kvserver/concurrency` for the actual events @@ -11,7 +11,6 @@ statement ok CREATE TABLE kv (k VARCHAR PRIMARY KEY, v VARCHAR); ALTER TABLE kv SPLIT AT VALUES ('b'), ('d'), ('q'), ('z'); - query TT SELECT * FROM kv ---- @@ -26,49 +25,34 @@ INSERT INTO kv VALUES('k', 'v') user root -statement ok -SET tracing=on - # Scan all ranges of the table (note that we have intentionally # split it into at least six ranges). This is better than a point # lookup in that it gives tracing more of a chance to get things # wrong due to DistSender parallelism. statement ok -SET TRACING=on; BEGIN; SET TRANSACTION PRIORITY HIGH; SELECT * FROM kv; -COMMIT; -SET TRACING=off; - -# Check for the message emitted by (roachpb.ContentionEvent).String() as proof that a ContentionEvent -# payload was added to the trace. -# -# NB: it looks like there can be multiple (probably when there's an internal restart -# or something like that). Unfortunately, it is difficult in logic tests to log something -# that is not part of the scripted output. -# -# NB: if the matcher here ever gets updated, make sure to update throughout this file. -query B -SELECT count(message) > 0 FROM [ SHOW TRACE FOR SESSION ] WHERE message LIKE '%conflicted with % on % for %' ----- -true -# Clean up. user testuser statement ok ROLLBACK -# Do a straightforward non-conflicting insert; we want to check that there's *no* ContentionEvent -# emitted for those. There's no reason to believe it would be - this is a pure sanity check. - -statement ok -SET TRACING=on; -INSERT INTO kv VALUES ('l', 'lll'); -SET TRACING=off; +user root -query I -SELECT count(message) FROM [ SHOW TRACE FOR SESSION ] WHERE message LIKE '%conflicted with % on % for %' +# Check that the number of payloads in the open trace is at least 1. +# TODO(angelapwen): when we have a way to pull particular payloads for +# a trace, we should verify that we're seeing the right contention event. +# As is, the payloads could be something else (though we verified manually +# and there is a contention event). +# +# NB: the contention event is not in our trace span but in one of its +# children, so it wouldn't be found if we filtered by the trace span ID. +# +# NB: this needs the 5node-pretend59315 config because otherwise the span is not +# tracked. +query B +SELECT count(num_payloads) > 0 FROM crdb_internal.node_inflight_trace_spans WHERE trace_id = crdb_internal.trace_id(); ---- -0 +true diff --git a/pkg/sql/logictest/testdata/logic_test/crdb_internal b/pkg/sql/logictest/testdata/logic_test/crdb_internal index 0aa5a3111c18..d6c36a2eba62 100644 --- a/pkg/sql/logictest/testdata/logic_test/crdb_internal +++ b/pkg/sql/logictest/testdata/logic_test/crdb_internal @@ -250,10 +250,10 @@ SELECT * FROM crdb_internal.zones WHERE false zone_id subzone_id target range_name database_name table_name index_name partition_name raw_config_yaml raw_config_sql raw_config_protobuf full_config_yaml full_config_sql -query IIIIBTTT colnames +query IIIIBTTIT colnames SELECT * FROM crdb_internal.node_inflight_trace_spans WHERE span_id < 0 ---- -trace_id parent_span_id span_id goroutine_id finished start_time duration operation +trace_id parent_span_id span_id goroutine_id finished start_time duration num_payloads operation query ITTTTITTTTTTTTTI colnames SELECT * FROM crdb_internal.ranges WHERE range_id < 0 @@ -477,6 +477,9 @@ select * from crdb_internal.kv_store_status query error pq: only users with the admin role are allowed to read crdb_internal.gossip_alerts select * from crdb_internal.gossip_alerts +query error pq: only users with the admin role are allowed to read crdb_internal.node_inflight_trace_spans +select * from crdb_internal.node_inflight_trace_spans + # Anyone can see the executable version. query T select regexp_replace(crdb_internal.node_executable_version()::string, '(-\d+)?$', ''); diff --git a/pkg/sql/logictest/testdata/logic_test/crdb_internal_tenant b/pkg/sql/logictest/testdata/logic_test/crdb_internal_tenant index 2d4b9d49e710..c54eda79e560 100644 --- a/pkg/sql/logictest/testdata/logic_test/crdb_internal_tenant +++ b/pkg/sql/logictest/testdata/logic_test/crdb_internal_tenant @@ -262,10 +262,10 @@ SELECT * FROM crdb_internal.zones WHERE false zone_id subzone_id target range_name database_name table_name index_name partition_name raw_config_yaml raw_config_sql raw_config_protobuf full_config_yaml full_config_sql -query IIIIBTTT colnames +query IIIIBTTIT colnames SELECT * FROM crdb_internal.node_inflight_trace_spans WHERE span_id < 0 ---- -trace_id parent_span_id span_id goroutine_id finished start_time duration operation +trace_id parent_span_id span_id goroutine_id finished start_time duration num_payloads operation statement error not fully contained in tenant keyspace SELECT * FROM crdb_internal.ranges WHERE range_id < 0 diff --git a/pkg/sql/sem/builtins/BUILD.bazel b/pkg/sql/sem/builtins/BUILD.bazel index 9c7b8bc3e8ef..b699dd2b53ba 100644 --- a/pkg/sql/sem/builtins/BUILD.bazel +++ b/pkg/sql/sem/builtins/BUILD.bazel @@ -79,6 +79,7 @@ go_library( "//pkg/util/timeofday", "//pkg/util/timetz", "//pkg/util/timeutil", + "//pkg/util/tracing", "//pkg/util/unaccent", "//pkg/util/uuid", "@com_github_cockroachdb_apd_v2//:apd", diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index dce6113a236a..31f371c294a0 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -69,6 +69,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeofday" "github.com/cockroachdb/cockroach/pkg/util/timetz" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/tracing" "github.com/cockroachdb/cockroach/pkg/util/unaccent" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" @@ -3578,6 +3579,25 @@ may increase either contention or retry errors, or both.`, }, ), + // Get the current trace ID. + "crdb_internal.trace_id": makeBuiltin( + tree.FunctionProperties{Category: categorySystemInfo}, + tree.Overload{ + Types: tree.ArgTypes{}, + ReturnType: tree.FixedReturnType(types.Int), + Fn: func(ctx *tree.EvalContext, args tree.Datums) (tree.Datum, error) { + sp := tracing.SpanFromContext(ctx.Context) + if sp == nil { + return tree.DNull, nil + } + return tree.NewDInt(tree.DInt(sp.GetRecording()[0].TraceID)), nil + }, + Info: "Returns the current trace ID or an error if no trace is open.", + // NB: possibly this is or could be made stable, but it's not worth it. + Volatility: tree.VolatilityVolatile, + }, + ), + "crdb_internal.locality_value": makeBuiltin( tree.FunctionProperties{Category: categorySystemInfo}, tree.Overload{ diff --git a/pkg/sql/txn_state.go b/pkg/sql/txn_state.go index d56e300e923e..9d0d867e463e 100644 --- a/pkg/sql/txn_state.go +++ b/pkg/sql/txn_state.go @@ -157,7 +157,13 @@ func (ts *txnState) resetForNewSQLTxn( // TODO(andrei): figure out how to close these spans on server shutdown? Ties // into a larger discussion about how to drain SQL and rollback open txns. opName := sqlTxnName - txnCtx, sp := createRootOrChildSpan(connCtx, opName, tranCtx.tracer, tracing.WithBypassRegistry()) + var traceOpts []tracing.SpanOption + if !tranCtx.execTestingKnobs.Pretend59315IsFixed { + // The surrounding conditional and this option can be removed once #59315 + // is addressed. + traceOpts = append(traceOpts, tracing.WithBypassRegistry()) + } + txnCtx, sp := createRootOrChildSpan(connCtx, opName, tranCtx.tracer, traceOpts...) if txnType == implicitTxn { sp.SetTag("implicit", "true") } @@ -384,8 +390,9 @@ type transitionCtx struct { tracer *tracing.Tracer // sessionTracing provides access to the session's tracing interface. The // state machine needs to see if session tracing is enabled. - sessionTracing *SessionTracing - settings *cluster.Settings + sessionTracing *SessionTracing + settings *cluster.Settings + execTestingKnobs ExecutorTestingKnobs } var noRewind = rewindCapability{} From 73bede1db2fda6996fcee615619346298c560447 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 15 Feb 2021 14:08:27 +0100 Subject: [PATCH 07/15] tracing: prevent sending unexpected recordings to v20.2 nodes Unfortunately, v20.2 fails when it is presented with a recording (from a remote node) that it did not expect to get created. Concretely, we can end up with a noopSpan having `ImportRemoteSpans` called on it, which is a fatal error (unhandled failed interface assertion). To avoid this particular problem and any others that may lurk around, simply never send recordings back that would not be created under v20.2, until we know there is no v20.2 node around any more. This is a straightforward introduction of our cluster version, with the exception that a little bit of dependency injection is necessary to avoid circular imports. Release note: None --- docs/generated/settings/settings.html | 2 +- pkg/clusterversion/cockroach_versions.go | 8 ++++++++ pkg/clusterversion/key_string.go | 5 +++-- pkg/settings/cluster/cluster_settings.go | 4 ++++ pkg/util/tracing/crdbspan.go | 19 +++++++++++++++++-- pkg/util/tracing/span.go | 2 +- pkg/util/tracing/tracer.go | 6 ++++++ pkg/util/tracing/tracer_test.go | 16 ++++++++++++++++ 8 files changed, 56 insertions(+), 6 deletions(-) diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 24c48ce4a678..87140ec213ab 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -101,6 +101,6 @@ trace.debug.enablebooleanfalseif set, traces for recent requests can be seen at https:///debug/requests trace.lightstep.tokenstringif set, traces go to Lightstep using this token trace.zipkin.collectorstringif set, traces go to the given Zipkin instance (example: '127.0.0.1:9411'); ignored if trace.lightstep.token is set -versionversion20.2-26set the active cluster version in the format '.' +versionversion20.2-28set the active cluster version in the format '.' diff --git a/pkg/clusterversion/cockroach_versions.go b/pkg/clusterversion/cockroach_versions.go index 91b6e0617480..7f5f56467ba7 100644 --- a/pkg/clusterversion/cockroach_versions.go +++ b/pkg/clusterversion/cockroach_versions.go @@ -237,6 +237,10 @@ const ( PostTruncatedAndRangeAppliedStateMigration // SeparatedIntents allows the writing of separated intents/locks. SeparatedIntents + // TracingVerbosityIndependentSemantics marks a change in which trace spans + // are propagated across RPC boundaries independently of their verbosity setting. + // This requires a version gate this violates implicit assumptions in v20.2. + TracingVerbosityIndependentSemantics // Step (1): Add new versions here. ) @@ -395,6 +399,10 @@ var versionsSingleton = keyedVersions([]keyedVersion{ Key: SeparatedIntents, Version: roachpb.Version{Major: 20, Minor: 2, Internal: 26}, }, + { + Key: TracingVerbosityIndependentSemantics, + Version: roachpb.Version{Major: 20, Minor: 2, Internal: 28}, + }, // Step (2): Add new versions here. }) diff --git a/pkg/clusterversion/key_string.go b/pkg/clusterversion/key_string.go index fd75c9e515ab..79613367595e 100644 --- a/pkg/clusterversion/key_string.go +++ b/pkg/clusterversion/key_string.go @@ -41,11 +41,12 @@ func _() { _ = x[TruncatedAndRangeAppliedStateMigration-30] _ = x[PostTruncatedAndRangeAppliedStateMigration-31] _ = x[SeparatedIntents-32] + _ = x[TracingVerbosityIndependentSemantics-33] } -const _Key_name = "NamespaceTableWithSchemasStart20_2GeospatialTypeEnumsRangefeedLeasesAlterColumnTypeGeneralAlterSystemJobsAddCreatedByColumnsAddScheduledJobsTableUserDefinedSchemasNoOriginFKIndexesNodeMembershipStatusMinPasswordLengthAbortSpanBytesAlterSystemJobsAddSqllivenessColumnsAddNewSystemSqllivenessTableMaterializedViewsBox2DTypeUpdateScheduledJobsSchemaCreateLoginPrivilegeHBAForNonTLSV20_2Start21_1EmptyArraysInInvertedIndexesUniqueWithoutIndexConstraintsVirtualComputedColumnsCPutInlineReplicaVersionsreplacedTruncatedAndRangeAppliedStateMigrationreplacedPostTruncatedAndRangeAppliedStateMigrationNewSchemaChangerLongRunningMigrationsTruncatedAndRangeAppliedStateMigrationPostTruncatedAndRangeAppliedStateMigrationSeparatedIntents" +const _Key_name = "NamespaceTableWithSchemasStart20_2GeospatialTypeEnumsRangefeedLeasesAlterColumnTypeGeneralAlterSystemJobsAddCreatedByColumnsAddScheduledJobsTableUserDefinedSchemasNoOriginFKIndexesNodeMembershipStatusMinPasswordLengthAbortSpanBytesAlterSystemJobsAddSqllivenessColumnsAddNewSystemSqllivenessTableMaterializedViewsBox2DTypeUpdateScheduledJobsSchemaCreateLoginPrivilegeHBAForNonTLSV20_2Start21_1EmptyArraysInInvertedIndexesUniqueWithoutIndexConstraintsVirtualComputedColumnsCPutInlineReplicaVersionsreplacedTruncatedAndRangeAppliedStateMigrationreplacedPostTruncatedAndRangeAppliedStateMigrationNewSchemaChangerLongRunningMigrationsTruncatedAndRangeAppliedStateMigrationPostTruncatedAndRangeAppliedStateMigrationSeparatedIntentsTracingVerbosityIndependentSemantics" -var _Key_index = [...]uint16{0, 25, 34, 48, 53, 68, 90, 124, 145, 163, 180, 200, 217, 231, 295, 312, 321, 346, 366, 378, 383, 392, 420, 449, 471, 481, 496, 542, 592, 608, 629, 667, 709, 725} +var _Key_index = [...]uint16{0, 25, 34, 48, 53, 68, 90, 124, 145, 163, 180, 200, 217, 231, 295, 312, 321, 346, 366, 378, 383, 392, 420, 449, 471, 481, 496, 542, 592, 608, 629, 667, 709, 725, 761} func (i Key) String() string { if i < 0 || i >= Key(len(_Key_index)-1) { diff --git a/pkg/settings/cluster/cluster_settings.go b/pkg/settings/cluster/cluster_settings.go index c1e96a86f948..7d6c461ba62a 100644 --- a/pkg/settings/cluster/cluster_settings.go +++ b/pkg/settings/cluster/cluster_settings.go @@ -121,6 +121,10 @@ func MakeClusterSettings() *Settings { sv.Init(s.Version) s.Tracer = tracing.NewTracer() + s.Tracer.TracingVerbosityIndependentSemanticsIsActive = func() bool { + return s.Version.IsActive(context.Background(), + clusterversion.TracingVerbosityIndependentSemantics) + } s.Tracer.Configure(sv) return s diff --git a/pkg/util/tracing/crdbspan.go b/pkg/util/tracing/crdbspan.go index 7fcd5aa4cf19..a2ced2b9b2dc 100644 --- a/pkg/util/tracing/crdbspan.go +++ b/pkg/util/tracing/crdbspan.go @@ -135,12 +135,27 @@ func (s *crdbSpan) disableRecording() { } } -func (s *crdbSpan) getRecording(m mode) Recording { +func (s *crdbSpan) getRecording(m mode, everyoneIsV211 bool) Recording { if s == nil { return nil // noop span } s.mu.Lock() + + if !everyoneIsV211 { + // The cluster may contain nodes that are running v20.2. Unfortunately that + // version can easily crash when a peer returns a recording that that node + // did not expect would get created. To circumvent this, retain the v20.2 + // behavior of eliding recordings when verbosity is off until we're sure + // that v20.2 is not around any longer. + // + // TODO(tbg): remove this in the v21.2 cycle. + if m == modeLegacy && s.recordingType() == RecordingOff { + s.mu.Unlock() + return nil + } + } + // The capacity here is approximate since we don't know how many grandchildren // there are. result := make(Recording, 0, 1+len(s.mu.recording.children)+len(s.mu.recording.remoteSpans)) @@ -151,7 +166,7 @@ func (s *crdbSpan) getRecording(m mode) Recording { s.mu.Unlock() for _, child := range children { - result = append(result, child.getRecording(m)...) + result = append(result, child.getRecording(m, everyoneIsV211)...) } // Sort the spans by StartTime, except the first Span (the root of this diff --git a/pkg/util/tracing/span.go b/pkg/util/tracing/span.go index aa14bf8d252a..9dc8358b1de2 100644 --- a/pkg/util/tracing/span.go +++ b/pkg/util/tracing/span.go @@ -149,7 +149,7 @@ func (s *Span) SetVerbose(to bool) { // enabled. This can be called while spans that are part of the recording are // still open; it can run concurrently with operations on those spans. func (s *Span) GetRecording() Recording { - return s.crdb.getRecording(s.tracer.mode()) + return s.crdb.getRecording(s.tracer.mode(), s.tracer.TracingVerbosityIndependentSemanticsIsActive()) } // ImportRemoteSpans adds RecordedSpan data to the recording of the given Span; diff --git a/pkg/util/tracing/tracer.go b/pkg/util/tracing/tracer.go index 14ca50794d14..b34ed6b35e64 100644 --- a/pkg/util/tracing/tracer.go +++ b/pkg/util/tracing/tracer.go @@ -167,6 +167,11 @@ type Tracer struct { syncutil.Mutex m map[uint64]*Span } + // TracingVerbosityIndependentSemanticsIsActive is really + // version.IsActive(TracingVerbosityIndependentSemanticsIsActive) + // but gets injected this way to avoid import cycles. It defaults + // to a function that returns `true`. + TracingVerbosityIndependentSemanticsIsActive func() bool includeAsyncSpansInRecordings bool // see TestingIncludeAsyncSpansInRecordings } @@ -177,6 +182,7 @@ type Tracer struct { func NewTracer() *Tracer { t := &Tracer{} t.activeSpans.m = make(map[uint64]*Span) + t.TracingVerbosityIndependentSemanticsIsActive = func() bool { return true } t.noopSpan = &Span{tracer: t} return t } diff --git a/pkg/util/tracing/tracer_test.go b/pkg/util/tracing/tracer_test.go index c0e319c581f8..e96efc584948 100644 --- a/pkg/util/tracing/tracer_test.go +++ b/pkg/util/tracing/tracer_test.go @@ -538,3 +538,19 @@ func TestSpanRecordingFinished(t *testing.T) { spanOpsWithFinished = getSpanOpsWithFinished(t, tr1) require.Len(t, spanOpsWithFinished, 0) } + +func TestTracer_TracingVerbosityIndependentSemanticsIsActive(t *testing.T) { + // Verify that GetRecording() returns nil for non-verbose spans if we're in + // mixed-version mode. + tr := NewTracer() + tr.TracingVerbosityIndependentSemanticsIsActive = func() bool { return false } + sp := tr.StartSpan("root", WithForceRealSpan()) + defer sp.Finish() + sp.SetVerbose(true) + sp.Record("foo") + require.NotNil(t, sp.GetRecording()) + sp.SetVerbose(false) + require.Nil(t, sp.GetRecording()) + tr.TracingVerbosityIndependentSemanticsIsActive = func() bool { return true } + require.NotNil(t, sp.GetRecording()) +} From 52e616ac7aec6349791d609e7136849158b33d16 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Tue, 16 Feb 2021 10:45:19 +0100 Subject: [PATCH 08/15] tracing: simplify noop span conditional Suggested by @knz. Release note: None --- pkg/util/tracing/tracer.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pkg/util/tracing/tracer.go b/pkg/util/tracing/tracer.go index b34ed6b35e64..3ed10b929521 100644 --- a/pkg/util/tracing/tracer.go +++ b/pkg/util/tracing/tracer.go @@ -299,13 +299,9 @@ func (t *Tracer) startSpanGeneric( opts.LogTags = logtags.FromContext(ctx) } - // Avoid creating a real span when possible. If tracing is globally enabled, - // we always need to create spans. If the incoming span has a parent, then we - // also create a real child. Additionally, if the caller explicitly asked for - // a real span they need to get one. In all other cases, a noop span will do. - if !t.AlwaysTrace() && - opts.parentTraceID() == 0 && - !opts.ForceRealSpan { + // Are we tracing everything, or have a parent, or want a real span? Then + // we create a real trace span. In all other cases, a noop span will do. + if !(t.AlwaysTrace() || opts.parentTraceID() != 0 || opts.ForceRealSpan) { return maybeWrapCtx(ctx, nil /* octx */, t.noopSpan) } From 712261f69f7f72b307c66ff33a754612b0a904ad Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Tue, 16 Feb 2021 12:03:24 +0100 Subject: [PATCH 09/15] tracing: add real tests for the grpc interceptors As was rightly pointed out by @knz during review in #59992, the tracing interceptors were only tested indirectly. In this commit, we add explicit tests that set up a gRPC server and verify that for all combinations of unary/streaming the correct trace spans get created. Release note: None --- pkg/testutils/grpcutils/BUILD.bazel | 28 ++ pkg/testutils/grpcutils/testserver.go | 47 ++++ pkg/testutils/grpcutils/testservice.pb.go | 312 ++++++++++++++++++++++ pkg/testutils/grpcutils/testservice.proto | 25 ++ pkg/testutils/lint/lint_test.go | 1 + pkg/util/tracing/BUILD.bazel | 8 + pkg/util/tracing/grpc_interceptor.go | 2 +- pkg/util/tracing/grpc_interceptor_test.go | 225 ++++++++++++++++ 8 files changed, 647 insertions(+), 1 deletion(-) create mode 100644 pkg/testutils/grpcutils/BUILD.bazel create mode 100644 pkg/testutils/grpcutils/testserver.go create mode 100644 pkg/testutils/grpcutils/testservice.pb.go create mode 100644 pkg/testutils/grpcutils/testservice.proto create mode 100644 pkg/util/tracing/grpc_interceptor_test.go diff --git a/pkg/testutils/grpcutils/BUILD.bazel b/pkg/testutils/grpcutils/BUILD.bazel new file mode 100644 index 000000000000..07469ebd0436 --- /dev/null +++ b/pkg/testutils/grpcutils/BUILD.bazel @@ -0,0 +1,28 @@ +load("@rules_proto//proto:defs.bzl", "proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") + +proto_library( + name = "grpcutils_proto", + srcs = ["testservice.proto"], + strip_import_prefix = "/pkg", + visibility = ["//visibility:public"], + deps = ["@com_google_protobuf//:any_proto"], +) + +go_proto_library( + name = "grpcutils_go_proto", + compilers = ["//pkg/cmd/protoc-gen-gogoroach:protoc-gen-gogoroach_grpc_compiler"], + importpath = "github.com/cockroachdb/cockroach/pkg/testutils/grpcutils", + proto = ":grpcutils_proto", + visibility = ["//visibility:public"], +) + +go_library( + name = "grpcutils", + srcs = ["testserver.go"], + embed = [":grpcutils_go_proto"], + importpath = "github.com/cockroachdb/cockroach/pkg/testutils/grpcutils", + visibility = ["//visibility:public"], + deps = ["@com_github_gogo_protobuf//types"], +) diff --git a/pkg/testutils/grpcutils/testserver.go b/pkg/testutils/grpcutils/testserver.go new file mode 100644 index 000000000000..6d7e411c3e5a --- /dev/null +++ b/pkg/testutils/grpcutils/testserver.go @@ -0,0 +1,47 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package grpcutils + +import ( + "context" + + "github.com/gogo/protobuf/types" +) + +// TestServerImpl backs the Test service. +type TestServerImpl struct { + UU func(context.Context, *types.Any) (*types.Any, error) // UnaryUnary + US func(*types.Any, GRPCTest_UnaryStreamServer) error // UnaryStream + SU func(server GRPCTest_StreamUnaryServer) error // StreamUnary + SS func(server GRPCTest_StreamStreamServer) error // StreamStream +} + +var _ GRPCTestServer = (*TestServerImpl)(nil) + +// UnaryUnary implements GRPCTestServer. +func (s *TestServerImpl) UnaryUnary(ctx context.Context, any *types.Any) (*types.Any, error) { + return s.UU(ctx, any) +} + +// UnaryStream implements GRPCTestServer. +func (s *TestServerImpl) UnaryStream(any *types.Any, server GRPCTest_UnaryStreamServer) error { + return s.US(any, server) +} + +// StreamUnary implements GRPCTestServer. +func (s *TestServerImpl) StreamUnary(server GRPCTest_StreamUnaryServer) error { + return s.SU(server) +} + +// StreamStream implements GRPCTestServer. +func (s *TestServerImpl) StreamStream(server GRPCTest_StreamStreamServer) error { + return s.SS(server) +} diff --git a/pkg/testutils/grpcutils/testservice.pb.go b/pkg/testutils/grpcutils/testservice.pb.go new file mode 100644 index 000000000000..5813f3a4f01e --- /dev/null +++ b/pkg/testutils/grpcutils/testservice.pb.go @@ -0,0 +1,312 @@ +// Code generated by protoc-gen-gogo. DO NOT EDIT. +// source: testutils/grpcutils/testservice.proto + +package grpcutils + +import proto "github.com/gogo/protobuf/proto" +import fmt "fmt" +import math "math" +import types "github.com/gogo/protobuf/types" + +import ( + context "context" + grpc "google.golang.org/grpc" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. +const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package + +// Reference imports to suppress errors if they are not otherwise used. +var _ context.Context +var _ grpc.ClientConn + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +const _ = grpc.SupportPackageIsVersion4 + +// GRPCTestClient is the client API for GRPCTest service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream. +type GRPCTestClient interface { + UnaryUnary(ctx context.Context, in *types.Any, opts ...grpc.CallOption) (*types.Any, error) + UnaryStream(ctx context.Context, in *types.Any, opts ...grpc.CallOption) (GRPCTest_UnaryStreamClient, error) + StreamUnary(ctx context.Context, opts ...grpc.CallOption) (GRPCTest_StreamUnaryClient, error) + StreamStream(ctx context.Context, opts ...grpc.CallOption) (GRPCTest_StreamStreamClient, error) +} + +type gRPCTestClient struct { + cc *grpc.ClientConn +} + +func NewGRPCTestClient(cc *grpc.ClientConn) GRPCTestClient { + return &gRPCTestClient{cc} +} + +func (c *gRPCTestClient) UnaryUnary(ctx context.Context, in *types.Any, opts ...grpc.CallOption) (*types.Any, error) { + out := new(types.Any) + err := c.cc.Invoke(ctx, "/cockroach.testutils.grpcutils.GRPCTest/UnaryUnary", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *gRPCTestClient) UnaryStream(ctx context.Context, in *types.Any, opts ...grpc.CallOption) (GRPCTest_UnaryStreamClient, error) { + stream, err := c.cc.NewStream(ctx, &_GRPCTest_serviceDesc.Streams[0], "/cockroach.testutils.grpcutils.GRPCTest/UnaryStream", opts...) + if err != nil { + return nil, err + } + x := &gRPCTestUnaryStreamClient{stream} + if err := x.ClientStream.SendMsg(in); err != nil { + return nil, err + } + if err := x.ClientStream.CloseSend(); err != nil { + return nil, err + } + return x, nil +} + +type GRPCTest_UnaryStreamClient interface { + Recv() (*types.Any, error) + grpc.ClientStream +} + +type gRPCTestUnaryStreamClient struct { + grpc.ClientStream +} + +func (x *gRPCTestUnaryStreamClient) Recv() (*types.Any, error) { + m := new(types.Any) + if err := x.ClientStream.RecvMsg(m); err != nil { + return nil, err + } + return m, nil +} + +func (c *gRPCTestClient) StreamUnary(ctx context.Context, opts ...grpc.CallOption) (GRPCTest_StreamUnaryClient, error) { + stream, err := c.cc.NewStream(ctx, &_GRPCTest_serviceDesc.Streams[1], "/cockroach.testutils.grpcutils.GRPCTest/StreamUnary", opts...) + if err != nil { + return nil, err + } + x := &gRPCTestStreamUnaryClient{stream} + return x, nil +} + +type GRPCTest_StreamUnaryClient interface { + Send(*types.Any) error + CloseAndRecv() (*types.Any, error) + grpc.ClientStream +} + +type gRPCTestStreamUnaryClient struct { + grpc.ClientStream +} + +func (x *gRPCTestStreamUnaryClient) Send(m *types.Any) error { + return x.ClientStream.SendMsg(m) +} + +func (x *gRPCTestStreamUnaryClient) CloseAndRecv() (*types.Any, error) { + if err := x.ClientStream.CloseSend(); err != nil { + return nil, err + } + m := new(types.Any) + if err := x.ClientStream.RecvMsg(m); err != nil { + return nil, err + } + return m, nil +} + +func (c *gRPCTestClient) StreamStream(ctx context.Context, opts ...grpc.CallOption) (GRPCTest_StreamStreamClient, error) { + stream, err := c.cc.NewStream(ctx, &_GRPCTest_serviceDesc.Streams[2], "/cockroach.testutils.grpcutils.GRPCTest/StreamStream", opts...) + if err != nil { + return nil, err + } + x := &gRPCTestStreamStreamClient{stream} + return x, nil +} + +type GRPCTest_StreamStreamClient interface { + Send(*types.Any) error + Recv() (*types.Any, error) + grpc.ClientStream +} + +type gRPCTestStreamStreamClient struct { + grpc.ClientStream +} + +func (x *gRPCTestStreamStreamClient) Send(m *types.Any) error { + return x.ClientStream.SendMsg(m) +} + +func (x *gRPCTestStreamStreamClient) Recv() (*types.Any, error) { + m := new(types.Any) + if err := x.ClientStream.RecvMsg(m); err != nil { + return nil, err + } + return m, nil +} + +// GRPCTestServer is the server API for GRPCTest service. +type GRPCTestServer interface { + UnaryUnary(context.Context, *types.Any) (*types.Any, error) + UnaryStream(*types.Any, GRPCTest_UnaryStreamServer) error + StreamUnary(GRPCTest_StreamUnaryServer) error + StreamStream(GRPCTest_StreamStreamServer) error +} + +func RegisterGRPCTestServer(s *grpc.Server, srv GRPCTestServer) { + s.RegisterService(&_GRPCTest_serviceDesc, srv) +} + +func _GRPCTest_UnaryUnary_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(types.Any) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(GRPCTestServer).UnaryUnary(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/cockroach.testutils.grpcutils.GRPCTest/UnaryUnary", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(GRPCTestServer).UnaryUnary(ctx, req.(*types.Any)) + } + return interceptor(ctx, in, info, handler) +} + +func _GRPCTest_UnaryStream_Handler(srv interface{}, stream grpc.ServerStream) error { + m := new(types.Any) + if err := stream.RecvMsg(m); err != nil { + return err + } + return srv.(GRPCTestServer).UnaryStream(m, &gRPCTestUnaryStreamServer{stream}) +} + +type GRPCTest_UnaryStreamServer interface { + Send(*types.Any) error + grpc.ServerStream +} + +type gRPCTestUnaryStreamServer struct { + grpc.ServerStream +} + +func (x *gRPCTestUnaryStreamServer) Send(m *types.Any) error { + return x.ServerStream.SendMsg(m) +} + +func _GRPCTest_StreamUnary_Handler(srv interface{}, stream grpc.ServerStream) error { + return srv.(GRPCTestServer).StreamUnary(&gRPCTestStreamUnaryServer{stream}) +} + +type GRPCTest_StreamUnaryServer interface { + SendAndClose(*types.Any) error + Recv() (*types.Any, error) + grpc.ServerStream +} + +type gRPCTestStreamUnaryServer struct { + grpc.ServerStream +} + +func (x *gRPCTestStreamUnaryServer) SendAndClose(m *types.Any) error { + return x.ServerStream.SendMsg(m) +} + +func (x *gRPCTestStreamUnaryServer) Recv() (*types.Any, error) { + m := new(types.Any) + if err := x.ServerStream.RecvMsg(m); err != nil { + return nil, err + } + return m, nil +} + +func _GRPCTest_StreamStream_Handler(srv interface{}, stream grpc.ServerStream) error { + return srv.(GRPCTestServer).StreamStream(&gRPCTestStreamStreamServer{stream}) +} + +type GRPCTest_StreamStreamServer interface { + Send(*types.Any) error + Recv() (*types.Any, error) + grpc.ServerStream +} + +type gRPCTestStreamStreamServer struct { + grpc.ServerStream +} + +func (x *gRPCTestStreamStreamServer) Send(m *types.Any) error { + return x.ServerStream.SendMsg(m) +} + +func (x *gRPCTestStreamStreamServer) Recv() (*types.Any, error) { + m := new(types.Any) + if err := x.ServerStream.RecvMsg(m); err != nil { + return nil, err + } + return m, nil +} + +var _GRPCTest_serviceDesc = grpc.ServiceDesc{ + ServiceName: "cockroach.testutils.grpcutils.GRPCTest", + HandlerType: (*GRPCTestServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "UnaryUnary", + Handler: _GRPCTest_UnaryUnary_Handler, + }, + }, + Streams: []grpc.StreamDesc{ + { + StreamName: "UnaryStream", + Handler: _GRPCTest_UnaryStream_Handler, + ServerStreams: true, + }, + { + StreamName: "StreamUnary", + Handler: _GRPCTest_StreamUnary_Handler, + ClientStreams: true, + }, + { + StreamName: "StreamStream", + Handler: _GRPCTest_StreamStream_Handler, + ServerStreams: true, + ClientStreams: true, + }, + }, + Metadata: "testutils/grpcutils/testservice.proto", +} + +func init() { + proto.RegisterFile("testutils/grpcutils/testservice.proto", fileDescriptor_testservice_e326d78c9e5539cb) +} + +var fileDescriptor_testservice_e326d78c9e5539cb = []byte{ + // 217 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x52, 0x2d, 0x49, 0x2d, 0x2e, + 0x29, 0x2d, 0xc9, 0xcc, 0x29, 0xd6, 0x4f, 0x2f, 0x2a, 0x48, 0x86, 0xb0, 0x40, 0x62, 0xc5, 0xa9, + 0x45, 0x65, 0x99, 0xc9, 0xa9, 0x7a, 0x05, 0x45, 0xf9, 0x25, 0xf9, 0x42, 0xb2, 0xc9, 0xf9, 0xc9, + 0xd9, 0x45, 0xf9, 0x89, 0xc9, 0x19, 0x7a, 0x70, 0x0d, 0x7a, 0x70, 0x0d, 0x52, 0x92, 0xe9, 0xf9, + 0xf9, 0xe9, 0x39, 0xa9, 0xfa, 0x60, 0xc5, 0x49, 0xa5, 0x69, 0xfa, 0x89, 0x79, 0x95, 0x10, 0x9d, + 0x46, 0x6d, 0x4c, 0x5c, 0x1c, 0xee, 0x41, 0x01, 0xce, 0x21, 0xa9, 0xc5, 0x25, 0x42, 0x56, 0x5c, + 0x5c, 0xa1, 0x79, 0x89, 0x45, 0x95, 0x60, 0x42, 0x48, 0x44, 0x0f, 0xa2, 0x4d, 0x0f, 0xa6, 0x4d, + 0xcf, 0x31, 0xaf, 0x52, 0x0a, 0xab, 0xa8, 0x12, 0x83, 0x90, 0x2d, 0x17, 0x37, 0x58, 0x5b, 0x70, + 0x49, 0x51, 0x6a, 0x62, 0x2e, 0x69, 0x9a, 0x0d, 0x18, 0x41, 0xda, 0x21, 0x3a, 0xc9, 0xb0, 0x5b, + 0x83, 0x51, 0xc8, 0x81, 0x8b, 0x07, 0xa2, 0x9d, 0x1c, 0xeb, 0x35, 0x18, 0x0d, 0x18, 0x9d, 0xb4, + 0x4f, 0x3c, 0x94, 0x63, 0x38, 0xf1, 0x48, 0x8e, 0xf1, 0xc2, 0x23, 0x39, 0xc6, 0x1b, 0x8f, 0xe4, + 0x18, 0x1f, 0x3c, 0x92, 0x63, 0x9c, 0xf0, 0x58, 0x8e, 0xe1, 0xc2, 0x63, 0x39, 0x86, 0x1b, 0x8f, + 0xe5, 0x18, 0xa2, 0x38, 0xe1, 0x01, 0x9a, 0xc4, 0x06, 0x36, 0xc0, 0x18, 0x10, 0x00, 0x00, 0xff, + 0xff, 0x29, 0xc3, 0x4b, 0x1d, 0x9f, 0x01, 0x00, 0x00, +} diff --git a/pkg/testutils/grpcutils/testservice.proto b/pkg/testutils/grpcutils/testservice.proto new file mode 100644 index 000000000000..ce1ebf1293dd --- /dev/null +++ b/pkg/testutils/grpcutils/testservice.proto @@ -0,0 +1,25 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +syntax = "proto3"; +package cockroach.testutils.grpcutils; +option go_package = "grpcutils"; + +import "google/protobuf/any.proto"; + +// GRPCTest is a generic test service that operates on arbitrary protobufs (as +// Anys) and exposes all combinations of streaming and unary RPCs. +service GRPCTest { + rpc UnaryUnary (google.protobuf.Any) returns (google.protobuf.Any) {} + rpc UnaryStream (google.protobuf.Any) returns (stream google.protobuf.Any) {} + rpc StreamUnary (stream google.protobuf.Any) returns (google.protobuf.Any) {} + rpc StreamStream (stream google.protobuf.Any) returns (stream google.protobuf.Any) {} +} + diff --git a/pkg/testutils/lint/lint_test.go b/pkg/testutils/lint/lint_test.go index 308df3312a95..7eba8d5f20fd 100644 --- a/pkg/testutils/lint/lint_test.go +++ b/pkg/testutils/lint/lint_test.go @@ -814,6 +814,7 @@ func TestLint(t *testing.T) { ":!util/grpcutil/grpc_util_test.go", ":!cli/systembench/network_test_server.go", ":!server/testserver.go", + ":!util/tracing/*_test.go", ) if err != nil { t.Fatal(err) diff --git a/pkg/util/tracing/BUILD.bazel b/pkg/util/tracing/BUILD.bazel index 67efe9734509..7e6333dae0df 100644 --- a/pkg/util/tracing/BUILD.bazel +++ b/pkg/util/tracing/BUILD.bazel @@ -52,6 +52,7 @@ go_test( size = "small", srcs = [ "alloc_test.go", + "grpc_interceptor_test.go", "helpers_test.go", "span_test.go", "tags_test.go", @@ -60,7 +61,13 @@ go_test( embed = [":tracing"], deps = [ "//pkg/settings", + "//pkg/testutils", + "//pkg/testutils/grpcutils", + "//pkg/util", "//pkg/util/iterutil", + "//pkg/util/leaktest", + "//pkg/util/stop", + "//pkg/util/tracing/tracingpb", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_logtags//:logtags", "@com_github_gogo_protobuf//types", @@ -68,6 +75,7 @@ go_test( "@com_github_opentracing_opentracing_go//:opentracing-go", "@com_github_opentracing_opentracing_go//log", "@com_github_stretchr_testify//require", + "@org_golang_google_grpc//:go_default_library", "@org_golang_google_grpc//metadata", ], ) diff --git a/pkg/util/tracing/grpc_interceptor.go b/pkg/util/tracing/grpc_interceptor.go index 3164a12e0ff5..271b4cabd68a 100644 --- a/pkg/util/tracing/grpc_interceptor.go +++ b/pkg/util/tracing/grpc_interceptor.go @@ -351,7 +351,7 @@ func newTracingClientStream( // The client span is being finished by another code path; hence, no // action is necessary. case <-cs.Context().Done(): - finishFunc(cs.Context().Err()) + finishFunc(nil) } }() otcs := &tracingClientStream{ diff --git a/pkg/util/tracing/grpc_interceptor_test.go b/pkg/util/tracing/grpc_interceptor_test.go new file mode 100644 index 000000000000..507eabcb7c1e --- /dev/null +++ b/pkg/util/tracing/grpc_interceptor_test.go @@ -0,0 +1,225 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package tracing_test + +import ( + "context" + "fmt" + "net" + "testing" + + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/grpcutils" + "github.com/cockroachdb/cockroach/pkg/util" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/cockroach/pkg/util/tracing" + "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" + "github.com/cockroachdb/errors" + "github.com/gogo/protobuf/types" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" +) + +// TestGRPCInterceptors verifies that the streaming and unary tracing +// interceptors work as advertised. We expect to see a span on the client side +// and a span on the server side. +func TestGRPCInterceptors(t *testing.T) { + defer leaktest.AfterTest(t)() + + const ( + k = "test-baggage-key" + v = "test-baggage-value" + magicValue = 150 + ) + + checkForSpanAndReturnRecording := func(ctx context.Context) (*types.Any, error) { + sp := tracing.SpanFromContext(ctx) + if sp == nil { + return nil, errors.New("no span in ctx") + } + actV, ok := sp.GetRecording()[0].Baggage[k] + if !ok { + return nil, errors.Newf("%s not set in baggage", k) + } + if v != actV { + return nil, errors.Newf("expected %v, got %v instead", v, actV) + } + + sp.RecordStructured(&types.Int32Value{Value: magicValue}) + recs := sp.GetRecording() + if len(recs) != 1 { + return nil, errors.Newf("expected exactly one recorded span, not %+v", recs) + } + return types.MarshalAny(&recs[0]) + } + + s := stop.NewStopper() + defer s.Stop(context.Background()) + + tr := tracing.NewTracer() + srv := grpc.NewServer( + grpc.UnaryInterceptor(tracing.ServerInterceptor(tr)), + grpc.StreamInterceptor(tracing.StreamServerInterceptor(tr)), + ) + impl := &grpcutils.TestServerImpl{ + UU: func(ctx context.Context, any *types.Any) (*types.Any, error) { + return checkForSpanAndReturnRecording(ctx) + }, + US: func(_ *types.Any, server grpcutils.GRPCTest_UnaryStreamServer) error { + any, err := checkForSpanAndReturnRecording(server.Context()) + if err != nil { + return err + } + return server.Send(any) + }, + SU: func(server grpcutils.GRPCTest_StreamUnaryServer) error { + _, err := server.Recv() + if err != nil { + return err + } + any, err := checkForSpanAndReturnRecording(server.Context()) + if err != nil { + return err + } + return server.SendAndClose(any) + }, + SS: func(server grpcutils.GRPCTest_StreamStreamServer) error { + _, err := server.Recv() + if err != nil { + return err + } + any, err := checkForSpanAndReturnRecording(server.Context()) + if err != nil { + return err + } + return server.Send(any) + }, + } + grpcutils.RegisterGRPCTestServer(srv, impl) + defer srv.GracefulStop() + ln, err := net.Listen(util.TestAddr.Network(), util.TestAddr.String()) + require.NoError(t, err) + require.NoError(t, s.RunAsyncTask(context.Background(), "serve", func(ctx context.Context) { + if err := srv.Serve(ln); err != nil { + t.Error(err) + } + })) + conn, err := grpc.DialContext(context.Background(), ln.Addr().String(), + grpc.WithInsecure(), + grpc.WithUnaryInterceptor(tracing.ClientInterceptor( + tr, func(sp *tracing.Span) { + sp.SetBaggageItem(k, v) + })), + grpc.WithStreamInterceptor(tracing.StreamClientInterceptor(tr, func(sp *tracing.Span) { + sp.SetBaggageItem(k, v) + })), + ) + require.NoError(t, err) + defer func() { + _ = conn.Close() // nolint:grpcconnclose + }() + + c := grpcutils.NewGRPCTestClient(conn) + unusedAny, err := types.MarshalAny(&types.Empty{}) + require.NoError(t, err) + + for _, tc := range []struct { + name string + do func(context.Context) (*types.Any, error) + }{ + { + name: "UnaryUnary", + do: func(ctx context.Context) (*types.Any, error) { + return c.UnaryUnary(ctx, unusedAny) + }, + }, + { + name: "UnaryStream", + do: func(ctx context.Context) (*types.Any, error) { + sc, err := c.UnaryStream(ctx, unusedAny) + if err != nil { + return nil, err + } + any, err := sc.Recv() + if err != nil { + return nil, err + } + return any, sc.CloseSend() + }, + }, + { + name: "StreamUnary", + do: func(ctx context.Context) (*types.Any, error) { + sc, err := c.StreamUnary(ctx) + if err != nil { + return nil, err + } + if err := sc.Send(unusedAny); err != nil { + return nil, err + } + return sc.CloseAndRecv() + }, + }, + { + name: "StreamStream", + do: func(ctx context.Context) (*types.Any, error) { + sc, err := c.StreamStream(ctx) + if err != nil { + return nil, err + } + if err := sc.Send(unusedAny); err != nil { + return nil, err + } + return sc.Recv() + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + ctx, sp := tr.StartSpanCtx(context.Background(), tc.name, tracing.WithForceRealSpan()) + recAny, err := tc.do(ctx) + require.NoError(t, err) + var rec tracingpb.RecordedSpan + require.NoError(t, types.UnmarshalAny(recAny, &rec)) + require.Len(t, rec.InternalStructured, 1) + require.NoError(t, sp.ImportRemoteSpans([]tracingpb.RecordedSpan{rec})) + sp.Finish() + var n int + finalRecs := sp.GetRecording() + for _, rec := range finalRecs { + n += len(rec.InternalStructured) + // Remove all of the _unfinished tags. These crop up because + // in this test we are pulling the recorder in the handler impl, + // but the span is only closed in the interceptor. Additionally, + // this differs between the streaming and unary interceptor, and + // it's not worth having to have a separate expectation for each. + // Note that we check that we're not leaking spans at the end of + // the test. + delete(rec.Tags, "_unfinished") + } + require.Equal(t, 1, n) + + exp := fmt.Sprintf(` +Span %[1]s: +Span /cockroach.testutils.grpcutils.GRPCTest/%[1]s: + tags: component=gRPC span.kind=client test-baggage-key=test-baggage-value +Span /cockroach.testutils.grpcutils.GRPCTest/%[1]s: + tags: component=gRPC span.kind=server test-baggage-key=test-baggage-value`, + tc.name) + require.NoError(t, tracing.TestingCheckRecordedSpans(finalRecs, exp)) + }) + } + testutils.SucceedsSoon(t, func() error { + return tr.VisitSpans(func(sp *tracing.Span) error { + return errors.Newf("leaked span: %s", sp.GetRecording()[0].Operation) + }) + }) +} From 67f3231e1d4f87f1ef7701e9daa6d8ab006347b8 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Feb 2021 10:30:49 +0100 Subject: [PATCH 10/15] kvnemeses: enable global tracing in KVNemesisMultiNode Enabling tracing gives this test a chance to hit random NPEs. Release note: None --- pkg/kv/kvnemesis/kvnemesis_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/kv/kvnemesis/kvnemesis_test.go b/pkg/kv/kvnemesis/kvnemesis_test.go index 5c8fd5c9dcc0..86c8153f3a9d 100644 --- a/pkg/kv/kvnemesis/kvnemesis_test.go +++ b/pkg/kv/kvnemesis/kvnemesis_test.go @@ -70,6 +70,9 @@ func TestKVNemesisMultiNode(t *testing.T) { sqlDBs[i] = tc.ServerConn(i) } sqlutils.MakeSQLRunner(sqlDBs[0]).Exec(t, `SET CLUSTER SETTING kv.rangefeed.enabled = true`) + // Turn net/trace on, which results in real trace spans created throughout. + // This gives kvnemesis a chance to hit NPEs related to tracing. + sqlutils.MakeSQLRunner(sqlDBs[0]).Exec(t, `SET CLUSTER SETTING trace.debug.enable = true`) config := NewDefaultConfig() config.NumNodes, config.NumReplicas = numNodes, 3 From bba4d78d9fc734cc3279d836b10e5ac1fee8daef Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Feb 2021 10:26:24 +0100 Subject: [PATCH 11/15] acceptance: enable tracing in acceptance/version-upgrade If we introduced NPEs due to incompatibilities between versions, we would want to find out. Enabling tracing globally while upgrading clusters should help. Release note: None --- pkg/cmd/roachtest/versionupgrade.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pkg/cmd/roachtest/versionupgrade.go b/pkg/cmd/roachtest/versionupgrade.go index 201e0d54377e..e63960b2446d 100644 --- a/pkg/cmd/roachtest/versionupgrade.go +++ b/pkg/cmd/roachtest/versionupgrade.go @@ -175,6 +175,14 @@ func runVersionUpgrade(ctx context.Context, t *test, c *cluster, buildVersion ve testFeaturesStep, // schemaChangeStep, backupStep, + // Turn tracing on globally to give it a fighting chance at exposing any + // crash-inducing incompatibilities or horrendous memory leaks. (It won't + // catch most memory leaks since this test doesn't run for too long or does + // too much work). Then, run the previous tests again. + enableTracingGloballyStep, + testFeaturesStep, + // schemaChangeStep, + backupStep, ) u.run(ctx, t) @@ -354,6 +362,15 @@ func binaryUpgradeStep(nodes nodeListOption, newVersion string) versionStep { } } +func enableTracingGloballyStep(ctx context.Context, t *test, u *versionUpgradeTest) { + db := u.conn(ctx, t, 1) + // NB: this enables net/trace, and as a side effect creates verbose trace spans everywhere. + _, err := db.ExecContext(ctx, `SET CLUSTER SETTING trace.debug.enable = $1`, true) + if err != nil { + t.Fatal(err) + } +} + func preventAutoUpgradeStep(node int) versionStep { return func(ctx context.Context, t *test, u *versionUpgradeTest) { db := u.conn(ctx, t, node) From 8346def4188ff61e9f5833075432a9ee951d4c6a Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 17 Feb 2021 13:51:51 +0100 Subject: [PATCH 12/15] testcluster: improve output of "no leaked trace spans" lint Release note: None --- pkg/testutils/testcluster/testcluster.go | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go index f4d4827eec34..9fa3cb4cc09e 100644 --- a/pkg/testutils/testcluster/testcluster.go +++ b/pkg/testutils/testcluster/testcluster.go @@ -16,6 +16,7 @@ import ( "fmt" "net" "reflect" + "strings" "sync" "testing" "time" @@ -131,9 +132,21 @@ func (tc *TestCluster) stopServers(ctx context.Context) { // [1]: cleanupSessionTempObjects tracer := tc.Server(i).Tracer().(*tracing.Tracer) testutils.SucceedsSoon(tc.t, func() error { - return tracer.VisitSpans(func(span *tracing.Span) error { - return errors.Newf("expected to find no active spans, found %s", span.Meta()) + var sps []*tracing.Span + _ = tracer.VisitSpans(func(span *tracing.Span) error { + sps = append(sps, span) + return nil }) + if len(sps) == 0 { + return nil + } + var buf strings.Builder + buf.WriteString("unexpectedly found active spans:\n") + for _, sp := range sps { + fmt.Fprintln(&buf, sp.GetRecording()) + fmt.Fprintln(&buf) + } + return errors.Newf("%s", buf.String()) }) } } From 4422fd77061f835888db9eada79d854e738e89c8 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 17 Feb 2021 14:10:47 +0100 Subject: [PATCH 13/15] jobs: don't leak span when job is never started TestProtectedTimestampsFailDueToLimits fails without this change. I'm not in love with the way I fixed this - I think ideally a `StartableJob` should have one method that is called from `Start` or from `CleanupOnRollback`. As is, there are two cleanup paths, which is a pattern that will rot again. Filed #60671 to that effect. Release note: None --- pkg/jobs/jobs.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/jobs/jobs.go b/pkg/jobs/jobs.go index 49497c5e2d5a..78fbeb92deb9 100644 --- a/pkg/jobs/jobs.go +++ b/pkg/jobs/jobs.go @@ -915,6 +915,12 @@ func (sj *StartableJob) CleanupOnRollback(ctx context.Context) error { "cannot call CleanupOnRollback for a StartableJob with a non-finalized transaction") } sj.registry.unregister(*sj.ID()) + if sj.span != nil { + sj.span.Finish() + } + if sj.cancel != nil { + sj.cancel() + } return nil } From 3ae0f0638a7a8e2b5911ef57f88aa8494f2ac06c Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Wed, 17 Feb 2021 21:38:02 +0100 Subject: [PATCH 14/15] jobs: work around leaked trace span See https://github.com/cockroachdb/cockroach/issues/60671. Release note: None --- pkg/jobs/registry.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/jobs/registry.go b/pkg/jobs/registry.go index 603e778c91af..5cf53dc33c1f 100644 --- a/pkg/jobs/registry.go +++ b/pkg/jobs/registry.go @@ -517,6 +517,14 @@ func (r *Registry) CreateStartableJobWithTxn( _, span := tracing.ForkSpan(ctx, "job") if span != nil { resumerCtx = tracing.ContextWithSpan(resumerCtx, span) + + // This trace span unfortunately is sometimes never finished. + // As a hack/workaround, finish it now so that it leaves the + // tracer registry. + // + // Remove this when this issue is fixed: + // https://github.com/cockroachdb/cockroach/issues/60671 + span.Finish() } if r.startUsingSQLLivenessAdoption(ctx) { From f93e92b19b4d14dc815106de31957f2050337d7a Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Wed, 17 Feb 2021 22:32:11 +0100 Subject: [PATCH 15/15] tracing: be more resilient to double-Finish Our internal trace spans are fine, but net/trace tends to just crash (not always, just sometimes). We don't *want* spans getting finished twice, but it's hard to always avoid it, so add resilience. This was prompted by a crash I saw in acceptance/version-upgrade which very likely resulted from this hack: https://github.com/cockroachdb/cockroach/issues/60671#issuecomment-780835710 Release note: None --- pkg/util/tracing/BUILD.bazel | 1 + pkg/util/tracing/span.go | 12 ++++++++++++ pkg/util/tracing/span_test.go | 23 +++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/pkg/util/tracing/BUILD.bazel b/pkg/util/tracing/BUILD.bazel index 7e6333dae0df..1a28592f8a3e 100644 --- a/pkg/util/tracing/BUILD.bazel +++ b/pkg/util/tracing/BUILD.bazel @@ -77,5 +77,6 @@ go_test( "@com_github_stretchr_testify//require", "@org_golang_google_grpc//:go_default_library", "@org_golang_google_grpc//metadata", + "@org_golang_x_net//trace", ], ) diff --git a/pkg/util/tracing/span.go b/pkg/util/tracing/span.go index 9dc8358b1de2..bcdd03bfe766 100644 --- a/pkg/util/tracing/span.go +++ b/pkg/util/tracing/span.go @@ -208,9 +208,21 @@ func (s *Span) Finish() { return } finishTime := time.Now() + s.crdb.mu.Lock() + if alreadyFinished := s.crdb.mu.duration >= 0; alreadyFinished { + s.crdb.mu.Unlock() + + // External spans and net/trace are not always forgiving about spans getting + // finished twice, but it may happen so let's be resilient to it. + return + } s.crdb.mu.duration = finishTime.Sub(s.crdb.startTime) + if s.crdb.mu.duration == 0 { + s.crdb.mu.duration = time.Nanosecond + } s.crdb.mu.Unlock() + if s.ot.shadowSpan != nil { s.ot.shadowSpan.Finish() } diff --git a/pkg/util/tracing/span_test.go b/pkg/util/tracing/span_test.go index bd7fea5c03cb..da647ad23367 100644 --- a/pkg/util/tracing/span_test.go +++ b/pkg/util/tracing/span_test.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/errors" "github.com/gogo/protobuf/types" "github.com/stretchr/testify/require" + "golang.org/x/net/trace" "google.golang.org/grpc/metadata" ) @@ -250,3 +251,25 @@ func TestSpanMaxChildren(t *testing.T) { require.Len(t, sp.crdb.mu.recording.children, exp) } } + +type countingNetTrace struct { + trace.Trace + n int +} + +func (nt *countingNetTrace) Finish() { + nt.n++ + nt.Trace.Finish() +} + +func TestSpan_FinishTwice(t *testing.T) { + tr := NewTracer() + tr._useNetTrace = 1 + sp := tr.StartSpan("foo", WithForceRealSpan()) + require.NotNil(t, sp.netTr) + nt := &countingNetTrace{Trace: sp.netTr} + sp.netTr = nt + sp.Finish() + sp.Finish() + require.Equal(t, 1, nt.n) +}