Skip to content

Commit

Permalink
tracing,job: fix NPE in TraceCollector
Browse files Browse the repository at this point in the history
Previously, TraceCollector.StartIter would return a nil object
if we failed to resolve nodeliveness during initialization. This
led to a NPE.

This change now return a TraceCollector instance with the error field
set to the appropriate error, so that the validity check on the iterator
can correctly handle this scenario.

This change also reworks the dump trace on job cancellation test.
Job cancellation semantics under stress are slightly undeterministic in
terms of how many times execution of OnFailOrCancel is resumed. This makes
it hard to coordinate when to check and how many trace files to expect.

Fixes: #68315

Release note: None

Release justification: low risk, high benefit changes to existing functionality
  • Loading branch information
adityamaru authored and ajwerner committed Aug 27, 2021
1 parent c831eab commit 7f5ef69
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 7 deletions.
10 changes: 9 additions & 1 deletion pkg/sql/crdb_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/cockroach/pkg/util/tracing/collector"
"github.com/cockroachdb/errors"
)

Expand Down Expand Up @@ -1286,7 +1287,8 @@ CREATE TABLE crdb_internal.cluster_inflight_traces (
}

traceCollector := p.ExecCfg().TraceCollector
for iter := traceCollector.StartIter(ctx, traceID); iter.Valid(); iter.Next() {
var iter *collector.Iterator
for iter, err = traceCollector.StartIter(ctx, traceID); err == nil && iter.Valid(); iter.Next() {
nodeID, recording := iter.Value()
traceString := recording.String()
traceJaegerJSON, err := recording.ToJaegerJSON("", "", fmt.Sprintf("node %d", nodeID))
Expand All @@ -1301,6 +1303,12 @@ CREATE TABLE crdb_internal.cluster_inflight_traces (
return false, err
}
}
if err != nil {
return false, err
}
if iter.Error() != nil {
return false, iter.Error()
}

return true, nil
}}},
Expand Down
11 changes: 6 additions & 5 deletions pkg/util/tracing/collector/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,19 +87,20 @@ type Iterator struct {

// StartIter fetches the live nodes in the cluster, and configures the underlying
// Iterator that is used to access recorded spans in a streaming fashion.
func (t *TraceCollector) StartIter(ctx context.Context, traceID uint64) *Iterator {
func (t *TraceCollector) StartIter(ctx context.Context, traceID uint64) (*Iterator, error) {
tc := &Iterator{ctx: ctx, traceID: traceID, collector: t}
tc.liveNodes, tc.iterErr = nodesFromNodeLiveness(ctx, t.nodeliveness)
if tc.iterErr != nil {
return nil
var err error
tc.liveNodes, err = nodesFromNodeLiveness(ctx, t.nodeliveness)
if err != nil {
return nil, err
}

// Calling Next() positions the Iterator in a valid state. It will fetch the
// first set of valid (non-nil) inflight span recordings from the list of live
// nodes.
tc.Next()

return tc
return tc, nil
}

// Valid returns whether the Iterator is in a valid state to read values from.
Expand Down
4 changes: 3 additions & 1 deletion pkg/util/tracing/collector/collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,12 @@ func TestTracingCollectorGetSpanRecordings(t *testing.T) {
res := make(map[roachpb.NodeID][]tracing.Recording)

var iter *collector.Iterator
for iter = traceCollector.StartIter(ctx, traceID); iter.Valid(); iter.Next() {
var err error
for iter, err = traceCollector.StartIter(ctx, traceID); err == nil && iter.Valid(); iter.Next() {
nodeID, recording := iter.Value()
res[nodeID] = append(res[nodeID], recording)
}
require.NoError(t, err)
require.NoError(t, iter.Error())
return res
}
Expand Down

0 comments on commit 7f5ef69

Please sign in to comment.