From da87572d7bbc062ea672ef9a99913b1d75cee7bf Mon Sep 17 00:00:00 2001 From: Arthur Silva Sens Date: Fri, 16 Feb 2024 11:52:25 -0300 Subject: [PATCH] [processor/tailsampling] Add metric for sampled/not sampled spans (#30485) **Description:** Add metrics to measure sampled/not sampled spans. **Link to tracking Issue:** Fixes https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/30482 **Testing:** None **Documentation:** None --------- Signed-off-by: Arthur Silva Sens --- .chloggen/sampled_spans_metrics.yaml | 27 ++++ processor/tailsamplingprocessor/factory.go | 11 ++ processor/tailsamplingprocessor/go.mod | 2 + processor/tailsamplingprocessor/go.sum | 4 + processor/tailsamplingprocessor/metrics.go | 159 +++++++++---------- processor/tailsamplingprocessor/processor.go | 14 ++ 6 files changed, 134 insertions(+), 83 deletions(-) create mode 100644 .chloggen/sampled_spans_metrics.yaml diff --git a/.chloggen/sampled_spans_metrics.yaml b/.chloggen/sampled_spans_metrics.yaml new file mode 100644 index 000000000000..6f1d0222dd96 --- /dev/null +++ b/.chloggen/sampled_spans_metrics.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: processor/tail_sampling + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: "Add metrics that measure the number of sampled spans and the number of spans that are dropped due to sampling decisions." + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [30482] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [user] diff --git a/processor/tailsamplingprocessor/factory.go b/processor/tailsamplingprocessor/factory.go index ba608001bc88..c078a8627887 100644 --- a/processor/tailsamplingprocessor/factory.go +++ b/processor/tailsamplingprocessor/factory.go @@ -14,6 +14,7 @@ import ( "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/config/configtelemetry" "go.opentelemetry.io/collector/consumer" + "go.opentelemetry.io/collector/featuregate" "go.opentelemetry.io/collector/processor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/tailsamplingprocessor/internal/metadata" @@ -21,6 +22,16 @@ import ( var onceMetrics sync.Once +var metricStatCountSpansSampledFeatureGate = featuregate.GlobalRegistry().MustRegister( + "processor.tailsamplingprocessor.metricstatcountspanssampled", + featuregate.StageAlpha, + featuregate.WithRegisterDescription("When enabled, a new metric stat_count_spans_sampled will be available in the tail sampling processor. Differently from stat_count_traces_sampled, this metric will count the number of spans sampled or not per sampling policy, where the original counts traces."), +) + +func isMetricStatCountSpansSampledEnabled() bool { + return metricStatCountSpansSampledFeatureGate.IsEnabled() +} + // NewFactory returns a new factory for the Tail Sampling processor. func NewFactory() processor.Factory { onceMetrics.Do(func() { diff --git a/processor/tailsamplingprocessor/go.mod b/processor/tailsamplingprocessor/go.mod index 9bc5d027984a..9d742d180379 100644 --- a/processor/tailsamplingprocessor/go.mod +++ b/processor/tailsamplingprocessor/go.mod @@ -14,6 +14,7 @@ require ( go.opentelemetry.io/collector/config/configtelemetry v0.94.1 go.opentelemetry.io/collector/confmap v0.94.1 go.opentelemetry.io/collector/consumer v0.94.1 + go.opentelemetry.io/collector/featuregate v1.1.0 go.opentelemetry.io/collector/pdata v1.1.0 go.opentelemetry.io/collector/processor v0.94.1 go.opentelemetry.io/otel/metric v1.23.1 @@ -33,6 +34,7 @@ require ( github.com/gobwas/glob v0.2.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.3 // indirect + github.com/hashicorp/go-version v1.6.0 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/iancoleman/strcase v0.3.0 // indirect github.com/json-iterator/go v1.1.12 // indirect diff --git a/processor/tailsamplingprocessor/go.sum b/processor/tailsamplingprocessor/go.sum index d83011749f5b..d7b4982354d7 100644 --- a/processor/tailsamplingprocessor/go.sum +++ b/processor/tailsamplingprocessor/go.sum @@ -61,6 +61,8 @@ github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/ github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mOkIeek= +github.com/hashicorp/go-version v1.6.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= @@ -128,6 +130,8 @@ go.opentelemetry.io/collector/confmap v0.94.1 h1:O69bkeyR1YPAFz+jMd45aDZc1DtYnwb go.opentelemetry.io/collector/confmap v0.94.1/go.mod h1:pCT5UtcHaHVJ5BIILv1Z2VQyjZzmT9uTdBmC9+Z0AgA= go.opentelemetry.io/collector/consumer v0.94.1 h1:l/9h5L71xr/d93snQ9fdxgz64C4UuB8mEDxpp456X8o= go.opentelemetry.io/collector/consumer v0.94.1/go.mod h1:BIPWmw8wES6jlPTPC+acJxLvUzIdOm6uh/p/X85ALsY= +go.opentelemetry.io/collector/featuregate v1.1.0 h1:W+/FKvRxHMFC6MuTTEgrHINCf1vFBvLH7stSOEar6zU= +go.opentelemetry.io/collector/featuregate v1.1.0/go.mod h1:QQXjP4etmJQhkQ20j4P/rapWuItYxoFozg/iIwuKnYg= go.opentelemetry.io/collector/pdata v1.1.0 h1:cE6Al1rQieUjMHro6p6cKwcu3sjHXGG59BZ3kRVUvsM= go.opentelemetry.io/collector/pdata v1.1.0/go.mod h1:IDkDj+B4Fp4wWOclBELN97zcb98HugJ8Q2gA4ZFsN8Q= go.opentelemetry.io/collector/processor v0.94.1 h1:cNlGox8fN85KhtUq6yuqgPM9KDCQ4O5aDQ864joc4JQ= diff --git a/processor/tailsamplingprocessor/metrics.go b/processor/tailsamplingprocessor/metrics.go index 9cefb51fd596..9a7e320063fe 100644 --- a/processor/tailsamplingprocessor/metrics.go +++ b/processor/tailsamplingprocessor/metrics.go @@ -28,6 +28,7 @@ var ( statPolicyEvaluationErrorCount = stats.Int64("sampling_policy_evaluation_error", "Count of sampling policy evaluation errors", stats.UnitDimensionless) statCountTracesSampled = stats.Int64("count_traces_sampled", "Count of traces that were sampled or not per sampling policy", stats.UnitDimensionless) + statCountSpansSampled = stats.Int64("count_spans_sampled", "Count of spans that were sampled or not per sampling policy", stats.UnitDimensionless) statCountGlobalTracesSampled = stats.Int64("global_count_traces_sampled", "Global count of traces that were sampled or not by at least one policy", stats.UnitDimensionless) statDroppedTooEarlyCount = stats.Int64("sampling_trace_dropped_too_early", "Count of traces that needed to be dropped before the configured wait time", stats.UnitDimensionless) @@ -46,90 +47,82 @@ func samplingProcessorMetricViews(level configtelemetry.Level) []*view.View { latencyDistributionAggregation := view.Distribution(1, 2, 5, 10, 25, 50, 75, 100, 150, 200, 300, 400, 500, 750, 1000, 2000, 3000, 4000, 5000, 10000, 20000, 30000, 50000) ageDistributionAggregation := view.Distribution(1, 2, 5, 10, 20, 30, 40, 50, 60, 90, 120, 180, 300, 600, 1800, 3600, 7200) - decisionLatencyView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statDecisionLatencyMicroSec.Name()), - Measure: statDecisionLatencyMicroSec, - Description: statDecisionLatencyMicroSec.Description(), - TagKeys: policyTagKeys, - Aggregation: latencyDistributionAggregation, - } - overallDecisionLatencyView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statOverallDecisionLatencyUs.Name()), - Measure: statOverallDecisionLatencyUs, - Description: statOverallDecisionLatencyUs.Description(), - Aggregation: latencyDistributionAggregation, - } - - traceRemovalAgeView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statTraceRemovalAgeSec.Name()), - Measure: statTraceRemovalAgeSec, - Description: statTraceRemovalAgeSec.Description(), - Aggregation: ageDistributionAggregation, - } - lateSpanArrivalView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statLateSpanArrivalAfterDecision.Name()), - Measure: statLateSpanArrivalAfterDecision, - Description: statLateSpanArrivalAfterDecision.Description(), - Aggregation: ageDistributionAggregation, - } - - countPolicyEvaluationErrorView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statPolicyEvaluationErrorCount.Name()), - Measure: statPolicyEvaluationErrorCount, - Description: statPolicyEvaluationErrorCount.Description(), - Aggregation: view.Sum(), - } - + views := make([]*view.View, 0) sampledTagKeys := []tag.Key{tagPolicyKey, tagSampledKey} - countTracesSampledView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statCountTracesSampled.Name()), - Measure: statCountTracesSampled, - Description: statCountTracesSampled.Description(), - TagKeys: sampledTagKeys, - Aggregation: view.Sum(), - } - - countGlobalTracesSampledView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statCountGlobalTracesSampled.Name()), - Measure: statCountGlobalTracesSampled, - Description: statCountGlobalTracesSampled.Description(), - TagKeys: []tag.Key{tagSampledKey}, - Aggregation: view.Sum(), + views = append(views, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statDecisionLatencyMicroSec.Name()), + Measure: statDecisionLatencyMicroSec, + Description: statDecisionLatencyMicroSec.Description(), + TagKeys: policyTagKeys, + Aggregation: latencyDistributionAggregation, + }, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statOverallDecisionLatencyUs.Name()), + Measure: statOverallDecisionLatencyUs, + Description: statOverallDecisionLatencyUs.Description(), + Aggregation: latencyDistributionAggregation, + }, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statTraceRemovalAgeSec.Name()), + Measure: statTraceRemovalAgeSec, + Description: statTraceRemovalAgeSec.Description(), + Aggregation: ageDistributionAggregation, + }, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statLateSpanArrivalAfterDecision.Name()), + Measure: statLateSpanArrivalAfterDecision, + Description: statLateSpanArrivalAfterDecision.Description(), + Aggregation: ageDistributionAggregation, + }, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statPolicyEvaluationErrorCount.Name()), + Measure: statPolicyEvaluationErrorCount, + Description: statPolicyEvaluationErrorCount.Description(), + Aggregation: view.Sum(), + }, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statCountTracesSampled.Name()), + Measure: statCountTracesSampled, + Description: statCountTracesSampled.Description(), + TagKeys: sampledTagKeys, + Aggregation: view.Sum(), + }, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statCountGlobalTracesSampled.Name()), + Measure: statCountGlobalTracesSampled, + Description: statCountGlobalTracesSampled.Description(), + TagKeys: []tag.Key{tagSampledKey}, + Aggregation: view.Sum(), + }, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statDroppedTooEarlyCount.Name()), + Measure: statDroppedTooEarlyCount, + Description: statDroppedTooEarlyCount.Description(), + Aggregation: view.Sum(), + }, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statNewTraceIDReceivedCount.Name()), + Measure: statNewTraceIDReceivedCount, + Description: statNewTraceIDReceivedCount.Description(), + Aggregation: view.Sum(), + }, + &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statTracesOnMemoryGauge.Name()), + Measure: statTracesOnMemoryGauge, + Description: statTracesOnMemoryGauge.Description(), + Aggregation: view.LastValue(), + }) + + if isMetricStatCountSpansSampledEnabled() { + views = append(views, &view.View{ + Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statCountSpansSampled.Name()), + Measure: statCountSpansSampled, + Description: statCountSpansSampled.Description(), + TagKeys: sampledTagKeys, + Aggregation: view.Sum(), + }) } - countTraceDroppedTooEarlyView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statDroppedTooEarlyCount.Name()), - Measure: statDroppedTooEarlyCount, - Description: statDroppedTooEarlyCount.Description(), - Aggregation: view.Sum(), - } - countTraceIDArrivalView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statNewTraceIDReceivedCount.Name()), - Measure: statNewTraceIDReceivedCount, - Description: statNewTraceIDReceivedCount.Description(), - Aggregation: view.Sum(), - } - trackTracesOnMemorylView := &view.View{ - Name: processorhelper.BuildCustomMetricName(metadata.Type.String(), statTracesOnMemoryGauge.Name()), - Measure: statTracesOnMemoryGauge, - Description: statTracesOnMemoryGauge.Description(), - Aggregation: view.LastValue(), - } - - return []*view.View{ - decisionLatencyView, - overallDecisionLatencyView, - - traceRemovalAgeView, - lateSpanArrivalView, - - countPolicyEvaluationErrorView, - - countTracesSampledView, - countGlobalTracesSampledView, - - countTraceDroppedTooEarlyView, - countTraceIDArrivalView, - trackTracesOnMemorylView, - } + return views } diff --git a/processor/tailsamplingprocessor/processor.go b/processor/tailsamplingprocessor/processor.go index 75d5b55559c3..9ffcb13846e0 100644 --- a/processor/tailsamplingprocessor/processor.go +++ b/processor/tailsamplingprocessor/processor.go @@ -308,6 +308,13 @@ func (tsp *tailSamplingSpanProcessor) makeDecision(id pcommon.TraceID, trace *sa mutators, statCountTracesSampled.M(int64(1)), ) + if isMetricStatCountSpansSampledEnabled() { + _ = stats.RecordWithTags( + p.ctx, + mutators, + statCountSpansSampled.M(trace.SpanCount.Load()), + ) + } metrics.decisionSampled++ case sampling.NotSampled: @@ -317,6 +324,13 @@ func (tsp *tailSamplingSpanProcessor) makeDecision(id pcommon.TraceID, trace *sa mutators, statCountTracesSampled.M(int64(1)), ) + if isMetricStatCountSpansSampledEnabled() { + _ = stats.RecordWithTags( + p.ctx, + mutators, + statCountSpansSampled.M(trace.SpanCount.Load()), + ) + } metrics.decisionNotSampled++ } }