From 2d661a7c463c9bac3d1e2f4413051c5ed7cd9cb8 Mon Sep 17 00:00:00 2001 From: Chris Mark Date: Thu, 13 Jun 2024 18:05:09 +0300 Subject: [PATCH 001/138] [chore] Add test for on startup compaction (#33511) **Description:** While there are [tests](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/extension/storage/filestorage/client_test.go#L214) for `OnRebound` compaction, I didn't spot any that cover the `OnStart` option. This PR adds a unit test for this case. **Link to tracking Issue:** ~ **Testing:** Added **Documentation:** ~ Signed-off-by: ChrsMark --- .../storage/filestorage/extension_test.go | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/extension/storage/filestorage/extension_test.go b/extension/storage/filestorage/extension_test.go index 83eaa6ce7bd7..a384423c60e9 100644 --- a/extension/storage/filestorage/extension_test.go +++ b/extension/storage/filestorage/extension_test.go @@ -17,6 +17,8 @@ import ( "go.opentelemetry.io/collector/component/componenttest" "go.opentelemetry.io/collector/extension/experimental/storage" "go.opentelemetry.io/collector/extension/extensiontest" + "go.uber.org/zap" + "go.uber.org/zap/zaptest/observer" ) func TestExtensionIntegrity(t *testing.T) { @@ -485,3 +487,41 @@ func TestCleanupOnStart(t *testing.T) { require.NoError(t, err) require.Equal(t, 1, len(files)) } + +func TestCompactionOnStart(t *testing.T) { + ctx := context.Background() + + logCore, logObserver := observer.New(zap.DebugLevel) + logger := zap.New(logCore) + set := extensiontest.NewNopSettings() + set.Logger = logger + + tempDir := t.TempDir() + temp, _ := os.CreateTemp(tempDir, TempDbPrefix) + temp.Close() + + f := NewFactory() + cfg := f.CreateDefaultConfig().(*Config) + cfg.Directory = tempDir + cfg.Compaction.Directory = tempDir + cfg.Compaction.OnStart = true + extension, err := f.CreateExtension(context.Background(), set, cfg) + require.NoError(t, err) + + se, ok := extension.(storage.Extension) + require.True(t, ok) + require.NoError(t, se.Start(ctx, componenttest.NewNopHost())) + + client, err := se.GetClient( + ctx, + component.KindReceiver, + newTestEntity("my_component"), + "", + ) + require.NoError(t, err) + t.Cleanup(func() { + // At least one compaction should have happened on start + require.GreaterOrEqual(t, len(logObserver.FilterMessage("finished compaction").All()), 1) + require.NoError(t, client.Close(context.TODO())) + }) +} From 9f0a3db02d2fb135e4ddc6bc03d35e2b8d7f23a2 Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Thu, 13 Jun 2024 09:13:34 -0700 Subject: [PATCH 002/138] [processor/probabilisticsampling] encoded sampling probability (support OTEP 235) (#31894) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Description:** Creates new sampler modes named "equalizing" and "proportional". Preserves existing functionality under the mode named "hash_seed". Fixes #31918 This is the final step in a sequence, the whole of this work was factored into 3+ PRs, including the new `pkg/sampling` and the previous step, #31946. The two new Sampler modes enable mixing OTel sampling SDKs with Collectors in a consistent way. The existing hash_seed mode is also a consistent sampling mode, which makes it possible to have a 1:1 mapping between its decisions and the OTEP 235 randomness and threshold values. 
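As a rough illustration of that mapping, here is a standalone sketch (not the processor's code): the constant `numHashBuckets = 0x4000` is the 14-bit space the README describes, `percentageScaleFactor` and the 42-bit shift mirror what `sampler_mode.go` does in this patch, and the `main` program itself is purely hypothetical.

```go
package main

import "fmt"

const (
	numHashBuckets        = 0x4000 // 14-bit hash space used by hash_seed mode (assumed value)
	percentageScaleFactor = numHashBuckets / 100.0
)

func main() {
	pct := float32(25) // sampling_percentage

	// hash_seed's legacy decision, roughly: accept when the item's 14-bit
	// hash falls below this scaled rate.
	scaled := uint32(pct * percentageScaleFactor)

	// The same decision restated as an OTEP 235 rejection threshold:
	// shift the 14-bit reject count into the top of a 56-bit value.
	reject56 := uint64(numHashBuckets-scaled) << 42

	// A T-value is the 56-bit threshold in hex with trailing zeros trimmed.
	tvalue := fmt.Sprintf("%014x", reject56)
	for len(tvalue) > 1 && tvalue[len(tvalue)-1] == '0' {
		tvalue = tvalue[:len(tvalue)-1]
	}
	fmt.Printf("%g%% sampling -> ot=th:%s\n", pct, tvalue) // 25% sampling -> ot=th:c
}
```

For `sampling_percentage: 25` this prints `ot=th:c`, the same T-value the README examples in this patch use for 25% sampling.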
Specifically, the 14-bit hash value and sampling probability are mapped into 56-bit R-value and T-value encodings, so that all sampling decisions in all modes include threshold information. This implements the semantic conventions of https://github.com/open-telemetry/semantic-conventions/pull/793, namely the `sampling.randomness` and `sampling.threshold` attributes used for logs where there is no tracestate. The default sampling mode remains HashSeed. We consider a future change of default to Proportional to be desirable, because: 1. Sampling probability is the same, only the hashing algorithm changes 2. Proportional respects and preserves information about earlier sampling decisions, which HashSeed can't do, so it has greater interoperability with OTel SDKs which may also adopt OTEP 235 samplers. **Link to tracking Issue:** Draft for https://github.com/open-telemetry/opentelemetry-specification/issues/3602. Previously https://github.com/open-telemetry/opentelemetry-collector-contrib/pull/24811, see also https://github.com/open-telemetry/oteps/pull/235 Part of #29738 **Testing:** New testing has been added. **Documentation:** ✅ --------- Co-authored-by: Juraci Paixão Kröhling --- .chloggen/probabilisticsampler_modes.yaml | 27 + .../probabilisticsamplerprocessor/README.md | 156 +++- .../probabilisticsamplerprocessor/config.go | 65 +- .../config_test.go | 10 +- .../probabilisticsamplerprocessor/factory.go | 6 +- .../metadata/generated_telemetry_test.go | 3 +- .../logsprocessor.go | 130 +++- .../logsprocessor_test.go | 369 ++++++++-- .../sampler_mode.go | 304 ++++++-- .../sampler_mode_test.go | 31 + .../testdata/config.yaml | 2 + .../testdata/invalid_inf.yaml | 17 + .../testdata/invalid_prec.yaml | 18 + .../testdata/invalid_small.yaml | 18 + .../testdata/invalid_zero.yaml | 18 + .../tracesprocessor.go | 95 ++- .../tracesprocessor_test.go | 693 +++++++++++++++++- 17 files changed, 1807 insertions(+), 155 deletions(-) create mode 100644 .chloggen/probabilisticsampler_modes.yaml create mode 100644 processor/probabilisticsamplerprocessor/testdata/invalid_inf.yaml create mode 100644 processor/probabilisticsamplerprocessor/testdata/invalid_prec.yaml create mode 100644 processor/probabilisticsamplerprocessor/testdata/invalid_small.yaml create mode 100644 processor/probabilisticsamplerprocessor/testdata/invalid_zero.yaml diff --git a/.chloggen/probabilisticsampler_modes.yaml b/.chloggen/probabilisticsampler_modes.yaml new file mode 100644 index 000000000000..e823b78e1c2b --- /dev/null +++ b/.chloggen/probabilisticsampler_modes.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: probabilisticsamplerprocessor + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add Proportional and Equalizing sampling modes + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [31918] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. 
+subtext: Both the existing hash_seed mode and the two new modes use OTEP 235 semantic conventions to encode sampling probability. + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [user] diff --git a/processor/probabilisticsamplerprocessor/README.md b/processor/probabilisticsamplerprocessor/README.md index 57728dc9d6f9..596ad23a38a6 100644 --- a/processor/probabilisticsamplerprocessor/README.md +++ b/processor/probabilisticsamplerprocessor/README.md @@ -1,3 +1,4 @@ + # Probabilistic Sampling Processor @@ -115,7 +116,9 @@ interpreted as a percentage, with values >= 100 equal to 100% sampling. The logs sampling priority attribute is configured via `sampling_priority`. -## Sampling algorithm +## Mode Selection + +There are three sampling modes available. All modes are consistent. ### Hash seed @@ -135,7 +138,154 @@ In order for hashing to be consistent, all collectors for a given tier at different collector tiers to support additional sampling requirements. -This mode uses 14 bits of sampling precision. +This mode uses 14 bits of information in its sampling decision; the +default `sampling_precision`, which is 4 hexadecimal digits, exactly +encodes this information. + +This mode is selected by default. + +#### Hash seed: Use-cases + +The hash seed mode is most useful in logs sampling, because it can be +applied to units of telemetry other than TraceID. For example, a +deployment consisting of 100 pods can be sampled according to the +`service.instance.id` resource attribute. In this case, 10% sampling +implies collecting log records from an expected value of 10 pods. + +### Proportional + +OpenTelemetry specifies a consistent sampling mechanism using 56 bits +of randomness, which may be obtained from the Trace ID according to +the W3C Trace Context Level 2 specification. Randomness can also be +explicly encoding in the OpenTelemetry `tracestate` field, where it is +known as the R-value. + +This mode is named because it reduces the number of items transmitted +proportionally, according to the sampling probability. In this mode, +items are selected for sampling without considering how much they were +already sampled by preceding samplers. + +This mode uses 56 bits of information in its calculations. The +default `sampling_precision` (4) will cause thresholds to be rounded +in some cases when they contain more than 16 significant bits. + +#### Proportional: Use-cases + +The proportional mode is generally applicable in trace sampling, +because it is based on OpenTelemetry and W3C specifications. This +mode is selected by default, because it enforces a predictable +(probabilistic) ratio between incoming items and outgoing items of +telemetry. No matter how SDKs and other sources of telemetry have +been configured with respect to sampling, a collector configured with +25% proportional sampling will output (an expected value of) 1 item +for every 4 items input. + +### Equalizing + +This mode uses the same randomness mechanism as the propotional +sampling mode, in this case considering how much each item was already +sampled by preceding samplers. 
This mode can be used to lower
sampling probability to a minimum value across a whole pipeline,
making it possible to conditionally adjust sampling probabilities.

This mode compares a 56-bit threshold against the configured sampling
probability and updates when the threshold is larger. The default
`sampling_precision` (4) will cause updated thresholds to be rounded
in some cases when they contain more than 16 significant bits.

#### Equalizing: Use-cases

The equalizing mode is useful in collector deployments where client
SDKs have mixed sampling configuration and the user wants to apply a
uniform sampling probability across the system. For example, a user's
system consists of mostly components developed in-house, but also some
third-party software. Seeking to lower the overall cost of tracing, the
user configures 10% sampling in the samplers for all of their in-house
components. This leaves third-party software components unsampled,
making the savings less than desired. In this case, the user could
configure a 10% equalizing probabilistic sampler. Already-sampled
items of telemetry from the in-house components will pass through one
for one in this scenario, while items of telemetry from third-party
software will be sampled by the intended amount.

## Sampling threshold information

In all modes, information about the effective sampling probability is
added into the item of telemetry. The random variable that was used
may also be recorded, in case it was not derived from the TraceID
using a standard algorithm.

For traces, threshold and optional randomness information are encoded
in the W3C Trace Context `tracestate` fields. The tracestate is
divided into sections according to a two-character vendor code;
OpenTelemetry uses "ot" as its section designator. Within the
OpenTelemetry section, the sampling threshold is encoded using "th"
and the optional random variable is encoded using "rv".

For example, 25% sampling is encoded in a tracing Span as:

```
tracestate: ot=th:c
```

Users can set randomness values in this way, independently, making it
possible to apply consistent sampling across traces, for example. If
the Trace was initialized with pre-determined randomness value
`9b8233f7e3a151` and 100% sampling, it would read:

```
tracestate: ot=th:0;rv:9b8233f7e3a151
```

This component, using either proportional or equalizing modes, could
apply 50% sampling to the Span. This span with randomness value
`9b8233f7e3a151` is consistently sampled at 50% because the threshold,
when zero padded (i.e., `80000000000000`), is less than the randomness
value. The resulting span will have the following tracestate:

```
tracestate: ot=th:8;rv:9b8233f7e3a151
```

For log records, threshold and randomness information are encoded in
the log record itself, using attributes. For example, 25% sampling
with an explicit randomness value is encoded as:

```
sampling.threshold: c
sampling.randomness: e05a99c8df8d32
```

### Sampling precision

When encoding sampling probability in the form of a threshold,
variable precision is permitted, making it possible for the user to
restrict sampling probabilities to rounded numbers of fixed width.

Because the threshold is encoded using hexadecimal digits, each digit
contributes 4 bits of information. One digit of sampling precision
can express exact sampling probabilities 1/16, 2/16, ... through
16/16. Two digits of sampling precision can express exact sampling
probabilities 1/256, 2/256, ...
through 256/256. With N digits of +sampling precision, there are exactly `(2^N)-1` exactly representable +probabilities. + +Depending on the mode, there are different maximum reasonable settings +for this parameter. + +- The `hash_seed` mode uses a 14-bit hash function, therefore + precision 4 completely captures the available information. +- The `equalizing` mode configures a sampling probability after + parsing a `float32` value, which contains 20 bits of precision, + therefore precision 5 completely captures the available information. +- The `proportional` mode configures its ratio using a `float32` + value, however it carries out the arithmetic using 56-bits of + precision. In this mode, increasing precision has the effect + of preserving precision applied by preceding samplers. + +In cases where larger precision is configured than is actually +available, the added precision has no effect because trailing zeros +are eliminated by the encoding. ### Error handling @@ -153,9 +303,11 @@ false, in which case erroneous data will pass through the processor. The following configuration options can be modified: +- `mode` (string, optional): One of "proportional", "equalizing", or "hash_seed"; the default is "proportional" unless either `hash_seed` is configured or `attribute_source` is set to `record`. - `sampling_percentage` (32-bit floating point, required): Percentage at which items are sampled; >= 100 samples all items, 0 rejects all items. - `hash_seed` (32-bit unsigned integer, optional, default = 0): An integer used to compute the hash algorithm. Note that all collectors for a given tier (e.g. behind the same load balancer) should have the same hash_seed. - `fail_closed` (boolean, optional, default = true): Whether to reject items with sampling-related errors. +- `sampling_precision` (integer, optional, default = 4): Determines the number of hexadecimal digits used to encode the sampling threshold. Permitted values are 1..14. ### Logs-specific configuration diff --git a/processor/probabilisticsamplerprocessor/config.go b/processor/probabilisticsamplerprocessor/config.go index c4bc83eb6b11..b79d3136b02d 100644 --- a/processor/probabilisticsamplerprocessor/config.go +++ b/processor/probabilisticsamplerprocessor/config.go @@ -5,8 +5,11 @@ package probabilisticsamplerprocessor // import "github.com/open-telemetry/opent import ( "fmt" + "math" "go.opentelemetry.io/collector/component" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling" ) type AttributeSource string @@ -35,6 +38,33 @@ type Config struct { // different sampling rates, configuring different seeds avoids that. HashSeed uint32 `mapstructure:"hash_seed"` + // Mode selects the sampling behavior. Supported values: + // + // - "hash_seed": the legacy behavior of this processor. + // Using an FNV hash combined with the HashSeed value, this + // sampler performs a non-consistent probabilistic + // downsampling. The number of spans output is expected to + // equal SamplingPercentage (as a ratio) times the number of + // spans inpout, assuming good behavior from FNV and good + // entropy in the hashed attributes or TraceID. + // + // - "equalizing": Using an OTel-specified consistent sampling + // mechanism, this sampler selectively reduces the effective + // sampling probability of arriving spans. This can be + // useful to select a small fraction of complete traces from + // a stream with mixed sampling rates. The rate of spans + // passing through depends on how much sampling has already + // been applied. 
If an arriving span was head sampled at + // the same probability it passes through. If the span + // arrives with lower probability, a warning is logged + // because it means this sampler is configured with too + // large a sampling probability to ensure complete traces. + // + // - "proportional": Using an OTel-specified consistent sampling + // mechanism, this sampler reduces the effective sampling + // probability of each span by `SamplingProbability`. + Mode SamplerMode `mapstructure:"mode"` + // FailClosed indicates to not sample data (the processor will // fail "closed") in case of error, such as failure to parse // the tracestate field or missing the randomness attribute. @@ -45,6 +75,14 @@ type Config struct { // despite errors using priority. FailClosed bool `mapstructure:"fail_closed"` + // SamplingPrecision is how many hex digits of sampling + // threshold will be encoded, from 1 up to 14. Default is 4. + // 0 is treated as full precision. + SamplingPrecision int `mapstructure:"sampling_precision"` + + /////// + // Logs only fields below. + // AttributeSource (logs only) defines where to look for the attribute in from_attribute. The allowed values are // `traceID` or `record`. Default is `traceID`. AttributeSource `mapstructure:"attribute_source"` @@ -61,11 +99,34 @@ var _ component.Config = (*Config)(nil) // Validate checks if the processor configuration is valid func (cfg *Config) Validate() error { - if cfg.SamplingPercentage < 0 { - return fmt.Errorf("negative sampling rate: %.2f", cfg.SamplingPercentage) + pct := float64(cfg.SamplingPercentage) + + if math.IsInf(pct, 0) || math.IsNaN(pct) { + return fmt.Errorf("sampling rate is invalid: %f%%", cfg.SamplingPercentage) + } + ratio := pct / 100.0 + + switch { + case ratio < 0: + return fmt.Errorf("sampling rate is negative: %f%%", cfg.SamplingPercentage) + case ratio == 0: + // Special case + case ratio < sampling.MinSamplingProbability: + // Too-small case + return fmt.Errorf("sampling rate is too small: %g%%", cfg.SamplingPercentage) + default: + // Note that ratio > 1 is specifically allowed by the README, taken to mean 100% } + if cfg.AttributeSource != "" && !validAttributeSource[cfg.AttributeSource] { return fmt.Errorf("invalid attribute source: %v. 
Expected: %v or %v", cfg.AttributeSource, traceIDAttributeSource, recordAttributeSource) } + + if cfg.SamplingPrecision == 0 { + return fmt.Errorf("invalid sampling precision: 0") + } else if cfg.SamplingPrecision > sampling.NumHexDigits { + return fmt.Errorf("sampling precision is too great, should be <= 14: %d", cfg.SamplingPrecision) + } + return nil } diff --git a/processor/probabilisticsamplerprocessor/config_test.go b/processor/probabilisticsamplerprocessor/config_test.go index 3200f049569d..46477ca0c52e 100644 --- a/processor/probabilisticsamplerprocessor/config_test.go +++ b/processor/probabilisticsamplerprocessor/config_test.go @@ -26,6 +26,8 @@ func TestLoadConfig(t *testing.T) { id: component.NewIDWithName(metadata.Type, ""), expected: &Config{ SamplingPercentage: 15.3, + SamplingPrecision: 4, + Mode: "proportional", AttributeSource: "traceID", FailClosed: true, }, @@ -34,7 +36,9 @@ func TestLoadConfig(t *testing.T) { id: component.NewIDWithName(metadata.Type, "logs"), expected: &Config{ SamplingPercentage: 15.3, + SamplingPrecision: defaultPrecision, HashSeed: 22, + Mode: "", AttributeSource: "record", FromAttribute: "foo", SamplingPriority: "bar", @@ -68,7 +72,11 @@ func TestLoadInvalidConfig(t *testing.T) { file string contains string }{ - {"invalid_negative.yaml", "negative sampling rate"}, + {"invalid_negative.yaml", "sampling rate is negative"}, + {"invalid_small.yaml", "sampling rate is too small"}, + {"invalid_inf.yaml", "sampling rate is invalid: +Inf%"}, + {"invalid_prec.yaml", "sampling precision is too great"}, + {"invalid_zero.yaml", "invalid sampling precision"}, } { t.Run(test.file, func(t *testing.T) { factories, err := otelcoltest.NopFactories() diff --git a/processor/probabilisticsamplerprocessor/factory.go b/processor/probabilisticsamplerprocessor/factory.go index 35d594eb8443..ec8a96afb91d 100644 --- a/processor/probabilisticsamplerprocessor/factory.go +++ b/processor/probabilisticsamplerprocessor/factory.go @@ -40,8 +40,10 @@ func NewFactory() processor.Factory { func createDefaultConfig() component.Config { return &Config{ - AttributeSource: defaultAttributeSource, - FailClosed: true, + AttributeSource: defaultAttributeSource, + FailClosed: true, + Mode: modeUnset, + SamplingPrecision: defaultPrecision, } } diff --git a/processor/probabilisticsamplerprocessor/internal/metadata/generated_telemetry_test.go b/processor/probabilisticsamplerprocessor/internal/metadata/generated_telemetry_test.go index d1e2cff5b34e..ea85bfdad123 100644 --- a/processor/probabilisticsamplerprocessor/internal/metadata/generated_telemetry_test.go +++ b/processor/probabilisticsamplerprocessor/internal/metadata/generated_telemetry_test.go @@ -6,14 +6,13 @@ import ( "testing" "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component" "go.opentelemetry.io/otel/metric" embeddedmetric "go.opentelemetry.io/otel/metric/embedded" noopmetric "go.opentelemetry.io/otel/metric/noop" "go.opentelemetry.io/otel/trace" embeddedtrace "go.opentelemetry.io/otel/trace/embedded" nooptrace "go.opentelemetry.io/otel/trace/noop" - - "go.opentelemetry.io/collector/component" ) type mockMeter struct { diff --git a/processor/probabilisticsamplerprocessor/logsprocessor.go b/processor/probabilisticsamplerprocessor/logsprocessor.go index 0a7d0b8b892b..1a8e81507e6e 100644 --- a/processor/probabilisticsamplerprocessor/logsprocessor.go +++ b/processor/probabilisticsamplerprocessor/logsprocessor.go @@ -5,6 +5,7 @@ package probabilisticsamplerprocessor // import "github.com/open-telemetry/opent 
import ( "context" + "errors" "go.opentelemetry.io/collector/consumer" "go.opentelemetry.io/collector/pdata/pcommon" @@ -20,33 +21,107 @@ type logsProcessor struct { sampler dataSampler samplingPriority string + precision int failClosed bool logger *zap.Logger } type recordCarrier struct { record plog.LogRecord + + parsed struct { + tvalue string + threshold sampling.Threshold + + rvalue string + randomness sampling.Randomness + } } var _ samplingCarrier = &recordCarrier{} -func newLogRecordCarrier(l plog.LogRecord) samplingCarrier { - return &recordCarrier{ +func (rc *recordCarrier) get(key string) string { + val, ok := rc.record.Attributes().Get(key) + if !ok || val.Type() != pcommon.ValueTypeStr { + return "" + } + return val.Str() +} + +func newLogRecordCarrier(l plog.LogRecord) (samplingCarrier, error) { + var ret error + carrier := &recordCarrier{ record: l, } + if tvalue := carrier.get("sampling.threshold"); len(tvalue) != 0 { + th, err := sampling.TValueToThreshold(tvalue) + if err != nil { + ret = errors.Join(err, ret) + } else { + carrier.parsed.tvalue = tvalue + carrier.parsed.threshold = th + } + } + if rvalue := carrier.get("sampling.randomness"); len(rvalue) != 0 { + rnd, err := sampling.RValueToRandomness(rvalue) + if err != nil { + ret = errors.Join(err, ret) + } else { + carrier.parsed.rvalue = rvalue + carrier.parsed.randomness = rnd + } + } + return carrier, ret +} + +func (rc *recordCarrier) threshold() (sampling.Threshold, bool) { + return rc.parsed.threshold, len(rc.parsed.tvalue) != 0 +} + +func (rc *recordCarrier) explicitRandomness() (randomnessNamer, bool) { + if len(rc.parsed.rvalue) == 0 { + return newMissingRandomnessMethod(), false + } + return newSamplingRandomnessMethod(rc.parsed.randomness), true +} + +func (rc *recordCarrier) updateThreshold(th sampling.Threshold) error { + exist, has := rc.threshold() + if has && sampling.ThresholdLessThan(th, exist) { + return sampling.ErrInconsistentSampling + } + rc.record.Attributes().PutStr("sampling.threshold", th.TValue()) + return nil +} + +func (rc *recordCarrier) setExplicitRandomness(rnd randomnessNamer) { + rc.parsed.randomness = rnd.randomness() + rc.parsed.rvalue = rnd.randomness().RValue() + rc.record.Attributes().PutStr("sampling.randomness", rnd.randomness().RValue()) +} + +func (rc *recordCarrier) clearThreshold() { + rc.parsed.threshold = sampling.NeverSampleThreshold + rc.parsed.tvalue = "" + rc.record.Attributes().Remove("sampling.threshold") +} + +func (rc *recordCarrier) reserialize() error { + return nil } -func (*neverSampler) randomnessFromLogRecord(_ plog.LogRecord) (randomnessNamer, samplingCarrier, error) { +func (*neverSampler) randomnessFromLogRecord(logRec plog.LogRecord) (randomnessNamer, samplingCarrier, error) { // We return a fake randomness value, since it will not be used. // This avoids a consistency check error for missing randomness. - return newSamplingPriorityMethod(sampling.AllProbabilitiesRandomness), nil, nil + lrc, err := newLogRecordCarrier(logRec) + return newSamplingPriorityMethod(sampling.AllProbabilitiesRandomness), lrc, err } // randomnessFromLogRecord (hashingSampler) uses a hash function over -// the TraceID +// the TraceID or logs attribute source. 
func (th *hashingSampler) randomnessFromLogRecord(logRec plog.LogRecord) (randomnessNamer, samplingCarrier, error) { rnd := newMissingRandomnessMethod() - lrc := newLogRecordCarrier(logRec) + lrc, err := newLogRecordCarrier(logRec) if th.logsTraceIDEnabled { value := logRec.TraceID() @@ -67,15 +142,52 @@ func (th *hashingSampler) randomnessFromLogRecord(logRec plog.LogRecord) (random } } - return rnd, lrc, nil + if err != nil { + // The sampling.randomness or sampling.threshold attributes + // had a parse error, in this case. + lrc = nil + } else if _, hasRnd := lrc.explicitRandomness(); hasRnd { + // If the log record contains a randomness value, do not update. + err = ErrRandomnessInUse + lrc = nil + } else if _, hasTh := lrc.threshold(); hasTh { + // If the log record contains a threshold value, do not update. + err = ErrThresholdInUse + lrc = nil + } else if !isMissing(rnd) { + // When no sampling information is already present and we have + // calculated new randomness, add it to the record. + lrc.setExplicitRandomness(rnd) + } + + return rnd, lrc, err +} + +// randomnessFromLogRecord (hashingSampler) uses OTEP 235 semantic +// conventions basing its deicsion only on the TraceID. +func (ctc *consistentTracestateCommon) randomnessFromLogRecord(logRec plog.LogRecord) (randomnessNamer, samplingCarrier, error) { + lrc, err := newLogRecordCarrier(logRec) + rnd := newMissingRandomnessMethod() + + if err != nil { + // Parse error in sampling.randomness or sampling.threshold + lrc = nil + } else if rv, hasRnd := lrc.explicitRandomness(); hasRnd { + rnd = rv + } else if tid := logRec.TraceID(); !tid.IsEmpty() { + rnd = newTraceIDW3CSpecMethod(sampling.TraceIDToRandomness(tid)) + } + + return rnd, lrc, err } // newLogsProcessor returns a processor.LogsProcessor that will perform head sampling according to the given // configuration. func newLogsProcessor(ctx context.Context, set processor.Settings, nextConsumer consumer.Logs, cfg *Config) (processor.Logs, error) { lsp := &logsProcessor{ - sampler: makeSampler(cfg), + sampler: makeSampler(cfg, true), samplingPriority: cfg.SamplingPriority, + precision: cfg.SamplingPrecision, failClosed: cfg.FailClosed, logger: set.Logger, } @@ -144,7 +256,7 @@ func (lsp *logsProcessor) logRecordToPriorityThreshold(logRec plog.LogRecord) sa minProb = float64(localPriority.Int()) / 100.0 } if minProb != 0 { - if th, err := sampling.ProbabilityToThresholdWithPrecision(minProb, defaultPrecision); err == nil { + if th, err := sampling.ProbabilityToThresholdWithPrecision(minProb, lsp.precision); err == nil { // The record has supplied a valid alternative sampling probability return th } diff --git a/processor/probabilisticsamplerprocessor/logsprocessor_test.go b/processor/probabilisticsamplerprocessor/logsprocessor_test.go index e0181ca6e853..7cfeb896a230 100644 --- a/processor/probabilisticsamplerprocessor/logsprocessor_test.go +++ b/processor/probabilisticsamplerprocessor/logsprocessor_test.go @@ -18,6 +18,8 @@ import ( "go.opentelemetry.io/collector/processor/processortest" "go.uber.org/zap" "go.uber.org/zap/zaptest/observer" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling" ) func TestNewLogsProcessor(t *testing.T) { @@ -78,6 +80,11 @@ func TestLogsSampling(t *testing.T) { name: "nothing", cfg: &Config{ SamplingPercentage: 0, + + // FailClosed because the test + // includes one empty TraceID which + // would otherwise fail open. 
+ FailClosed: true, }, received: 0, }, @@ -86,6 +93,7 @@ func TestLogsSampling(t *testing.T) { cfg: &Config{ SamplingPercentage: 50, AttributeSource: traceIDAttributeSource, + Mode: HashSeed, FailClosed: true, }, // Note: This count excludes one empty TraceID @@ -119,7 +127,11 @@ func TestLogsSampling(t *testing.T) { SamplingPercentage: 50, AttributeSource: recordAttributeSource, FromAttribute: "foo", - FailClosed: true, + + // FailClosed: true so that we do not + // sample when the attribute is + // missing. + FailClosed: true, }, received: 23, }, @@ -129,7 +141,11 @@ func TestLogsSampling(t *testing.T) { SamplingPercentage: 50, AttributeSource: recordAttributeSource, FromAttribute: "bar", - FailClosed: true, + + // FailClosed: true so that we do not + // sample when the attribute is + // missing. + FailClosed: true, }, received: 29, // probabilistic... doesn't yield the same results as foo }, @@ -191,76 +207,319 @@ func TestLogsSampling(t *testing.T) { } } -func TestLogsMissingRandomness(t *testing.T) { - type test struct { - pct float32 - source AttributeSource - failClosed bool - sampled bool - } +func TestLogsSamplingState(t *testing.T) { + // This hard-coded TraceID will sample at 50% and not at 49%. + // The equivalent randomness is 0x80000000000000. + var defaultTID = mustParseTID("fefefefefefefefefe80000000000000") - for _, tt := range []test{ - {0, recordAttributeSource, true, false}, - {50, recordAttributeSource, true, false}, - {100, recordAttributeSource, true, false}, - - {0, recordAttributeSource, false, false}, - {50, recordAttributeSource, false, true}, - {100, recordAttributeSource, false, true}, - - {0, traceIDAttributeSource, true, false}, - {50, traceIDAttributeSource, true, false}, - {100, traceIDAttributeSource, true, false}, - - {0, traceIDAttributeSource, false, false}, - {50, traceIDAttributeSource, false, true}, - {100, traceIDAttributeSource, false, true}, - } { - t.Run(fmt.Sprint(tt.pct, "_", tt.source, "_", tt.failClosed), func(t *testing.T) { + tests := []struct { + name string + cfg *Config + tid pcommon.TraceID + attrs map[string]any + log string + sampled bool + adjCount float64 + expect map[string]any + }{ + { + name: "100 percent traceID", + cfg: &Config{ + SamplingPercentage: 100, + AttributeSource: traceIDAttributeSource, + Mode: Proportional, + }, + tid: defaultTID, + attrs: map[string]any{ + "ignored": "value", + }, + sampled: true, + adjCount: 1, + expect: map[string]any{ + "sampling.threshold": "0", + "ignored": "value", + }, + }, + { + name: "100 percent traceID hash_seed", + cfg: &Config{ + SamplingPercentage: 100, + AttributeSource: traceIDAttributeSource, + Mode: "hash_seed", + HashSeed: 22, + }, + attrs: map[string]any{ + "K": "V", + }, + tid: defaultTID, + sampled: true, + adjCount: 1, + expect: map[string]any{ + "K": "V", + "sampling.threshold": "0", + "sampling.randomness": randomnessFromBytes(defaultTID[:], 22).RValue(), + }, + }, + { + name: "100 percent attribute", + cfg: &Config{ + SamplingPercentage: 100, + AttributeSource: recordAttributeSource, + FromAttribute: "veryrandom", + HashSeed: 49, + }, + attrs: map[string]any{ + "veryrandom": "1234", + }, + sampled: true, + adjCount: 1, + expect: map[string]any{ + "sampling.threshold": "0", + "sampling.randomness": randomnessFromBytes([]byte("1234"), 49).RValue(), + "veryrandom": "1234", + }, + }, + { + name: "0 percent traceID", + cfg: &Config{ + SamplingPercentage: 0, + AttributeSource: traceIDAttributeSource, + }, + tid: defaultTID, + sampled: false, + }, + { + name: "10 percent priority 
sampled incoming randomness", + cfg: &Config{ + SamplingPercentage: 0, + AttributeSource: traceIDAttributeSource, + SamplingPriority: "veryrandom", + SamplingPrecision: 6, + }, + tid: defaultTID, + attrs: map[string]any{ + "sampling.randomness": "e6147c00000000", + "veryrandom": 10.125, + }, + sampled: true, + adjCount: 9.876654321, + expect: map[string]any{ + "sampling.randomness": "e6147c00000000", + "sampling.threshold": "e6147b", + "veryrandom": 10.125, + }, + }, + { + name: "25 percent incoming", + cfg: &Config{ + SamplingPercentage: 50, + AttributeSource: traceIDAttributeSource, + Mode: Proportional, + }, + tid: mustParseTID("fefefefefefefefefef0000000000000"), + attrs: map[string]any{ + "sampling.threshold": "c", + }, + sampled: true, + adjCount: 8, + expect: map[string]any{ + "sampling.threshold": "e", + }, + }, + { + name: "25 percent arriving inconsistent", + cfg: &Config{ + SamplingPercentage: 50, + AttributeSource: traceIDAttributeSource, + Mode: Equalizing, + FailClosed: true, + }, + tid: mustParseTID("fefefefefefefefefeb0000000000000"), + attrs: map[string]any{ + // "c" is an invalid threshold for the TraceID + // i.e., T <= R is false, should be rejected. + "sampling.threshold": "c", // Corresponds with 25% + }, + log: "inconsistent arriving threshold", + sampled: false, + }, + { + name: "25 percent arriving equalizing", + cfg: &Config{ + SamplingPercentage: 50, + AttributeSource: traceIDAttributeSource, + Mode: Equalizing, + SamplingPriority: "prio", + }, + tid: mustParseTID("fefefefefefefefefefefefefefefefe"), + attrs: map[string]any{ + "sampling.threshold": "c", // Corresponds with 25% + "prio": 37, // Lower than 50, higher than 25 + }, + sampled: true, + adjCount: 4, + expect: map[string]any{ + "sampling.threshold": "c", + "prio": int64(37), + }, + log: "cannot raise existing sampling probability", + }, + { + name: "hash_seed with spec randomness", + cfg: &Config{ + SamplingPercentage: 100, + AttributeSource: traceIDAttributeSource, + Mode: HashSeed, + }, + tid: defaultTID, + attrs: map[string]any{ + "sampling.randomness": "f2341234123412", + }, + sampled: true, + adjCount: 0, // No threshold + log: "item has sampling randomness", + expect: map[string]any{ + "sampling.randomness": "f2341234123412", + }, + }, + } + for _, tt := range tests { + t.Run(fmt.Sprint(tt.name), func(t *testing.T) { - ctx := context.Background() - logs := plog.NewLogs() - record := logs.ResourceLogs().AppendEmpty().ScopeLogs().AppendEmpty().LogRecords().AppendEmpty() - record.SetTraceID(pcommon.TraceID{}) // invalid TraceID - - cfg := &Config{ - SamplingPercentage: tt.pct, - HashSeed: defaultHashSeed, - FailClosed: tt.failClosed, - AttributeSource: tt.source, - FromAttribute: "unused", + sink := new(consumertest.LogsSink) + cfg := &Config{} + if tt.cfg != nil { + *cfg = *tt.cfg } - sink := new(consumertest.LogsSink) set := processortest.NewNopSettings() - // Note: there is a debug-level log we are expecting when FailClosed - // causes a drop. 
logger, observed := observer.New(zap.DebugLevel) set.Logger = zap.New(logger) - lp, err := newLogsProcessor(ctx, set, sink, cfg) + tsp, err := newLogsProcessor(context.Background(), set, sink, cfg) require.NoError(t, err) - err = lp.ConsumeLogs(ctx, logs) + logs := plog.NewLogs() + lr := logs.ResourceLogs().AppendEmpty().ScopeLogs().AppendEmpty().LogRecords() + record := lr.AppendEmpty() + record.SetTimestamp(pcommon.Timestamp(time.Unix(1649400860, 0).Unix())) + record.SetSeverityNumber(plog.SeverityNumberDebug) + record.SetTraceID(tt.tid) + require.NoError(t, record.Attributes().FromRaw(tt.attrs)) + + err = tsp.ConsumeLogs(context.Background(), logs) require.NoError(t, err) + if len(tt.log) == 0 { + require.Equal(t, 0, len(observed.All()), "should not have logs: %v", observed.All()) + require.Equal(t, "", tt.log) + } else { + require.Equal(t, 1, len(observed.All()), "should have one log: %v", observed.All()) + require.Contains(t, observed.All()[0].Message, "logs sampler") + require.Contains(t, observed.All()[0].Context[0].Interface.(error).Error(), tt.log) + } + sampledData := sink.AllLogs() + if tt.sampled { require.Equal(t, 1, len(sampledData)) assert.Equal(t, 1, sink.LogRecordCount()) - } else { - require.Equal(t, 0, len(sampledData)) - assert.Equal(t, 0, sink.LogRecordCount()) - } - - if tt.pct != 0 { - // pct==0 bypasses the randomness check - require.Equal(t, 1, len(observed.All()), "should have one log: %v", observed.All()) - require.Contains(t, observed.All()[0].Message, "logs sampler") - require.Contains(t, observed.All()[0].Context[0].Interface.(error).Error(), "missing randomness") - } else { - require.Equal(t, 0, len(observed.All()), "should have no logs: %v", observed.All()) + got := sink.AllLogs()[0].ResourceLogs().At(0).ScopeLogs().At(0).LogRecords().At(0) + gotAttrs := got.Attributes() + require.Equal(t, tt.expect, gotAttrs.AsRaw()) + thVal, hasTh := gotAttrs.Get("sampling.threshold") + if tt.adjCount == 0 { + require.False(t, hasTh) + } else { + th, err := sampling.TValueToThreshold(thVal.Str()) + require.NoError(t, err) + if cfg.SamplingPrecision == 0 { + assert.InEpsilon(t, tt.adjCount, th.AdjustedCount(), 1e-9, + "compare %v %v", tt.adjCount, th.AdjustedCount()) + } else { + assert.InEpsilon(t, tt.adjCount, th.AdjustedCount(), 1e-3, + "compare %v %v", tt.adjCount, th.AdjustedCount()) + } + } } }) } } + +func TestLogsMissingRandomness(t *testing.T) { + type test struct { + pct float32 + source AttributeSource + failClosed bool + sampled bool + } + + for _, mode := range AllModes { + for _, tt := range []test{ + {0, recordAttributeSource, true, false}, + {50, recordAttributeSource, true, false}, + {100, recordAttributeSource, true, false}, + + {0, recordAttributeSource, false, false}, + {50, recordAttributeSource, false, true}, + {100, recordAttributeSource, false, true}, + + {0, traceIDAttributeSource, true, false}, + {50, traceIDAttributeSource, true, false}, + {100, traceIDAttributeSource, true, false}, + + {0, traceIDAttributeSource, false, false}, + {50, traceIDAttributeSource, false, true}, + {100, traceIDAttributeSource, false, true}, + } { + t.Run(fmt.Sprint(tt.pct, "_", tt.source, "_", tt.failClosed, "_", mode), func(t *testing.T) { + + ctx := context.Background() + logs := plog.NewLogs() + record := logs.ResourceLogs().AppendEmpty().ScopeLogs().AppendEmpty().LogRecords().AppendEmpty() + record.SetTraceID(pcommon.TraceID{}) // invalid TraceID + record.Attributes().PutStr("unused", "") + + cfg := &Config{ + SamplingPercentage: tt.pct, + Mode: mode, + HashSeed: 
defaultHashSeed, + FailClosed: tt.failClosed, + AttributeSource: tt.source, + FromAttribute: "unused", + } + + sink := new(consumertest.LogsSink) + set := processortest.NewNopSettings() + // Note: there is a debug-level log we are expecting when FailClosed + // causes a drop. + logger, observed := observer.New(zap.DebugLevel) + set.Logger = zap.New(logger) + + lp, err := newLogsProcessor(ctx, set, sink, cfg) + require.NoError(t, err) + + err = lp.ConsumeLogs(ctx, logs) + require.NoError(t, err) + + sampledData := sink.AllLogs() + if tt.sampled { + require.Equal(t, 1, len(sampledData)) + assert.Equal(t, 1, sink.LogRecordCount()) + } else { + require.Equal(t, 0, len(sampledData)) + assert.Equal(t, 0, sink.LogRecordCount()) + } + + if tt.pct != 0 { + // pct==0 bypasses the randomness check + require.Equal(t, 1, len(observed.All()), "should have one log: %v", observed.All()) + require.Contains(t, observed.All()[0].Message, "logs sampler") + require.Contains(t, observed.All()[0].Context[0].Interface.(error).Error(), "missing randomness") + } else { + require.Equal(t, 0, len(observed.All()), "should have no logs: %v", observed.All()) + } + }) + } + } +} diff --git a/processor/probabilisticsamplerprocessor/sampler_mode.go b/processor/probabilisticsamplerprocessor/sampler_mode.go index 6bf09caa271f..377f717bed09 100644 --- a/processor/probabilisticsamplerprocessor/sampler_mode.go +++ b/processor/probabilisticsamplerprocessor/sampler_mode.go @@ -19,6 +19,16 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling" ) +const ( + // These four can happen at runtime and be returned by + // randomnessFromXXX() + + ErrInconsistentArrivingTValue samplerError = "inconsistent arriving threshold: item should not have been sampled" + ErrMissingRandomness samplerError = "missing randomness" + ErrRandomnessInUse samplerError = "item has sampling randomness, equalizing or proportional mode recommended" + ErrThresholdInUse samplerError = "item has sampling threshold, equalizing or proportional mode recommended" +) + const ( // Hashing method: The constants below help translate user friendly percentages // to numbers direct used in sampling. @@ -28,22 +38,40 @@ const ( percentageScaleFactor = numHashBuckets / 100.0 ) -// SamplerMode controls the logic used in making a sampling decision. -// The HashSeed mode is the only mode, presently, and it is also the -// default mode. -// -// TODO: In the future, when OTEP 235 is introduced, there will be two -// new modes. +// samplerErrors are conditions reported by the sampler that are somewhat +// ordinary and should log as info-level. +type samplerError string + +var _ error = samplerError("") + +func (s samplerError) Error() string { + return string(s) +} + +// SamplerMode determines which of several modes is used for the +// sampling decision. type SamplerMode string const ( - HashSeed SamplerMode = "hash_seed" - DefaultMode SamplerMode = HashSeed - modeUnset SamplerMode = "" -) + // HashSeed applies the hash/fnv hash function originally used in this component. + HashSeed SamplerMode = "hash_seed" + + // Equalizing uses OpenTelemetry consistent probability + // sampling information (OTEP 235), applies an absolute + // threshold to equalize incoming sampling probabilities. + Equalizing SamplerMode = "equalizing" + + // Proportional uses OpenTelemetry consistent probability + // sampling information (OTEP 235), multiplies incoming + // sampling probaiblities. 
+ Proportional SamplerMode = "proportional" -// ErrMissingRandomness indicates no randomness source was found. -var ErrMissingRandomness = errors.New("missing randomness") + // defaultHashSeed is applied when the mode is unset. + defaultMode SamplerMode = HashSeed + + // modeUnset indicates the user has not configured the mode. + modeUnset SamplerMode = "" +) type randomnessNamer interface { randomness() sampling.Randomness @@ -57,6 +85,8 @@ func (rm randomnessMethod) randomness() sampling.Randomness { } type traceIDHashingMethod struct{ randomnessMethod } +type traceIDW3CSpecMethod struct{ randomnessMethod } +type samplingRandomnessMethod struct{ randomnessMethod } type samplingPriorityMethod struct{ randomnessMethod } type missingRandomnessMethod struct{} @@ -82,12 +112,22 @@ func (traceIDHashingMethod) policyName() string { return "trace_id_hash" } +func (samplingRandomnessMethod) policyName() string { + return "sampling_randomness" +} + +func (traceIDW3CSpecMethod) policyName() string { + return "trace_id_w3c" +} + func (samplingPriorityMethod) policyName() string { return "sampling_priority" } var _ randomnessNamer = missingRandomnessMethod{} var _ randomnessNamer = traceIDHashingMethod{} +var _ randomnessNamer = traceIDW3CSpecMethod{} +var _ randomnessNamer = samplingRandomnessMethod{} var _ randomnessNamer = samplingPriorityMethod{} func newMissingRandomnessMethod() randomnessNamer { @@ -99,6 +139,14 @@ func isMissing(rnd randomnessNamer) bool { return ok } +func newSamplingRandomnessMethod(rnd sampling.Randomness) randomnessNamer { + return samplingRandomnessMethod{randomnessMethod(rnd)} +} + +func newTraceIDW3CSpecMethod(rnd sampling.Randomness) randomnessNamer { + return traceIDW3CSpecMethod{randomnessMethod(rnd)} +} + func newTraceIDHashingMethod(rnd sampling.Randomness) randomnessNamer { return traceIDHashingMethod{randomnessMethod(rnd)} } @@ -114,10 +162,41 @@ func newAttributeHashingMethod(attribute string, rnd sampling.Randomness) random } } -// TODO: Placeholder interface, see #31894 for its future contents, -// will become a non-empty interface. (Linter forces us to write "any".) -type samplingCarrier any +// samplingCarrier conveys information about the underlying data item +// (whether span or log record) through the sampling decision. +type samplingCarrier interface { + // explicitRandomness returns a randomness value and a boolean + // indicating whether the item had sampling randomness + // explicitly set. + explicitRandomness() (randomnessNamer, bool) + + // setExplicitRandomness updates the item with the signal-specific + // encoding for an explicit randomness value. + setExplicitRandomness(randomnessNamer) + + // clearThreshold unsets a sampling threshold, which is used to + // clear information that breaks the expected sampling invariants + // described in OTEP 235. + clearThreshold() + + // threshold returns a sampling threshold and a boolean + // indicating whether the item had sampling threshold + // explicitly set. + threshold() (sampling.Threshold, bool) + + // updateThreshold modifies the sampling threshold. This + // returns an error if the updated sampling threshold has a + // lower adjusted account; the only permissible updates raise + // adjusted count (i.e., reduce sampling probability). + updateThreshold(sampling.Threshold) error + + // reserialize re-encodes the updated sampling information + // into the item, if necessary. For Spans, this re-encodes + // the tracestate. This is a no-op for logs records. 
+ reserialize() error +} +// dataSampler implements the logic of a sampling mode. type dataSampler interface { // decide reports the result based on a probabilistic decision. decide(carrier samplingCarrier) sampling.Threshold @@ -129,11 +208,11 @@ type dataSampler interface { randomnessFromLogRecord(s plog.LogRecord) (randomness randomnessNamer, carrier samplingCarrier, err error) } -var AllModes = []SamplerMode{HashSeed} - func (sm *SamplerMode) UnmarshalText(in []byte) error { switch mode := SamplerMode(in); mode { case HashSeed, + Equalizing, + Proportional, modeUnset: *sm = mode return nil @@ -161,6 +240,12 @@ func (th *hashingSampler) decide(_ samplingCarrier) sampling.Threshold { return th.tvalueThreshold } +// consistentTracestateCommon contains the common aspects of the +// Proportional and Equalizing sampler modes. These samplers sample +// using the TraceID and do not support use of logs source attribute. +type consistentTracestateCommon struct { +} + // neverSampler always decides false. type neverSampler struct { } @@ -169,6 +254,52 @@ func (*neverSampler) decide(_ samplingCarrier) sampling.Threshold { return sampling.NeverSampleThreshold } +// equalizingSampler raises thresholds up to a fixed value. +type equalizingSampler struct { + // TraceID-randomness-based calculation + tvalueThreshold sampling.Threshold + + consistentTracestateCommon +} + +func (te *equalizingSampler) decide(carrier samplingCarrier) sampling.Threshold { + if tv, has := carrier.threshold(); has && sampling.ThresholdLessThan(te.tvalueThreshold, tv) { + return tv + } + return te.tvalueThreshold +} + +// proportionalSampler raises thresholds relative to incoming value. +type proportionalSampler struct { + // ratio in the range [2**-56, 1] + ratio float64 + + // precision is the precision in number of hex digits + precision int + + consistentTracestateCommon +} + +func (tp *proportionalSampler) decide(carrier samplingCarrier) sampling.Threshold { + incoming := 1.0 + if tv, has := carrier.threshold(); has { + incoming = tv.Probability() + } + + // There is a potential here for the product probability to + // underflow, which is checked here. + threshold, err := sampling.ProbabilityToThresholdWithPrecision(incoming*tp.ratio, tp.precision) + + // Check the only known error condition. + if errors.Is(err, sampling.ErrProbabilityRange) { + // Considered valid, a case where the sampling probability + // has fallen below the minimum supported value and simply + // becomes unsampled. + return sampling.NeverSampleThreshold + } + return threshold +} + func getBytesFromValue(value pcommon.Value) []byte { if value.Type() == pcommon.ValueTypeBytes { return value.Bytes().AsRaw() @@ -214,13 +345,28 @@ func randomnessFromBytes(b []byte, hashSeed uint32) sampling.Randomness { return rnd } -// consistencyCheck checks for certain inconsistent inputs. -// -// if the randomness is missing, returns ErrMissingRandomness. -func consistencyCheck(rnd randomnessNamer, _ samplingCarrier) error { +func consistencyCheck(rnd randomnessNamer, carrier samplingCarrier) error { + // Without randomness, do not check the threshold. if isMissing(rnd) { return ErrMissingRandomness } + // When the carrier is nil, it means there was trouble parsing the + // tracestate or trace-related attributes. In this case, skip the + // consistency check. + if carrier == nil { + return nil + } + // Consistency check: if the TraceID is out of range, the + // TValue is a lie. If inconsistent, clear it and return an error. 
+ if tv, has := carrier.threshold(); has { + if !tv.ShouldSample(rnd.randomness()) { + // In case we fail open, the threshold is cleared as + // recommended in the OTel spec. + carrier.clearThreshold() + return ErrInconsistentArrivingTValue + } + } + return nil } @@ -230,46 +376,82 @@ func consistencyCheck(rnd randomnessNamer, _ samplingCarrier) error { // // Extending this logic, we round very small probabilities up to the // minimum supported value(s) which varies according to sampler mode. -func makeSampler(cfg *Config) dataSampler { +func makeSampler(cfg *Config, isLogs bool) dataSampler { // README allows percents >100 to equal 100%. pct := cfg.SamplingPercentage if pct > 100 { pct = 100 } - - never := &neverSampler{} + mode := cfg.Mode + if mode == modeUnset { + // Reasons to choose the legacy behavior include: + // (a) having set the hash seed + // (b) logs signal w/o trace ID source + if cfg.HashSeed != 0 || (isLogs && cfg.AttributeSource != traceIDAttributeSource) { + mode = HashSeed + } else { + mode = defaultMode + } + } if pct == 0 { - return never + return &neverSampler{} + } + // Note: Convert to float64 before dividing by 100, otherwise loss of precision. + // If the probability is too small, round it up to the minimum. + ratio := float64(pct) / 100 + // Like the pct > 100 test above, but for values too small to + // express in 14 bits of precision. + if ratio < sampling.MinSamplingProbability { + ratio = sampling.MinSamplingProbability } - // Note: the original hash function used in this code - // is preserved to ensure consistency across updates. - // - // uint32(pct * percentageScaleFactor) - // - // (a) carried out the multiplication in 32-bit precision - // (b) rounded to zero instead of nearest. - scaledSampleRate := uint32(pct * percentageScaleFactor) + switch mode { + case Equalizing: + // The error case below is ignored, we have rounded the probability so + // that it is in-range + threshold, _ := sampling.ProbabilityToThresholdWithPrecision(ratio, cfg.SamplingPrecision) - if scaledSampleRate == 0 { - return never - } + return &equalizingSampler{ + tvalueThreshold: threshold, + } + + case Proportional: + return &proportionalSampler{ + ratio: ratio, + precision: cfg.SamplingPrecision, + } + + default: // i.e., HashSeed + + // Note: the original hash function used in this code + // is preserved to ensure consistency across updates. + // + // uint32(pct * percentageScaleFactor) + // + // (a) carried out the multiplication in 32-bit precision + // (b) rounded to zero instead of nearest. + scaledSamplerate := uint32(pct * percentageScaleFactor) + + if scaledSamplerate == 0 { + return &neverSampler{} + } - // Convert the accept threshold to a reject threshold, - // then shift it into 56-bit value. - reject := numHashBuckets - scaledSampleRate - reject56 := uint64(reject) << 42 + // Convert the accept threshold to a reject threshold, + // then shift it into 56-bit value. 
+ reject := numHashBuckets - scaledSamplerate + reject56 := uint64(reject) << 42 - threshold, _ := sampling.UnsignedToThreshold(reject56) + threshold, _ := sampling.UnsignedToThreshold(reject56) - return &hashingSampler{ - tvalueThreshold: threshold, - hashSeed: cfg.HashSeed, + return &hashingSampler{ + tvalueThreshold: threshold, + hashSeed: cfg.HashSeed, - // Logs specific: - logsTraceIDEnabled: cfg.AttributeSource == traceIDAttributeSource, - logsRandomnessSourceAttribute: cfg.FromAttribute, + // Logs specific: + logsTraceIDEnabled: cfg.AttributeSource == traceIDAttributeSource, + logsRandomnessSourceAttribute: cfg.FromAttribute, + } } } @@ -279,7 +461,7 @@ type randFunc[T any] func(T) (randomnessNamer, samplingCarrier, error) // priorityFunc makes changes resulting from sampling priority. type priorityFunc[T any] func(T, randomnessNamer, sampling.Threshold) (randomnessNamer, sampling.Threshold) -// commonSamplingLogic implements sampling on a per-item basis +// commonShouldSampleLogic implements sampling on a per-item basis // independent of the signal type, as embodied in the functional // parameters: func commonShouldSampleLogic[T any]( @@ -293,12 +475,18 @@ func commonShouldSampleLogic[T any]( logger *zap.Logger, ) bool { rnd, carrier, err := randFunc(item) + if err == nil { err = consistencyCheck(rnd, carrier) } var threshold sampling.Threshold if err != nil { - logger.Debug(description, zap.Error(err)) + var se samplerError + if errors.As(err, &se) { + logger.Debug(description, zap.Error(err)) + } else { + logger.Info(description, zap.Error(err)) + } if failClosed { threshold = sampling.NeverSampleThreshold } else { @@ -312,6 +500,24 @@ func commonShouldSampleLogic[T any]( sampled := threshold.ShouldSample(rnd.randomness()) + if sampled && carrier != nil { + // Note: updateThreshold limits loss of adjusted count, by + // preventing the threshold from being lowered, only allowing + // probability to fall and never to rise. + if err := carrier.updateThreshold(threshold); err != nil { + if errors.Is(err, sampling.ErrInconsistentSampling) { + // This is working-as-intended. You can't lower + // the threshold, it's illogical. + logger.Debug(description, zap.Error(err)) + } else { + logger.Info(description, zap.Error(err)) + } + } + if err := carrier.reserialize(); err != nil { + logger.Info(description, zap.Error(err)) + } + } + _ = stats.RecordWithTags( ctx, []tag.Mutator{tag.Upsert(tagPolicyKey, rnd.policyName()), tag.Upsert(tagSampledKey, strconv.FormatBool(sampled))}, diff --git a/processor/probabilisticsamplerprocessor/sampler_mode_test.go b/processor/probabilisticsamplerprocessor/sampler_mode_test.go index 170da3ed6d44..d0a2aef2a472 100644 --- a/processor/probabilisticsamplerprocessor/sampler_mode_test.go +++ b/processor/probabilisticsamplerprocessor/sampler_mode_test.go @@ -4,12 +4,15 @@ package probabilisticsamplerprocessor import ( + "math" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) +var AllModes = []SamplerMode{HashSeed, Equalizing, Proportional} + func TestUnmarshalText(t *testing.T) { tests := []struct { samplerMode string @@ -18,6 +21,12 @@ func TestUnmarshalText(t *testing.T) { { samplerMode: "hash_seed", }, + { + samplerMode: "equalizing", + }, + { + samplerMode: "proportional", + }, { samplerMode: "", }, @@ -39,3 +48,25 @@ func TestUnmarshalText(t *testing.T) { }) } } + +func TestHashSeedRoundingDown(t *testing.T) { + // The original hash function rounded thresholds down, in the + // direction of zero. 
+ + // pct is approximately 75% of the minimum 14-bit probability, so it + // would round up, but it does not. + const pct = 0x3p-16 * 100 + + require.Equal(t, 1.0, math.Round((pct/100)*numHashBuckets)) + + for _, isLogs := range []bool{false, true} { + cfg := Config{ + Mode: HashSeed, + SamplingPercentage: pct, + HashSeed: defaultHashSeed, + } + + _, ok := makeSampler(&cfg, isLogs).(*neverSampler) + require.True(t, ok, "is neverSampler") + } +} diff --git a/processor/probabilisticsamplerprocessor/testdata/config.yaml b/processor/probabilisticsamplerprocessor/testdata/config.yaml index 0e853f77cbe3..6adc453015a3 100644 --- a/processor/probabilisticsamplerprocessor/testdata/config.yaml +++ b/processor/probabilisticsamplerprocessor/testdata/config.yaml @@ -11,6 +11,8 @@ processors: # zero, i.e.: no sample. Values greater or equal 100 are treated as # "sample all traces". sampling_percentage: 15.3 + # mode determines the type of sampling logic applied, see the README for details. + mode: "proportional" probabilistic_sampler/logs: # the percentage rate at which logs are going to be sampled. Defaults to diff --git a/processor/probabilisticsamplerprocessor/testdata/invalid_inf.yaml b/processor/probabilisticsamplerprocessor/testdata/invalid_inf.yaml new file mode 100644 index 000000000000..4ff2ab115142 --- /dev/null +++ b/processor/probabilisticsamplerprocessor/testdata/invalid_inf.yaml @@ -0,0 +1,17 @@ +receivers: + nop: + +processors: + + probabilistic_sampler/traces: + sampling_percentage: +Inf + +exporters: + nop: + +service: + pipelines: + traces: + receivers: [ nop ] + processors: [ probabilistic_sampler/traces ] + exporters: [ nop ] diff --git a/processor/probabilisticsamplerprocessor/testdata/invalid_prec.yaml b/processor/probabilisticsamplerprocessor/testdata/invalid_prec.yaml new file mode 100644 index 000000000000..96d93b6eddc1 --- /dev/null +++ b/processor/probabilisticsamplerprocessor/testdata/invalid_prec.yaml @@ -0,0 +1,18 @@ +receivers: + nop: + +processors: + + probabilistic_sampler/traces: + sampling_percentage: 50 + sampling_precision: 15 + +exporters: + nop: + +service: + pipelines: + traces: + receivers: [ nop ] + processors: [ probabilistic_sampler/traces ] + exporters: [ nop ] diff --git a/processor/probabilisticsamplerprocessor/testdata/invalid_small.yaml b/processor/probabilisticsamplerprocessor/testdata/invalid_small.yaml new file mode 100644 index 000000000000..1f8bdc271f6c --- /dev/null +++ b/processor/probabilisticsamplerprocessor/testdata/invalid_small.yaml @@ -0,0 +1,18 @@ +receivers: + nop: + +processors: + + probabilistic_sampler/traces: + # This is smaller than 2**-56 + sampling_percentage: .000000000000001 + +exporters: + nop: + +service: + pipelines: + traces: + receivers: [ nop ] + processors: [ probabilistic_sampler/traces ] + exporters: [ nop ] diff --git a/processor/probabilisticsamplerprocessor/testdata/invalid_zero.yaml b/processor/probabilisticsamplerprocessor/testdata/invalid_zero.yaml new file mode 100644 index 000000000000..2b80e340b64b --- /dev/null +++ b/processor/probabilisticsamplerprocessor/testdata/invalid_zero.yaml @@ -0,0 +1,18 @@ +receivers: + nop: + +processors: + + probabilistic_sampler/traces: + sampling_percentage: 15.3 + sampling_precision: 0 + +exporters: + nop: + +service: + pipelines: + traces: + receivers: [ nop ] + processors: [ probabilistic_sampler/traces ] + exporters: [ nop ] diff --git a/processor/probabilisticsamplerprocessor/tracesprocessor.go b/processor/probabilisticsamplerprocessor/tracesprocessor.go index 
197e289e9e5d..5a81215ac500 100644 --- a/processor/probabilisticsamplerprocessor/tracesprocessor.go +++ b/processor/probabilisticsamplerprocessor/tracesprocessor.go @@ -6,6 +6,7 @@ package probabilisticsamplerprocessor // import "github.com/open-telemetry/opent import ( "context" "strconv" + "strings" "go.opentelemetry.io/collector/consumer" "go.opentelemetry.io/collector/pdata/pcommon" @@ -47,14 +48,51 @@ type traceProcessor struct { // decide. type tracestateCarrier struct { span ptrace.Span + sampling.W3CTraceState } var _ samplingCarrier = &tracestateCarrier{} -func newTracestateCarrier(s ptrace.Span) samplingCarrier { - return &tracestateCarrier{ +func newTracestateCarrier(s ptrace.Span) (samplingCarrier, error) { + var err error + tsc := &tracestateCarrier{ span: s, } + tsc.W3CTraceState, err = sampling.NewW3CTraceState(s.TraceState().AsRaw()) + return tsc, err +} + +func (tc *tracestateCarrier) threshold() (sampling.Threshold, bool) { + return tc.W3CTraceState.OTelValue().TValueThreshold() +} + +func (tc *tracestateCarrier) explicitRandomness() (randomnessNamer, bool) { + rnd, ok := tc.W3CTraceState.OTelValue().RValueRandomness() + if !ok { + return newMissingRandomnessMethod(), false + } + return newSamplingRandomnessMethod(rnd), true +} + +func (tc *tracestateCarrier) updateThreshold(th sampling.Threshold) error { + return tc.W3CTraceState.OTelValue().UpdateTValueWithSampling(th) +} + +func (tc *tracestateCarrier) setExplicitRandomness(rnd randomnessNamer) { + tc.W3CTraceState.OTelValue().SetRValue(rnd.randomness()) +} + +func (tc *tracestateCarrier) clearThreshold() { + tc.W3CTraceState.OTelValue().ClearTValue() +} + +func (tc *tracestateCarrier) reserialize() error { + var w strings.Builder + err := tc.W3CTraceState.Serialize(&w) + if err == nil { + tc.span.TraceState().FromRaw(w.String()) + } + return err } // newTracesProcessor returns a processor.TracesProcessor that will @@ -62,7 +100,7 @@ func newTracestateCarrier(s ptrace.Span) samplingCarrier { // configuration. func newTracesProcessor(ctx context.Context, set processor.Settings, cfg *Config, nextConsumer consumer.Traces) (processor.Traces, error) { tp := &traceProcessor{ - sampler: makeSampler(cfg), + sampler: makeSampler(cfg, false), failClosed: cfg.FailClosed, logger: set.Logger, } @@ -75,21 +113,56 @@ func newTracesProcessor(ctx context.Context, set processor.Settings, cfg *Config processorhelper.WithCapabilities(consumer.Capabilities{MutatesData: true})) } -func (th *neverSampler) randomnessFromSpan(_ ptrace.Span) (randomnessNamer, samplingCarrier, error) { - // We return a fake randomness value, since it will not be used. - // This avoids a consistency check error for missing randomness. - return newSamplingPriorityMethod(sampling.AllProbabilitiesRandomness), nil, nil -} - func (th *hashingSampler) randomnessFromSpan(s ptrace.Span) (randomnessNamer, samplingCarrier, error) { tid := s.TraceID() - tsc := newTracestateCarrier(s) + tsc, err := newTracestateCarrier(s) rnd := newMissingRandomnessMethod() if !tid.IsEmpty() { rnd = newTraceIDHashingMethod(randomnessFromBytes(tid[:], th.hashSeed)) } - return rnd, tsc, nil + + // If the tracestate contains a proper R-value or T-value, we + // have to leave it alone. The user should not be using this + // sampler mode if they are using specified forms of consistent + // sampling in OTel. 
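+	// When either value is present, the carrier is dropped (set to
+	// nil) and ErrRandomnessInUse or ErrThresholdInUse is returned,
+	// so the incoming tracestate is left unmodified.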
+ if err != nil { + return rnd, nil, err + } else if _, has := tsc.explicitRandomness(); has { + err = ErrRandomnessInUse + tsc = nil + } else if _, has := tsc.threshold(); has { + err = ErrThresholdInUse + tsc = nil + } else { + // When no sampling information is present, add a + // Randomness value. + tsc.setExplicitRandomness(rnd) + } + return rnd, tsc, err } + +func (ctc *consistentTracestateCommon) randomnessFromSpan(s ptrace.Span) (randomnessNamer, samplingCarrier, error) { + rnd := newMissingRandomnessMethod() + tsc, err := newTracestateCarrier(s) + if err != nil { + tsc = nil + } else if rv, has := tsc.explicitRandomness(); has { + // When the tracestate is OK and has r-value, use it. + rnd = rv + } else if !s.TraceID().IsEmpty() { + rnd = newTraceIDW3CSpecMethod(sampling.TraceIDToRandomness(s.TraceID())) + } + + return rnd, tsc, err +} + +func (th *neverSampler) randomnessFromSpan(span ptrace.Span) (randomnessNamer, samplingCarrier, error) { + // We return a fake randomness value, since it will not be used. + // This avoids a consistency check error for missing randomness. + tsc, err := newTracestateCarrier(span) + return newSamplingPriorityMethod(sampling.AllProbabilitiesRandomness), tsc, err +} + func (tp *traceProcessor) processTraces(ctx context.Context, td ptrace.Traces) (ptrace.Traces, error) { td.ResourceSpans().RemoveIf(func(rs ptrace.ResourceSpans) bool { rs.ScopeSpans().RemoveIf(func(ils ptrace.ScopeSpans) bool { diff --git a/processor/probabilisticsamplerprocessor/tracesprocessor_test.go b/processor/probabilisticsamplerprocessor/tracesprocessor_test.go index d46b13035ae5..608296e94e4c 100644 --- a/processor/probabilisticsamplerprocessor/tracesprocessor_test.go +++ b/processor/probabilisticsamplerprocessor/tracesprocessor_test.go @@ -23,6 +23,7 @@ import ( "go.uber.org/zap/zaptest/observer" "github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/idutils" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling" ) // defaultHashSeed is used throughout to ensure that the HashSeed is real @@ -105,16 +106,16 @@ func Test_tracesamplerprocessor_SamplingPercentageRange(t *testing.T) { }, numBatches: 1e5, numTracesPerBatch: 2, - acceptableDelta: 0.01, + acceptableDelta: 0.02, }, { name: "random_sampling_small", cfg: &Config{ SamplingPercentage: 5, }, - numBatches: 1e5, + numBatches: 1e6, numTracesPerBatch: 2, - acceptableDelta: 0.01, + acceptableDelta: 0.1, }, { name: "random_sampling_medium", @@ -123,7 +124,7 @@ func Test_tracesamplerprocessor_SamplingPercentageRange(t *testing.T) { }, numBatches: 1e5, numTracesPerBatch: 4, - acceptableDelta: 0.1, + acceptableDelta: 0.2, }, { name: "random_sampling_high", @@ -148,6 +149,7 @@ func Test_tracesamplerprocessor_SamplingPercentageRange(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { sink := newAssertTraces(t, testSvcName) + tsp, err := newTracesProcessor(context.Background(), processortest.NewNopSettings(), tt.cfg, sink) if err != nil { t.Errorf("error when creating traceSamplerProcessor: %v", err) @@ -383,28 +385,35 @@ func Test_tracesamplerprocessor_SpanSamplingPriority(t *testing.T) { sampled: true, }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { + for _, mode := range AllModes { + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { - sink := new(consumertest.TracesSink) - cfg := *tt.cfg + sink := new(consumertest.TracesSink) - cfg.HashSeed = defaultHashSeed - tsp, err := newTracesProcessor(context.Background(), 
processortest.NewNopSettings(), &cfg, sink) - require.NoError(t, err) + cfg := &Config{} + if tt.cfg != nil { + *cfg = *tt.cfg + } + cfg.Mode = mode + cfg.HashSeed = defaultHashSeed - err = tsp.ConsumeTraces(context.Background(), tt.td) - require.NoError(t, err) + tsp, err := newTracesProcessor(context.Background(), processortest.NewNopSettings(), cfg, sink) + require.NoError(t, err) - sampledData := sink.AllTraces() - if tt.sampled { - require.Equal(t, 1, len(sampledData)) - assert.Equal(t, 1, sink.SpanCount()) - } else { - require.Equal(t, 0, len(sampledData)) - assert.Equal(t, 0, sink.SpanCount()) - } - }) + err = tsp.ConsumeTraces(context.Background(), tt.td) + require.NoError(t, err) + + sampledData := sink.AllTraces() + if tt.sampled { + require.Equal(t, 1, len(sampledData)) + assert.Equal(t, 1, sink.SpanCount()) + } else { + require.Equal(t, 0, len(sampledData)) + assert.Equal(t, 0, sink.SpanCount()) + } + }) + } } } @@ -489,6 +498,632 @@ func Test_parseSpanSamplingPriority(t *testing.T) { } } +// Test_tracesamplerprocessor_TraceState checks if handling of the context +// tracestate is correct with a number o cases that exercise the two +// consistent sampling modes. +func Test_tracesamplerprocessor_TraceState(t *testing.T) { + // This hard-coded TraceID will sample at 50% and not at 49%. + // The equivalent randomness is 0x80000000000000. + var defaultTID = mustParseTID("fefefefefefefefefe80000000000000") + + // improbableTraceID will sample at all supported probabilities. In + // hex, the leading 18 digits do not matter, the trailing 14 are all `f`. + var improbableTraceID = mustParseTID("111111111111111111ffffffffffffff") + + sid := idutils.UInt64ToSpanID(0xfefefefe) + tests := []struct { + name string + tid pcommon.TraceID + cfg *Config + ts string + key string + value pcommon.Value + log string + sf func(SamplerMode) (sampled bool, adjCount float64, tracestate string) + }{ + { + name: "100 percent", + cfg: &Config{ + SamplingPercentage: 100, + }, + ts: "", + sf: func(SamplerMode) (bool, float64, string) { + return true, 1, "ot=th:0" + }, + }, + { + name: "50 percent sampled", + cfg: &Config{ + SamplingPercentage: 50, + }, + ts: "", + sf: func(SamplerMode) (bool, float64, string) { return true, 2, "ot=th:8" }, + }, + { + name: "25 percent sampled", + tid: mustParseTID("ddddddddddddddddddc0000000000000"), + cfg: &Config{ + SamplingPercentage: 25, + }, + ts: "", + sf: func(SamplerMode) (bool, float64, string) { return true, 4, "ot=th:c" }, + }, + { + name: "25 percent unsampled", + tid: mustParseTID("ddddddddddddddddddb0000000000000"), + cfg: &Config{ + SamplingPercentage: 25, + }, + ts: "", + sf: func(SamplerMode) (bool, float64, string) { return false, 0, "" }, + }, + { + name: "1 percent sampled", + cfg: &Config{ + SamplingPercentage: 1, + SamplingPrecision: 0, + }, + // 99/100 = .fd70a3d70a3d70a3d + ts: "ot=rv:FD70A3D70A3D71", // note upper case passes through, is not generated + sf: func(SamplerMode) (bool, float64, string) { + return true, 1 / 0.01, "ot=rv:FD70A3D70A3D71;th:fd70a3d70a3d71" + }, + }, + { + // with precision 4, the 1% probability rounds down and the + // exact R-value here will sample. see below, where the + // opposite is true. 
+ name: "1 percent sampled with rvalue and precision 4", + cfg: &Config{ + SamplingPercentage: 1, + SamplingPrecision: 4, + }, + ts: "ot=rv:FD70A3D70A3D71", + sf: func(SamplerMode) (bool, float64, string) { + return true, 1 / 0.01, "ot=rv:FD70A3D70A3D71;th:fd70a" + }, + }, + { + // at precision 3, the 1% probability rounds + // up to fd71 and so this does not sample. + // see above, where the opposite is true. + name: "1 percent sampled with rvalue and precision 3", + cfg: &Config{ + SamplingPercentage: 1, + SamplingPrecision: 3, + }, + ts: "ot=rv:FD70A3D70A3D71", + sf: func(SamplerMode) (bool, float64, string) { + return false, 0, "" + }, + }, + { + name: "1 percent not sampled with rvalue", + cfg: &Config{ + SamplingPercentage: 1, + }, + // this r-value is slightly below the t-value threshold, + // off-by-one compared with the case above in the least- + // significant digit. + ts: "ot=rv:FD70A3D70A3D70", + }, + { + name: "49 percent not sampled with default tid", + cfg: &Config{ + SamplingPercentage: 49, + }, + }, + { + name: "1 percent sampled with rvalue", + cfg: &Config{ + SamplingPercentage: 1, + }, + // 99/100 = .FD70A3D70A3D70A3D + ts: "ot=rv:fd70B000000000", + sf: func(SamplerMode) (bool, float64, string) { + return true, 1 / 0.01, "ot=rv:fd70B000000000;th:fd70a3d70a3d71" + }, + }, + { + name: "1 percent sampled with tid", + tid: mustParseTID("a0a0a0a0a0a0a0a0a0fe000000000000"), + cfg: &Config{ + SamplingPercentage: 1, + SamplingPrecision: 4, + }, + // 99/100 = .FD70A3D70A3D70A3D + ts: "", + sf: func(SamplerMode) (bool, float64, string) { + return true, 1 / 0.01, "ot=th:fd70a" + }, + }, + { + name: "sampled by priority", + cfg: &Config{ + SamplingPercentage: 1, + }, + ts: "", + key: "sampling.priority", + value: pcommon.NewValueInt(2), + sf: func(SamplerMode) (bool, float64, string) { return true, 1, "ot=th:0" }, + }, + { + name: "not sampled by priority", + cfg: &Config{ + SamplingPercentage: 99, + }, + ts: "", + key: "sampling.priority", + value: pcommon.NewValueInt(0), + }, + { + name: "incoming 50 percent with rvalue", + cfg: &Config{ + SamplingPercentage: 50, + }, + ts: "ot=rv:90000000000000;th:80000000000000", // note extra zeros in th are erased + sf: func(mode SamplerMode) (bool, float64, string) { + if mode == Equalizing { + return true, 2, "ot=rv:90000000000000;th:8" + } + // Proportionally, 50% less is 25% absolute sampling + return false, 0, "" + }, + }, + { + name: "incoming 50 percent at 25 percent not sampled", + cfg: &Config{ + SamplingPercentage: 25, + }, + ts: "ot=th:8", // 50% + sf: func(SamplerMode) (bool, float64, string) { + return false, 0, "" + }, + }, + { + name: "incoming 50 percent at 25 percent sampled", + cfg: &Config{ + SamplingPercentage: 25, + }, + tid: mustParseTID("ffffffffffffffffffffffffffffffff"), // always sampled + ts: "ot=th:8", // 50% + sf: func(mode SamplerMode) (bool, float64, string) { + if mode == Equalizing { + return true, 4, "ot=th:c" + } + return true, 8, "ot=th:e" + }, + }, + { + name: "equalizing vs proportional", + cfg: &Config{ + SamplingPercentage: 50, + }, + ts: "ot=rv:c0000000000000;th:8", + sf: func(mode SamplerMode) (bool, float64, string) { + if mode == Equalizing { + return true, 2, "ot=rv:c0000000000000;th:8" + } + return true, 4, "ot=rv:c0000000000000;th:c" + }, + }, + { + name: "inconsistent arriving threshold", + log: "inconsistent arriving threshold", + cfg: &Config{ + SamplingPercentage: 100, + }, + ts: "ot=rv:40000000000000;th:8", + sf: func(SamplerMode) (bool, float64, string) { + return true, 1, 
"ot=rv:40000000000000;th:0" + }, + }, + { + name: "inconsistent arriving threshold not sampled", + log: "inconsistent arriving threshold", + cfg: &Config{ + SamplingPercentage: 1, + FailClosed: true, + }, + ts: "ot=rv:40000000000000;th:8", + sf: func(SamplerMode) (bool, float64, string) { + return false, 0, "" + }, + }, + { + name: "40 percent precision 3 with rvalue", + cfg: &Config{ + SamplingPercentage: 40, + SamplingPrecision: 3, + }, + ts: "ot=rv:a0000000000000", + sf: func(SamplerMode) (bool, float64, string) { + return true, 1 / 0.4, "ot=rv:a0000000000000;th:99a" + }, + }, + { + name: "arriving 50 percent sampled at 40 percent precision 6 with tid", + cfg: &Config{ + SamplingPercentage: 40, + SamplingPrecision: 6, + }, + tid: mustParseTID("a0a0a0a0a0a0a0a0a0d0000000000000"), + ts: "ot=th:8", // 50% + sf: func(mode SamplerMode) (bool, float64, string) { + if mode == Proportional { + // 5 == 1 / (0.4 * 0.5) + return true, 5, "ot=th:cccccd" + } + // 2.5 == 1 / 0.4 + return true, 2.5, "ot=th:99999a" + }, + }, + { + name: "arriving 50 percent sampled at 40 percent partly sampled", + cfg: &Config{ + SamplingPercentage: 40, + SamplingPrecision: 3, + }, + tid: mustParseTID("a0a0a0a0a0a0a0a0a0b0000000000000"), + ts: "ot=th:8", // 50% + sf: func(mode SamplerMode) (bool, float64, string) { + if mode == Proportional { + return false, 0, "" + } + // 2.5 == 1 / 0.4 + return true, 2.5, "ot=th:99a" + }, + }, + { + name: "arriving 50 percent sampled at 40 percent not sampled", + cfg: &Config{ + SamplingPercentage: 40, + SamplingPrecision: 3, + }, + tid: mustParseTID("a0a0a0a0a0a0a0a0a080000000000000"), + ts: "ot=th:8", // 50% + sf: func(SamplerMode) (bool, float64, string) { + return false, 0, "" + }, + }, + { + name: "200 percent equals 100 percent", + cfg: &Config{ + SamplingPercentage: 200, + }, + ts: "", + sf: func(SamplerMode) (bool, float64, string) { + return true, 1, "ot=th:0" + }, + }, + { + name: "tiny probability rounding", + cfg: &Config{ + SamplingPercentage: 100 * 0x1p-14, + }, + tid: improbableTraceID, + ts: "ot=th:fffc", + sf: func(mode SamplerMode) (bool, float64, string) { + if mode == Equalizing { + return true, 1 << 14, "ot=th:fffc" + } + return true, 1 << 28, "ot=th:fffffff" + }, + }, + { + // Note this test tests a probability value very close + // to the limit near 100.0% expressible in a float32, + // which is how the SamplingPercentage field is declared. + // it's impossible to have 10 significant figures at + // at this extreme. + name: "almost 100pct sampling", + cfg: &Config{ + SamplingPercentage: (1 - 8e-7) * 100, // very close to 100% + SamplingPrecision: 10, // 10 sig figs is impossible + }, + tid: improbableTraceID, + sf: func(SamplerMode) (bool, float64, string) { + // The adjusted count is very close to 1.0. + // The threshold has 8 significant figures. 
+ return true, 1 / (1 - 8e-7), "ot=th:00000cccccccd" + }, + }, + { + name: "probability underflow", + cfg: &Config{ + SamplingPercentage: 0x1p-4, + }, + tid: improbableTraceID, + ts: "ot=th:fffffffffffff8", + sf: func(mode SamplerMode) (bool, float64, string) { + if mode == Equalizing { + return true, 1 << 53, "ot=th:fffffffffffff8" + } + return false, 0, "" + }, + }, + } + for _, tt := range tests { + for _, mode := range []SamplerMode{Equalizing, Proportional} { + t.Run(fmt.Sprint(mode, "_", tt.name), func(t *testing.T) { + + sink := new(consumertest.TracesSink) + cfg := &Config{} + if tt.cfg != nil { + *cfg = *tt.cfg + } + cfg.Mode = mode + cfg.HashSeed = defaultHashSeed + + set := processortest.NewNopSettings() + logger, observed := observer.New(zap.DebugLevel) + set.Logger = zap.New(logger) + + tsp, err := newTracesProcessor(context.Background(), set, cfg, sink) + require.NoError(t, err) + + tid := defaultTID + + if !tt.tid.IsEmpty() { + tid = tt.tid + } + + td := makeSingleSpanWithAttrib(tid, sid, tt.ts, tt.key, tt.value) + + err = tsp.ConsumeTraces(context.Background(), td) + require.NoError(t, err) + + sampledData := sink.AllTraces() + + var expectSampled bool + var expectCount float64 + var expectTS string + if tt.sf != nil { + expectSampled, expectCount, expectTS = tt.sf(mode) + } + if expectSampled { + require.Equal(t, 1, len(sampledData)) + assert.Equal(t, 1, sink.SpanCount()) + got := sink.AllTraces()[0].ResourceSpans().At(0).ScopeSpans().At(0).Spans().At(0) + gotTs, err := sampling.NewW3CTraceState(got.TraceState().AsRaw()) + require.NoError(t, err) + switch { + case expectCount == 0: + assert.Equal(t, 0.0, gotTs.OTelValue().AdjustedCount()) + case cfg.SamplingPrecision == 0: + assert.InEpsilon(t, expectCount, gotTs.OTelValue().AdjustedCount(), 1e-9, + "compare %v %v", expectCount, gotTs.OTelValue().AdjustedCount()) + default: + assert.InEpsilon(t, expectCount, gotTs.OTelValue().AdjustedCount(), 1e-3, + "compare %v %v", expectCount, gotTs.OTelValue().AdjustedCount()) + } + require.Equal(t, expectTS, got.TraceState().AsRaw()) + } else { + require.Equal(t, 0, len(sampledData)) + assert.Equal(t, 0, sink.SpanCount()) + require.Equal(t, "", expectTS) + } + + if len(tt.log) == 0 { + require.Equal(t, 0, len(observed.All()), "should not have logs: %v", observed.All()) + } else { + require.Equal(t, 1, len(observed.All()), "should have one log: %v", observed.All()) + require.Contains(t, observed.All()[0].Message, "traces sampler") + require.Contains(t, observed.All()[0].Context[0].Interface.(error).Error(), tt.log) + } + }) + } + } +} + +// Test_tracesamplerprocessor_TraceStateErrors checks that when +// FailClosed is true, certain spans do not pass, with errors. 
+func Test_tracesamplerprocessor_TraceStateErrors(t *testing.T) { + defaultTID := mustParseTID("fefefefefefefefefe80000000000000") + sid := idutils.UInt64ToSpanID(0xfefefefe) + tests := []struct { + name string + tid pcommon.TraceID + cfg *Config + ts string + sf func(SamplerMode) string + }{ + { + name: "missing randomness", + cfg: &Config{ + SamplingPercentage: 100, + }, + ts: "", + tid: pcommon.TraceID{}, + sf: func(SamplerMode) string { + return "missing randomness" + }, + }, + { + name: "invalid r-value", + cfg: &Config{ + SamplingPercentage: 100, + }, + tid: defaultTID, + ts: "ot=rv:abababababababab", // 16 digits is too many + sf: func(SamplerMode) string { + return "r-value must have 14 hex digits" + }, + }, + { + name: "invalid t-value", + cfg: &Config{ + SamplingPercentage: 100, + }, + tid: defaultTID, + ts: "ot=th:abababababababab", // 16 digits is too many + sf: func(SamplerMode) string { + return "t-value exceeds 14 hex digits" + }, + }, + { + name: "t-value syntax", + cfg: &Config{ + SamplingPercentage: 100, + }, + tid: defaultTID, + ts: "ot=th:-1", + sf: func(SamplerMode) string { + return "invalid syntax" + }, + }, + { + name: "inconsistent t-value trace ID", + cfg: &Config{ + SamplingPercentage: 100, + }, + tid: mustParseTID("ffffffffffffffffff70000000000000"), + ts: "ot=th:8", + sf: func(SamplerMode) string { + return "inconsistent arriving threshold" + }, + }, + { + name: "inconsistent t-value r-value", + cfg: &Config{ + SamplingPercentage: 100, + }, + tid: defaultTID, + ts: "ot=th:8;rv:70000000000000", + sf: func(SamplerMode) string { + return "inconsistent arriving threshold" + }, + }, + } + for _, tt := range tests { + for _, mode := range []SamplerMode{Equalizing, Proportional} { + t.Run(fmt.Sprint(mode, "_", tt.name), func(t *testing.T) { + sink := new(consumertest.TracesSink) + cfg := &Config{} + if tt.cfg != nil { + *cfg = *tt.cfg + } + cfg.Mode = mode + cfg.FailClosed = true + + set := processortest.NewNopSettings() + logger, observed := observer.New(zap.DebugLevel) + set.Logger = zap.New(logger) + + expectMessage := "" + if tt.sf != nil { + expectMessage = tt.sf(mode) + + } + + tsp, err := newTracesProcessor(context.Background(), set, cfg, sink) + require.NoError(t, err) + + td := makeSingleSpanWithAttrib(tt.tid, sid, tt.ts, "", pcommon.Value{}) + + err = tsp.ConsumeTraces(context.Background(), td) + require.NoError(t, err) + + sampledData := sink.AllTraces() + + require.Equal(t, 0, len(sampledData)) + assert.Equal(t, 0, sink.SpanCount()) + + require.Equal(t, 1, len(observed.All()), "should have one log: %v", observed.All()) + if observed.All()[0].Message == "trace sampler" { + require.Contains(t, observed.All()[0].Context[0].Interface.(error).Error(), expectMessage) + } else { + require.Contains(t, observed.All()[0].Message, "traces sampler") + require.Contains(t, observed.All()[0].Context[0].Interface.(error).Error(), expectMessage) + } + }) + } + } +} + +// Test_tracesamplerprocessor_HashSeedTraceState tests that non-strict +// HashSeed modes generate trace state to indicate sampling. +func Test_tracesamplerprocessor_HashSeedTraceState(t *testing.T) { + sid := idutils.UInt64ToSpanID(0xfefefefe) + tests := []struct { + pct float32 + tvout string + }{ + { + pct: 100, + tvout: "0", + }, + { + pct: 75, + tvout: "4", + }, + { + pct: 50, + tvout: "8", + }, + { + pct: 25, + tvout: "c", + }, + { + pct: 10, + tvout: "e668", // 14-bit rounding means e668 vs e666. + }, + { + pct: 100.0 / 3, + tvout: "aaac", // 14-bit rounding means aaac, vs aaab. 
+ }, + } + for _, tt := range tests { + t.Run(fmt.Sprint(tt.pct, "pct"), func(t *testing.T) { + sink := new(consumertest.TracesSink) + cfg := &Config{} + cfg.SamplingPercentage = tt.pct + cfg.Mode = HashSeed + cfg.HashSeed = defaultHashSeed + cfg.SamplingPrecision = 4 + + tsp, err := newTracesProcessor(context.Background(), processortest.NewNopSettings(), cfg, sink) + require.NoError(t, err) + + // Repeat until we find 10 sampled cases; each sample will have + // an independent R-value. + const find = 10 + found := 0 + for { + sink.Reset() + tid := idutils.UInt64ToTraceID(rand.Uint64(), rand.Uint64()) + td := makeSingleSpanWithAttrib(tid, sid, "", "", pcommon.Value{}) + + err = tsp.ConsumeTraces(context.Background(), td) + require.NoError(t, err) + + sampledData := sink.AllTraces() + + if len(sampledData) == 0 { + continue + } + assert.Equal(t, 1, sink.SpanCount()) + + span := sampledData[0].ResourceSpans().At(0).ScopeSpans().At(0).Spans().At(0) + spanTs, err := sampling.NewW3CTraceState(span.TraceState().AsRaw()) + require.NoError(t, err) + + threshold, hasT := spanTs.OTelValue().TValueThreshold() + require.True(t, hasT) + require.Equal(t, tt.tvout, spanTs.OTelValue().TValue()) + rnd, hasR := spanTs.OTelValue().RValueRandomness() + require.True(t, hasR) + require.True(t, threshold.ShouldSample(rnd)) + + if found++; find == found { + break + } + } + }) + } +} + func getSpanWithAttributes(key string, value pcommon.Value) ptrace.Span { span := ptrace.NewSpan() initSpanWithAttribute(key, value, span) @@ -747,3 +1382,17 @@ func TestHashingFunction(t *testing.T) { require.Equal(t, tc.sampled, wasSampled) } } + +// makeSingleSpanWithAttrib is used to construct test data with +// a specific TraceID and a single attribute. +func makeSingleSpanWithAttrib(tid pcommon.TraceID, sid pcommon.SpanID, ts string, key string, attribValue pcommon.Value) ptrace.Traces { + traces := ptrace.NewTraces() + span := traces.ResourceSpans().AppendEmpty().ScopeSpans().AppendEmpty().Spans().AppendEmpty() + span.TraceState().FromRaw(ts) + span.SetTraceID(tid) + span.SetSpanID(sid) + if key != "" { + attribValue.CopyTo(span.Attributes().PutEmpty(key)) + } + return traces +} From 1b172e70b984e6cdaf1d9574a73a9f18fddce10b Mon Sep 17 00:00:00 2001 From: Adriel Perkins Date: Thu, 13 Jun 2024 12:27:28 -0400 Subject: [PATCH 003/138] [chore] update gqlparser indirect dependency to v2.5.16 to fix graphql generation (#33552) This is a byproduct of [this issue](https://github.com/Khan/genqlient/issues/340) posted in the Genqlient repository of the gqlparser dependency being updated. This was patched in an updated version v2.5.16 **Description:** Upgraded indirect dependency to take in patch fixing workflow errors. 
**Link to tracking Issue:** #33551 --- internal/tools/go.mod | 6 +++--- internal/tools/go.sum | 12 ++++++------ receiver/gitproviderreceiver/go.mod | 6 +++--- receiver/gitproviderreceiver/go.sum | 12 ++++++------ 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/internal/tools/go.mod b/internal/tools/go.mod index 2c6d9793d2de..d038a8729776 100644 --- a/internal/tools/go.mod +++ b/internal/tools/go.mod @@ -42,8 +42,8 @@ require ( github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371 // indirect github.com/agnivade/levenshtein v1.1.1 // indirect github.com/alecthomas/go-check-sumtype v0.1.4 // indirect - github.com/alexflint/go-arg v1.4.2 // indirect - github.com/alexflint/go-scalar v1.0.0 // indirect + github.com/alexflint/go-arg v1.5.0 // indirect + github.com/alexflint/go-scalar v1.2.0 // indirect github.com/alexkohler/nakedret/v2 v2.0.4 // indirect github.com/alexkohler/prealloc v1.0.0 // indirect github.com/alingse/asasalint v0.0.11 // indirect @@ -215,7 +215,7 @@ require ( github.com/ultraware/funlen v0.1.0 // indirect github.com/ultraware/whitespace v0.1.1 // indirect github.com/uudashr/gocognit v1.1.2 // indirect - github.com/vektah/gqlparser/v2 v2.5.14 // indirect + github.com/vektah/gqlparser/v2 v2.5.16 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect github.com/xen0n/gosmopolitan v1.2.2 // indirect github.com/yagipy/maintidx v1.0.0 // indirect diff --git a/internal/tools/go.sum b/internal/tools/go.sum index 4eb560589175..5b7b1d701011 100644 --- a/internal/tools/go.sum +++ b/internal/tools/go.sum @@ -41,10 +41,10 @@ github.com/alecthomas/go-check-sumtype v0.1.4 h1:WCvlB3l5Vq5dZQTFmodqL2g68uHiSww github.com/alecthomas/go-check-sumtype v0.1.4/go.mod h1:WyYPfhfkdhyrdaligV6svFopZV8Lqdzn5pyVBaV6jhQ= github.com/alecthomas/repr v0.2.0 h1:HAzS41CIzNW5syS8Mf9UwXhNH1J9aix/BvDRf1Ml2Yk= github.com/alecthomas/repr v0.2.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= -github.com/alexflint/go-arg v1.4.2 h1:lDWZAXxpAnZUq4qwb86p/3rIJJ2Li81EoMbTMujhVa0= -github.com/alexflint/go-arg v1.4.2/go.mod h1:9iRbDxne7LcR/GSvEr7ma++GLpdIU1zrghf2y2768kM= -github.com/alexflint/go-scalar v1.0.0 h1:NGupf1XV/Xb04wXskDFzS0KWOLH632W/EO4fAFi+A70= -github.com/alexflint/go-scalar v1.0.0/go.mod h1:GpHzbCOZXEKMEcygYQ5n/aa4Aq84zbxjy3MxYW0gjYw= +github.com/alexflint/go-arg v1.5.0 h1:rwMKGiaQuRbXfZNyRUvIfke63QvOBt1/QTshlGQHohM= +github.com/alexflint/go-arg v1.5.0/go.mod h1:A7vTJzvjoaSTypg4biM5uYNTkJ27SkNTArtYXnlqVO8= +github.com/alexflint/go-scalar v1.2.0 h1:WR7JPKkeNpnYIOfHRa7ivM21aWAdHD0gEWHCx+WQBRw= +github.com/alexflint/go-scalar v1.2.0/go.mod h1:LoFvNMqS1CPrMVltza4LvnGKhaSpc3oyLEBUZVhhS2o= github.com/alexkohler/nakedret/v2 v2.0.4 h1:yZuKmjqGi0pSmjGpOC016LtPJysIL0WEUiaXW5SUnNg= github.com/alexkohler/nakedret/v2 v2.0.4/go.mod h1:bF5i0zF2Wo2o4X4USt9ntUWve6JbFv02Ff4vlkmS/VU= github.com/alexkohler/prealloc v1.0.0 h1:Hbq0/3fJPQhNkN0dR95AVrr6R7tou91y0uHG5pOcUuw= @@ -509,8 +509,8 @@ github.com/ultraware/whitespace v0.1.1 h1:bTPOGejYFulW3PkcrqkeQwOd6NKOOXvmGD9bo/ github.com/ultraware/whitespace v0.1.1/go.mod h1:XcP1RLD81eV4BW8UhQlpaR+SDc2givTvyI8a586WjW8= github.com/uudashr/gocognit v1.1.2 h1:l6BAEKJqQH2UpKAPKdMfZf5kE4W/2xk8pfU1OVLvniI= github.com/uudashr/gocognit v1.1.2/go.mod h1:aAVdLURqcanke8h3vg35BC++eseDm66Z7KmchI5et4k= -github.com/vektah/gqlparser/v2 v2.5.14 h1:dzLq75BJe03jjQm6n56PdH1oweB8ana42wj7E4jRy70= -github.com/vektah/gqlparser/v2 v2.5.14/go.mod h1:WQQjFc+I1YIzoPvZBhUQX7waZgg3pMLi0r8KymvAE2w= +github.com/vektah/gqlparser/v2 v2.5.16 
h1:1gcmLTvs3JLKXckwCwlUagVn/IlV2bwqle0vJ0vy5p8= +github.com/vektah/gqlparser/v2 v2.5.16/go.mod h1:1lz1OeCqgQbQepsGxPVywrjdBHW2T08PUS3pJqepRww= github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= github.com/xen0n/gosmopolitan v1.2.2 h1:/p2KTnMzwRexIW8GlKawsTWOxn7UHA+jCMF/V8HHtvU= diff --git a/receiver/gitproviderreceiver/go.mod b/receiver/gitproviderreceiver/go.mod index e4f1b349e3e2..82f51ae57357 100644 --- a/receiver/gitproviderreceiver/go.mod +++ b/receiver/gitproviderreceiver/go.mod @@ -44,7 +44,7 @@ require ( github.com/hashicorp/go-version v1.7.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.17.8 // indirect + github.com/klauspost/compress v1.17.9 // indirect github.com/knadh/koanf/maps v0.1.1 // indirect github.com/knadh/koanf/providers/confmap v0.1.0 // indirect github.com/knadh/koanf/v2 v2.1.1 // indirect @@ -67,7 +67,7 @@ require ( github.com/spf13/pflag v1.0.5 // indirect github.com/tklauser/go-sysconf v0.3.13 // indirect github.com/tklauser/numcpus v0.7.0 // indirect - github.com/vektah/gqlparser/v2 v2.5.12 // indirect + github.com/vektah/gqlparser/v2 v2.5.16 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/collector v0.102.2-0.20240611143128-7dfb57b9ad1c // indirect @@ -114,7 +114,7 @@ require ( golang.org/x/text v0.16.0 // indirect gonum.org/v1/gonum v0.15.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240610135401-a8a62080eff3 // indirect google.golang.org/grpc v1.64.0 // indirect google.golang.org/protobuf v1.34.2 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/receiver/gitproviderreceiver/go.sum b/receiver/gitproviderreceiver/go.sum index 39887f7a8488..5123afe8525f 100644 --- a/receiver/gitproviderreceiver/go.sum +++ b/receiver/gitproviderreceiver/go.sum @@ -80,8 +80,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.17.8 h1:YcnTYrq7MikUT7k0Yb5eceMmALQPYBW/Xltxn0NAMnU= -github.com/klauspost/compress v1.17.8/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/knadh/koanf/maps v0.1.1 h1:G5TjmUh2D7G2YWf5SQQqSiHRJEjaicvU0KpypqB3NIs= github.com/knadh/koanf/maps v0.1.1/go.mod h1:npD/QZY3V6ghQDdcQzl1W4ICNVTkohC8E73eI2xW4yI= github.com/knadh/koanf/providers/confmap v0.1.0 h1:gOkxhHkemwG4LezxxN8DMOFopOPghxRVp7JbIvdvqzU= @@ -146,8 +146,8 @@ github.com/tklauser/go-sysconf v0.3.13 h1:GBUpcahXSpR2xN01jhkNAbTLRk2Yzgggk8IM08 github.com/tklauser/go-sysconf v0.3.13/go.mod h1:zwleP4Q4OehZHGn4CYZDipCgg9usW5IJePewFCGVEa0= github.com/tklauser/numcpus v0.7.0 h1:yjuerZP127QG9m5Zh/mSO4wqurYil27tHrqwRoRjpr4= github.com/tklauser/numcpus v0.7.0/go.mod 
h1:bb6dMVcj8A42tSE7i32fsIUCbQNllK5iDguyOZRUzAY= -github.com/vektah/gqlparser/v2 v2.5.12 h1:COMhVVnql6RoaF7+aTBWiTADdpLGyZWU3K/NwW0ph98= -github.com/vektah/gqlparser/v2 v2.5.12/go.mod h1:WQQjFc+I1YIzoPvZBhUQX7waZgg3pMLi0r8KymvAE2w= +github.com/vektah/gqlparser/v2 v2.5.16 h1:1gcmLTvs3JLKXckwCwlUagVn/IlV2bwqle0vJ0vy5p8= +github.com/vektah/gqlparser/v2 v2.5.16/go.mod h1:1lz1OeCqgQbQepsGxPVywrjdBHW2T08PUS3pJqepRww= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= @@ -326,8 +326,8 @@ google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98 google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5 h1:P8OJ/WCl/Xo4E4zoe4/bifHpSmmKwARqyqE4nW6J2GQ= google.golang.org/genproto/googleapis/api v0.0.0-20240520151616-dc85e6b867a5/go.mod h1:RGnPtTG7r4i8sPlNyDeikXF99hMM+hN6QMm4ooG9g2g= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 h1:Zy9XzmMEflZ/MAaA7vNcoebnRAld7FsPW1EeBB7V0m8= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240610135401-a8a62080eff3 h1:9Xyg6I9IWQZhRVfCWjKK+l6kI0jHcPesVlMnT//aHNo= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240610135401-a8a62080eff3/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= From b03a61aa1f409096d71f133ba5736658a34b888e Mon Sep 17 00:00:00 2001 From: Daniel Jaglowski Date: Thu, 13 Jun 2024 12:00:43 -0500 Subject: [PATCH 004/138] [processor/transform] Add 'transform.flatten.logs' feature gate (#33338) This PR proposes a feature gate which would enable the Flatten/Unflatten behavior described [here](https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/32080#issuecomment-2120764953). This was discussed in the Collector SIG recently and it was mentioned that a feature gate would be a reasonable way to implement this. One immediate question: Should this be purely a feature gate, or should there be a config option on the processor which fails `Validate` if the feature gate is not set? 
--------- Co-authored-by: Curtis Robert Co-authored-by: Evan Bradley <11745660+evan-bradley@users.noreply.github.com> Co-authored-by: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com> --- .chloggen/transform-flatten-logs.yaml | 28 ++++ cmd/otelcontribcol/builder-config.yaml | 1 + cmd/otelcontribcol/go.mod | 3 + connector/datadogconnector/go.mod | 2 + exporter/datadogexporter/go.mod | 3 + .../datadogexporter/integrationtest/go.mod | 2 + go.mod | 3 + processor/transformprocessor/README.md | 25 +++ processor/transformprocessor/config.go | 18 +++ processor/transformprocessor/factory.go | 2 +- processor/transformprocessor/go.mod | 4 + .../internal/logs/processor.go | 9 +- .../internal/logs/processor_test.go | 10 +- .../transformprocessor/processor_test.go | 147 ++++++++++++++++++ .../testdata/logs/expected-with-flatten.yaml | 27 ++++ .../logs/expected-without-flatten.yaml | 20 +++ .../testdata/logs/input.yaml | 23 +++ 17 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 .chloggen/transform-flatten-logs.yaml create mode 100644 processor/transformprocessor/processor_test.go create mode 100644 processor/transformprocessor/testdata/logs/expected-with-flatten.yaml create mode 100644 processor/transformprocessor/testdata/logs/expected-without-flatten.yaml create mode 100644 processor/transformprocessor/testdata/logs/input.yaml diff --git a/.chloggen/transform-flatten-logs.yaml b/.chloggen/transform-flatten-logs.yaml new file mode 100644 index 000000000000..0f3d76368188 --- /dev/null +++ b/.chloggen/transform-flatten-logs.yaml @@ -0,0 +1,28 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: processor/transform + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add `transform.flatten.logs` featuregate to give each log record a distinct resource and scope. + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [32080] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: | + This option is useful when applying transformations which alter the resource or scope. e.g. `set(resource.attributes["to"], attributes["from"])`, which may otherwise result in unexpected behavior. Using this option typically incurs a performance penalty as the processor must compute many hashes and create copies of resource and scope information for every log record. + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. 
+# Default: '[user]' +change_logs: [] diff --git a/cmd/otelcontribcol/builder-config.yaml b/cmd/otelcontribcol/builder-config.yaml index fbad43a10bec..82476f3e1215 100644 --- a/cmd/otelcontribcol/builder-config.yaml +++ b/cmd/otelcontribcol/builder-config.yaml @@ -479,3 +479,4 @@ replaces: - github.com/open-telemetry/opentelemetry-collector-contrib/confmap/provider/s3provider => ../../confmap/provider/s3provider - github.com/open-telemetry/opentelemetry-collector-contrib/confmap/provider/secretsmanagerprovider => ../../confmap/provider/secretsmanagerprovider - github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling => ../../pkg/sampling + - github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil => ../../internal/pdatautil diff --git a/cmd/otelcontribcol/go.mod b/cmd/otelcontribcol/go.mod index 89c64f749517..7828090b6cb5 100644 --- a/cmd/otelcontribcol/go.mod +++ b/cmd/otelcontribcol/go.mod @@ -623,6 +623,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/internal/kafka v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/kubelet v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/metadataproviders v0.102.0 // indirect + github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/sharedcomponent v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/splunk v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/sqlquery v0.102.0 // indirect @@ -1297,3 +1298,5 @@ replace github.com/open-telemetry/opentelemetry-collector-contrib/confmap/provid replace github.com/open-telemetry/opentelemetry-collector-contrib/confmap/provider/secretsmanagerprovider => ../../confmap/provider/secretsmanagerprovider replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling => ../../pkg/sampling + +replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil => ../../internal/pdatautil diff --git a/connector/datadogconnector/go.mod b/connector/datadogconnector/go.mod index 0e40df474a0a..c22be6ebed40 100644 --- a/connector/datadogconnector/go.mod +++ b/connector/datadogconnector/go.mod @@ -336,3 +336,5 @@ replace github.com/openshift/api v3.9.0+incompatible => github.com/openshift/api replace github.com/open-telemetry/opentelemetry-collector-contrib/processor/transformprocessor => ../../processor/transformprocessor replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling => ../../pkg/sampling + +replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil => ../../internal/pdatautil diff --git a/exporter/datadogexporter/go.mod b/exporter/datadogexporter/go.mod index 90d46e041d6c..f3c77b77ed94 100644 --- a/exporter/datadogexporter/go.mod +++ b/exporter/datadogexporter/go.mod @@ -249,6 +249,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/internal/common v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/docker v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/filter v0.102.0 // indirect + github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl v0.102.0 // indirect 
github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling v0.102.0 // indirect @@ -430,3 +431,5 @@ replace github.com/open-telemetry/opentelemetry-collector-contrib/extension/stor replace github.com/open-telemetry/opentelemetry-collector-contrib/processor/transformprocessor => ../../processor/transformprocessor replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling => ../../pkg/sampling + +replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil => ../../internal/pdatautil diff --git a/exporter/datadogexporter/integrationtest/go.mod b/exporter/datadogexporter/integrationtest/go.mod index 7fdabda6a36a..96ab696567f1 100644 --- a/exporter/datadogexporter/integrationtest/go.mod +++ b/exporter/datadogexporter/integrationtest/go.mod @@ -345,3 +345,5 @@ replace github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prome replace github.com/open-telemetry/opentelemetry-collector-contrib/processor/transformprocessor => ../../../processor/transformprocessor replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling => ../../../pkg/sampling + +replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil => ../../../internal/pdatautil diff --git a/go.mod b/go.mod index d731d5616cbc..d77127245605 100644 --- a/go.mod +++ b/go.mod @@ -578,6 +578,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/internal/kafka v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/kubelet v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/metadataproviders v0.102.0 // indirect + github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/sharedcomponent v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/splunk v0.102.0 // indirect github.com/open-telemetry/opentelemetry-collector-contrib/internal/sqlquery v0.102.0 // indirect @@ -1237,3 +1238,5 @@ replace github.com/open-telemetry/opentelemetry-collector-contrib/extension/acke replace github.com/open-telemetry/opentelemetry-collector-contrib/receiver/splunkenterprisereceiver => ./receiver/splunkenterprisereceiver replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling => ./pkg/sampling + +replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil => ./internal/pdatautil diff --git a/processor/transformprocessor/README.md b/processor/transformprocessor/README.md index 02ccd3ddfc42..daab05325aec 100644 --- a/processor/transformprocessor/README.md +++ b/processor/transformprocessor/README.md @@ -529,3 +529,28 @@ The transform processor uses the [OpenTelemetry Transformation Language](https:/ - Although the OTTL allows the `set` function to be used with `metric.data_type`, its implementation in the transform processor is NOOP. To modify a data type you must use a function specific to that purpose. - [Identity Conflict](https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/standard-warnings.md#identity-conflict): Transformation of metrics have the potential to affect the identity of a metric leading to an Identity Crisis. Be especially cautious when transforming metric name and when reducing/changing existing attributes. Adding new attributes is safe. 
- [Orphaned Telemetry](https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/standard-warnings.md#orphaned-telemetry): The processor allows you to modify `span_id`, `trace_id`, and `parent_span_id` for traces and `span_id`, and `trace_id` logs. Modifying these fields could lead to orphaned spans or logs. + +## Feature Gate + +### `transform.flatten.logs` + +The `transform.flatten.logs` [feature gate](https://github.com/open-telemetry/opentelemetry-collector/blob/main/featuregate/README.md#collector-feature-gates) enables the `flatten_data` configuration option (default `false`). With `flatten_data: true`, the processor provides each log record with a distinct copy of its resource and scope. Then, after applying all transformations, the log records are regrouped by resource and scope. + +This option is useful when applying transformations which alter the resource or scope. e.g. `set(resource.attributes["to"], attributes["from"])`, which may otherwise result in unexpected behavior. Using this option typically incurs a performance penalty as the processor must compute many hashes and create copies of resource and scope information for every log record. + +The feature is currently only available for log processing. + +#### Example Usage + +`config.yaml`: + + ```yaml + transform: + flatten_data: true + log_statements: + - context: log + statements: + - set(resource.attributes["to"], attributes["from"]) + ``` + + Run collector: `./otelcol --config config.yaml --feature-gates=transform.flatten.logs` diff --git a/processor/transformprocessor/config.go b/processor/transformprocessor/config.go index c81700201aad..8c184510ed5a 100644 --- a/processor/transformprocessor/config.go +++ b/processor/transformprocessor/config.go @@ -4,7 +4,10 @@ package transformprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/transformprocessor" import ( + "errors" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/featuregate" "go.uber.org/multierr" "go.uber.org/zap" @@ -15,6 +18,15 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/processor/transformprocessor/internal/traces" ) +var ( + flatLogsFeatureGate = featuregate.GlobalRegistry().MustRegister("transform.flatten.logs", featuregate.StageAlpha, + featuregate.WithRegisterDescription("Flatten log data prior to transformation so every record has a unique copy of the resource and scope. Regroups logs based on resource and scope after transformations."), + featuregate.WithRegisterFromVersion("v0.103.0"), + featuregate.WithRegisterReferenceURL("https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/32080#issuecomment-2120764953"), + ) + errFlatLogsGateDisabled = errors.New("'flatten_data' requires the 'transform.flatten.logs' feature gate to be enabled") +) + // Config defines the configuration for the processor. type Config struct { // ErrorMode determines how the processor reacts to errors that occur while processing a statement. 
@@ -27,6 +39,8 @@ type Config struct { TraceStatements []common.ContextStatements `mapstructure:"trace_statements"` MetricStatements []common.ContextStatements `mapstructure:"metric_statements"` LogStatements []common.ContextStatements `mapstructure:"log_statements"` + + FlattenData bool `mapstructure:"flatten_data"` } var _ component.Config = (*Config)(nil) @@ -73,5 +87,9 @@ func (c *Config) Validate() error { } } + if c.FlattenData && !flatLogsFeatureGate.IsEnabled() { + errors = multierr.Append(errors, errFlatLogsGateDisabled) + } + return errors } diff --git a/processor/transformprocessor/factory.go b/processor/transformprocessor/factory.go index a22856fbe9c1..7138907dc303 100644 --- a/processor/transformprocessor/factory.go +++ b/processor/transformprocessor/factory.go @@ -49,7 +49,7 @@ func createLogsProcessor( ) (processor.Logs, error) { oCfg := cfg.(*Config) - proc, err := logs.NewProcessor(oCfg.LogStatements, oCfg.ErrorMode, set.TelemetrySettings) + proc, err := logs.NewProcessor(oCfg.LogStatements, oCfg.ErrorMode, oCfg.FlattenData, set.TelemetrySettings) if err != nil { return nil, fmt.Errorf("invalid config for \"transform\" processor %w", err) } diff --git a/processor/transformprocessor/go.mod b/processor/transformprocessor/go.mod index 874a4ebdf9e5..6fd8e9029d38 100644 --- a/processor/transformprocessor/go.mod +++ b/processor/transformprocessor/go.mod @@ -5,6 +5,8 @@ go 1.21.0 require ( github.com/open-telemetry/opentelemetry-collector-contrib/internal/common v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/internal/filter v0.102.0 + github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil v0.102.0 + github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest v0.102.0 github.com/stretchr/testify v1.9.0 @@ -86,3 +88,5 @@ replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden => replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/common => ../../internal/common replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/filter => ../../internal/filter + +replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil => ../../internal/pdatautil diff --git a/processor/transformprocessor/internal/logs/processor.go b/processor/transformprocessor/internal/logs/processor.go index d9502e97762f..e2b184f3c8d7 100644 --- a/processor/transformprocessor/internal/logs/processor.go +++ b/processor/transformprocessor/internal/logs/processor.go @@ -12,6 +12,7 @@ import ( "go.uber.org/multierr" "go.uber.org/zap" + "github.com/open-telemetry/opentelemetry-collector-contrib/internal/pdatautil" "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/transformprocessor/internal/common" ) @@ -19,9 +20,10 @@ import ( type Processor struct { contexts []consumer.Logs logger *zap.Logger + flatMode bool } -func NewProcessor(contextStatements []common.ContextStatements, errorMode ottl.ErrorMode, settings component.TelemetrySettings) (*Processor, error) { +func NewProcessor(contextStatements []common.ContextStatements, errorMode ottl.ErrorMode, flatMode bool, settings component.TelemetrySettings) (*Processor, error) { pc, err := common.NewLogParserCollection(settings, common.WithLogParser(LogFunctions()), common.WithLogErrorMode(errorMode)) if 
err != nil { return nil, err @@ -44,10 +46,15 @@ func NewProcessor(contextStatements []common.ContextStatements, errorMode ottl.E return &Processor{ contexts: contexts, logger: settings.Logger, + flatMode: flatMode, }, nil } func (p *Processor) ProcessLogs(ctx context.Context, ld plog.Logs) (plog.Logs, error) { + if p.flatMode { + pdatautil.FlattenLogs(ld.ResourceLogs()) + defer pdatautil.GroupByResourceLogs(ld.ResourceLogs()) + } for _, c := range p.contexts { err := c.ConsumeLogs(ctx, ld) if err != nil { diff --git a/processor/transformprocessor/internal/logs/processor_test.go b/processor/transformprocessor/internal/logs/processor_test.go index f401093b7d30..60a7c79105ea 100644 --- a/processor/transformprocessor/internal/logs/processor_test.go +++ b/processor/transformprocessor/internal/logs/processor_test.go @@ -49,7 +49,7 @@ func Test_ProcessLogs_ResourceContext(t *testing.T) { for _, tt := range tests { t.Run(tt.statement, func(t *testing.T) { td := constructLogs() - processor, err := NewProcessor([]common.ContextStatements{{Context: "resource", Statements: []string{tt.statement}}}, ottl.IgnoreError, componenttest.NewNopTelemetrySettings()) + processor, err := NewProcessor([]common.ContextStatements{{Context: "resource", Statements: []string{tt.statement}}}, ottl.IgnoreError, false, componenttest.NewNopTelemetrySettings()) assert.NoError(t, err) _, err = processor.ProcessLogs(context.Background(), td) @@ -84,7 +84,7 @@ func Test_ProcessLogs_ScopeContext(t *testing.T) { for _, tt := range tests { t.Run(tt.statement, func(t *testing.T) { td := constructLogs() - processor, err := NewProcessor([]common.ContextStatements{{Context: "scope", Statements: []string{tt.statement}}}, ottl.IgnoreError, componenttest.NewNopTelemetrySettings()) + processor, err := NewProcessor([]common.ContextStatements{{Context: "scope", Statements: []string{tt.statement}}}, ottl.IgnoreError, false, componenttest.NewNopTelemetrySettings()) assert.NoError(t, err) _, err = processor.ProcessLogs(context.Background(), td) @@ -338,7 +338,7 @@ func Test_ProcessLogs_LogContext(t *testing.T) { for _, tt := range tests { t.Run(tt.statement, func(t *testing.T) { td := constructLogs() - processor, err := NewProcessor([]common.ContextStatements{{Context: "log", Statements: []string{tt.statement}}}, ottl.IgnoreError, componenttest.NewNopTelemetrySettings()) + processor, err := NewProcessor([]common.ContextStatements{{Context: "log", Statements: []string{tt.statement}}}, ottl.IgnoreError, false, componenttest.NewNopTelemetrySettings()) assert.NoError(t, err) _, err = processor.ProcessLogs(context.Background(), td) @@ -455,7 +455,7 @@ func Test_ProcessLogs_MixContext(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { td := constructLogs() - processor, err := NewProcessor(tt.contextStatments, ottl.IgnoreError, componenttest.NewNopTelemetrySettings()) + processor, err := NewProcessor(tt.contextStatments, ottl.IgnoreError, false, componenttest.NewNopTelemetrySettings()) assert.NoError(t, err) _, err = processor.ProcessLogs(context.Background(), td) @@ -488,7 +488,7 @@ func Test_ProcessTraces_Error(t *testing.T) { for _, tt := range tests { t.Run(string(tt.context), func(t *testing.T) { td := constructLogs() - processor, err := NewProcessor([]common.ContextStatements{{Context: tt.context, Statements: []string{`set(attributes["test"], ParseJSON(1))`}}}, ottl.PropagateError, componenttest.NewNopTelemetrySettings()) + processor, err := NewProcessor([]common.ContextStatements{{Context: tt.context, Statements: 
[]string{`set(attributes["test"], ParseJSON(1))`}}}, ottl.PropagateError, false, componenttest.NewNopTelemetrySettings()) assert.NoError(t, err) _, err = processor.ProcessLogs(context.Background(), td) diff --git a/processor/transformprocessor/processor_test.go b/processor/transformprocessor/processor_test.go new file mode 100644 index 000000000000..a617c2728729 --- /dev/null +++ b/processor/transformprocessor/processor_test.go @@ -0,0 +1,147 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package transformprocessor + +import ( + "context" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/consumer/consumertest" + "go.opentelemetry.io/collector/processor/processortest" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest/plogtest" + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/transformprocessor/internal/common" +) + +func TestFlattenDataDisabledByDefault(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + oCfg := cfg.(*Config) + assert.False(t, oCfg.FlattenData) + assert.NoError(t, oCfg.Validate()) +} + +func TestFlattenDataRequiresGate(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + oCfg := cfg.(*Config) + oCfg.FlattenData = true + assert.Equal(t, errFlatLogsGateDisabled, oCfg.Validate()) +} + +func TestProcessLogsWithoutFlatten(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + oCfg := cfg.(*Config) + oCfg.LogStatements = []common.ContextStatements{ + { + Context: "log", + Statements: []string{ + `set(resource.attributes["host.name"], attributes["host.name"])`, + `delete_key(attributes, "host.name")`, + }, + }, + } + sink := new(consumertest.LogsSink) + p, err := factory.CreateLogsProcessor(context.Background(), processortest.NewNopSettings(), oCfg, sink) + require.NoError(t, err) + + input, err := golden.ReadLogs(filepath.Join("testdata", "logs", "input.yaml")) + require.NoError(t, err) + expected, err := golden.ReadLogs(filepath.Join("testdata", "logs", "expected-without-flatten.yaml")) + require.NoError(t, err) + + assert.NoError(t, p.ConsumeLogs(context.Background(), input)) + + actual := sink.AllLogs() + require.Len(t, actual, 1) + + assert.NoError(t, plogtest.CompareLogs(expected, actual[0])) +} + +func TestProcessLogsWithFlatten(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + oCfg := cfg.(*Config) + oCfg.FlattenData = true + oCfg.LogStatements = []common.ContextStatements{ + { + Context: "log", + Statements: []string{ + `set(resource.attributes["host.name"], attributes["host.name"])`, + `delete_key(attributes, "host.name")`, + }, + }, + } + sink := new(consumertest.LogsSink) + p, err := factory.CreateLogsProcessor(context.Background(), processortest.NewNopSettings(), oCfg, sink) + require.NoError(t, err) + + input, err := golden.ReadLogs(filepath.Join("testdata", "logs", "input.yaml")) + require.NoError(t, err) + expected, err := golden.ReadLogs(filepath.Join("testdata", "logs", "expected-with-flatten.yaml")) + require.NoError(t, err) + + assert.NoError(t, p.ConsumeLogs(context.Background(), input)) + + actual := sink.AllLogs() + require.Len(t, actual, 1) + + assert.NoError(t, plogtest.CompareLogs(expected, actual[0])) +} + +func BenchmarkLogsWithoutFlatten(b *testing.B) { + factory := NewFactory() + 
cfg := factory.CreateDefaultConfig() + oCfg := cfg.(*Config) + oCfg.LogStatements = []common.ContextStatements{ + { + Context: "log", + Statements: []string{ + `set(resource.attributes["host.name"], attributes["host.name"])`, + `delete_key(attributes, "host.name")`, + }, + }, + } + sink := new(consumertest.LogsSink) + p, err := factory.CreateLogsProcessor(context.Background(), processortest.NewNopSettings(), oCfg, sink) + require.NoError(b, err) + + input, err := golden.ReadLogs(filepath.Join("testdata", "logs", "input.yaml")) + require.NoError(b, err) + + for n := 0; n < b.N; n++ { + assert.NoError(b, p.ConsumeLogs(context.Background(), input)) + } +} + +func BenchmarkLogsWithFlatten(b *testing.B) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + oCfg := cfg.(*Config) + oCfg.FlattenData = true + oCfg.LogStatements = []common.ContextStatements{ + { + Context: "log", + Statements: []string{ + `set(resource.attributes["host.name"], attributes["host.name"])`, + `delete_key(attributes, "host.name")`, + }, + }, + } + sink := new(consumertest.LogsSink) + p, err := factory.CreateLogsProcessor(context.Background(), processortest.NewNopSettings(), oCfg, sink) + require.NoError(b, err) + + input, err := golden.ReadLogs(filepath.Join("testdata", "logs", "input.yaml")) + require.NoError(b, err) + + for n := 0; n < b.N; n++ { + assert.NoError(b, p.ConsumeLogs(context.Background(), input)) + } +} diff --git a/processor/transformprocessor/testdata/logs/expected-with-flatten.yaml b/processor/transformprocessor/testdata/logs/expected-with-flatten.yaml new file mode 100644 index 000000000000..e9d036e217ed --- /dev/null +++ b/processor/transformprocessor/testdata/logs/expected-with-flatten.yaml @@ -0,0 +1,27 @@ +resourceLogs: + - resource: + attributes: + - key: host.name + value: + stringValue: HOST.ONE + scopeLogs: + - logRecords: + - body: + stringValue: "hello one" + attributes: + - key: log.file.name + value: + stringValue: one.log + - resource: + attributes: + - key: host.name + value: + stringValue: HOST.TWO + scopeLogs: + - logRecords: + - body: + stringValue: "hello two" + attributes: + - key: log.file.name + value: + stringValue: two.log diff --git a/processor/transformprocessor/testdata/logs/expected-without-flatten.yaml b/processor/transformprocessor/testdata/logs/expected-without-flatten.yaml new file mode 100644 index 000000000000..37803c4aefd1 --- /dev/null +++ b/processor/transformprocessor/testdata/logs/expected-without-flatten.yaml @@ -0,0 +1,20 @@ +resourceLogs: + - resource: + attributes: + - key: host.name + value: + stringValue: HOST.TWO + scopeLogs: + - logRecords: + - body: + stringValue: "hello one" + attributes: + - key: log.file.name + value: + stringValue: one.log + - body: + stringValue: "hello two" + attributes: + - key: log.file.name + value: + stringValue: two.log diff --git a/processor/transformprocessor/testdata/logs/input.yaml b/processor/transformprocessor/testdata/logs/input.yaml new file mode 100644 index 000000000000..811bbc572bbb --- /dev/null +++ b/processor/transformprocessor/testdata/logs/input.yaml @@ -0,0 +1,23 @@ +resourceLogs: + - resource: + attributes: + scopeLogs: + - logRecords: + - body: + stringValue: "hello one" + attributes: + - key: host.name + value: + stringValue: HOST.ONE + - key: log.file.name + value: + stringValue: one.log + - body: + stringValue: "hello two" + attributes: + - key: host.name + value: + stringValue: HOST.TWO + - key: log.file.name + value: + stringValue: two.log From 75194856443295d14e814eaf21724382f086b503 Mon 
Sep 17 00:00:00 2001 From: Chris Mark Date: Thu, 13 Jun 2024 23:03:37 +0300 Subject: [PATCH 005/138] [chore] [k8s] fix k8s e2e tests (#33548) **Description:** Only return address that is not empty for `kind` network. This started affecting the e2e tests possibly because of the `ubuntu-latest`'s docker version update that is mentioned at https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/33520#issuecomment-2165426609. Relates to https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/33520. /cc @fatsheep9146 Sample `kind` network: ```console curl --unix-socket /run/docker.sock http://docker/networks/kind | jq % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 841 100 841 0 0 821k 0 --:--:-- --:--:-- --:--:-- 821k { "Name": "kind", "Id": "801d2abe204253cbd5d1d135f111a7fb386b830382bde79a699fb4f9aaf674b1", "Created": "2024-06-13T15:31:57.738509232+03:00", "Scope": "local", "Driver": "bridge", "EnableIPv6": true, "IPAM": { "Driver": "default", "Options": {}, "Config": [ { "Subnet": "fc00:f853:ccd:e793::/64" }, { "Subnet": "172.18.0.0/16", "Gateway": "172.18.0.1" } ] }, "Internal": false, "Attachable": false, "Ingress": false, "ConfigFrom": { "Network": "" }, "ConfigOnly": false, "Containers": { "db113750635782bc1bfdf31e5f62af3c63f02a9c8844f7fe9ef045b5d9b76d12": { "Name": "kind-control-plane", "EndpointID": "8b15bb391109ca1ecfbb4bf7a96060b01e3913694d34e23d67eec22684f037bb", "MacAddress": "02:42:ac:12:00:02", "IPv4Address": "172.18.0.2/16", "IPv6Address": "fc00:f853:ccd:e793::2/64" } }, "Options": { "com.docker.network.bridge.enable_ip_masquerade": "true", "com.docker.network.driver.mtu": "1500" }, "Labels": {} } ``` **Link to tracking Issue:** **Testing:** **Documentation:** --------- Signed-off-by: ChrsMark --- internal/k8stest/k8s_collector.go | 3 +++ internal/k8stest/k8s_data_helpers.go | 4 +++- processor/k8sattributesprocessor/e2e_test.go | 3 --- receiver/k8sclusterreceiver/e2e_test.go | 1 - receiver/k8sclusterreceiver/testdata/e2e/expected.yaml | 8 ++++---- receiver/k8sobjectsreceiver/e2e_test.go | 1 - receiver/kubeletstatsreceiver/e2e_test.go | 1 - 7 files changed, 10 insertions(+), 11 deletions(-) diff --git a/internal/k8stest/k8s_collector.go b/internal/k8stest/k8s_collector.go index d607fd194e1a..87252ee687e6 100644 --- a/internal/k8stest/k8s_collector.go +++ b/internal/k8stest/k8s_collector.go @@ -27,6 +27,9 @@ func CreateCollectorObjects(t *testing.T, client *K8sClient, testID string, mani manifestFiles, err := os.ReadDir(manifestsDir) require.NoErrorf(t, err, "failed to read collector manifests directory %s", manifestsDir) host := HostEndpoint(t) + if host == "" { + require.Fail(t, "host endpoint cannot be empty") + } var podNamespace string var podLabels map[string]any createdObjs := make([]*unstructured.Unstructured, 0, len(manifestFiles)) diff --git a/internal/k8stest/k8s_data_helpers.go b/internal/k8stest/k8s_data_helpers.go index 50be3a2336f5..3fbde5005eae 100644 --- a/internal/k8stest/k8s_data_helpers.go +++ b/internal/k8stest/k8s_data_helpers.go @@ -28,7 +28,9 @@ func HostEndpoint(t *testing.T) string { network, err := client.NetworkInspect(ctx, "kind", types.NetworkInspectOptions{}) require.NoError(t, err) for _, ipam := range network.IPAM.Config { - return ipam.Gateway + if ipam.Gateway != "" { + return ipam.Gateway + } } require.Fail(t, "failed to find host endpoint") return "" diff --git a/processor/k8sattributesprocessor/e2e_test.go b/processor/k8sattributesprocessor/e2e_test.go 
index 942e9c98f94e..7520f06169b6 100644 --- a/processor/k8sattributesprocessor/e2e_test.go +++ b/processor/k8sattributesprocessor/e2e_test.go @@ -59,7 +59,6 @@ func newExpectedValue(mode int, value string) *expectedValue { // make docker-otelcontribcol // KUBECONFIG=/tmp/kube-config-otelcol-e2e-testing kind load docker-image otelcontribcol:latest func TestE2E_ClusterRBAC(t *testing.T) { - t.Skip("skipping flaky test, see https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/33520") testDir := filepath.Join("testdata", "e2e", "clusterrbac") @@ -422,7 +421,6 @@ func TestE2E_ClusterRBAC(t *testing.T) { // Test with `filter::namespace` set and only role binding to collector's SA. We can't get node and namespace labels/annotations. func TestE2E_NamespacedRBAC(t *testing.T) { - t.Skip("skipping flaky test, see https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/33520") testDir := filepath.Join("testdata", "e2e", "namespacedrbac") @@ -563,7 +561,6 @@ func TestE2E_NamespacedRBAC(t *testing.T) { // Test with `filter::namespace` set, role binding for namespace-scoped objects (pod, replicaset) and clusterrole // binding for node and namespace objects. func TestE2E_MixRBAC(t *testing.T) { - t.Skip("skipping flaky test, see https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/33520") testDir := filepath.Join("testdata", "e2e", "mixrbac") diff --git a/receiver/k8sclusterreceiver/e2e_test.go b/receiver/k8sclusterreceiver/e2e_test.go index 543afb4ce1ec..8e3418c61b11 100644 --- a/receiver/k8sclusterreceiver/e2e_test.go +++ b/receiver/k8sclusterreceiver/e2e_test.go @@ -36,7 +36,6 @@ const testKubeConfig = "/tmp/kube-config-otelcol-e2e-testing" // make docker-otelcontribcol // KUBECONFIG=/tmp/kube-config-otelcol-e2e-testing kind load docker-image otelcontribcol:latest func TestE2E(t *testing.T) { - t.Skip("skipping flaky test, see https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/33520") var expected pmetric.Metrics expectedFile := filepath.Join("testdata", "e2e", "expected.yaml") diff --git a/receiver/k8sclusterreceiver/testdata/e2e/expected.yaml b/receiver/k8sclusterreceiver/testdata/e2e/expected.yaml index e641e7d50f2a..c52d7a66b703 100644 --- a/receiver/k8sclusterreceiver/testdata/e2e/expected.yaml +++ b/receiver/k8sclusterreceiver/testdata/e2e/expected.yaml @@ -812,7 +812,7 @@ resourceMetrics: stringValue: 1a5b9c371c8a7c5d8b0e56a82395aeee88523b1e2d96f17b4a6ae22bf11936bb - key: container.image.name value: - stringValue: registry.k8s.io/kube-apiserver + stringValue: registry.k8s.io/kube-apiserver-amd64 - key: container.image.tag value: stringValue: v1.25.3 @@ -865,7 +865,7 @@ resourceMetrics: stringValue: 2e506922310bbf1ffb8dbbf56c04e540306f272b794d89ffbe776fe5e2fc148e - key: container.image.name value: - stringValue: registry.k8s.io/kube-scheduler + stringValue: registry.k8s.io/kube-scheduler-amd64 - key: container.image.tag value: stringValue: v1.25.3 @@ -918,7 +918,7 @@ resourceMetrics: stringValue: 3baa03c525095d74e7ee24a5c4c42a4680b131f9b8a68f5e2e853ae569d97e4c - key: container.image.name value: - stringValue: registry.k8s.io/kube-controller-manager + stringValue: registry.k8s.io/kube-controller-manager-amd64 - key: container.image.tag value: stringValue: v1.25.3 @@ -971,7 +971,7 @@ resourceMetrics: stringValue: 5cfead143bc88798f93fae8e05586b1191771477030fe89ed7bca288bb82c0aa - key: container.image.name value: - stringValue: registry.k8s.io/kube-proxy + stringValue: registry.k8s.io/kube-proxy-amd64 - key: 
container.image.tag value: stringValue: v1.25.3 diff --git a/receiver/k8sobjectsreceiver/e2e_test.go b/receiver/k8sobjectsreceiver/e2e_test.go index 78031c421677..4af2cb1e9c21 100644 --- a/receiver/k8sobjectsreceiver/e2e_test.go +++ b/receiver/k8sobjectsreceiver/e2e_test.go @@ -40,7 +40,6 @@ const ( ) func TestE2E(t *testing.T) { - t.Skip("skipping flaky test, see https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/33520") k8sClient, err := k8stest.NewK8sClient(testKubeConfig) require.NoError(t, err) diff --git a/receiver/kubeletstatsreceiver/e2e_test.go b/receiver/kubeletstatsreceiver/e2e_test.go index cb8b832e7114..ba5ace29985b 100644 --- a/receiver/kubeletstatsreceiver/e2e_test.go +++ b/receiver/kubeletstatsreceiver/e2e_test.go @@ -28,7 +28,6 @@ import ( const testKubeConfig = "/tmp/kube-config-otelcol-e2e-testing" func TestE2E(t *testing.T) { - t.Skip("skipping flaky test, see https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/33520") var expected pmetric.Metrics expectedFile := filepath.Join("testdata", "e2e", "expected.yaml") From 71bb4b086ffcee974603d6be5f50548eedb40510 Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Thu, 13 Jun 2024 14:28:55 -0700 Subject: [PATCH 006/138] Updates to OTel-Arrow v0.24.0 deps (#33518) **Description:** This is a copy of the otel-arrow repository exporter and receiver components at https://github.com/open-telemetry/otel-arrow/commit/dd6e2249a180a51433a93751680fa67dc9c6432e Updates both go.mods to OTel-Arrow v0.24.0 dependencies, the current release. Applies repo-specific `make gci` logic. **Link to tracking Issue:** #26491 --- .chloggen/otel-arrow-v024.yaml | 27 + exporter/otelarrowexporter/go.mod | 5 +- exporter/otelarrowexporter/go.sum | 6 +- .../internal/arrow/bestofn.go | 13 +- .../internal/arrow/exporter.go | 19 +- .../internal/arrow/exporter_test.go | 26 +- .../internal/arrow/prioritizer.go | 7 +- .../internal/arrow/stream.go | 62 +- .../internal/arrow/stream_test.go | 15 +- exporter/otelarrowexporter/otelarrow.go | 2 +- exporter/otelarrowexporter/otelarrow_test.go | 26 +- receiver/otelarrowreceiver/README.md | 7 + receiver/otelarrowreceiver/config.go | 10 + receiver/otelarrowreceiver/config_test.go | 8 +- receiver/otelarrowreceiver/factory.go | 10 +- receiver/otelarrowreceiver/go.mod | 8 +- receiver/otelarrowreceiver/go.sum | 13 +- .../otelarrowreceiver/internal/arrow/arrow.go | 736 +++++++++++++----- .../internal/arrow/arrow_test.go | 219 ++++-- receiver/otelarrowreceiver/otelarrow.go | 14 +- receiver/otelarrowreceiver/otelarrow_test.go | 118 ++- .../otelarrowreceiver/testdata/config.yaml | 2 + 22 files changed, 1014 insertions(+), 339 deletions(-) create mode 100644 .chloggen/otel-arrow-v024.yaml diff --git a/.chloggen/otel-arrow-v024.yaml b/.chloggen/otel-arrow-v024.yaml new file mode 100644 index 000000000000..f873aa9691a2 --- /dev/null +++ b/.chloggen/otel-arrow-v024.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: OTel-Arrow + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Update to OTel-Arrow v0.24.0 + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. 
+issues: [26491] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [user] diff --git a/exporter/otelarrowexporter/go.mod b/exporter/otelarrowexporter/go.mod index 529971751603..86fa96bdeb25 100644 --- a/exporter/otelarrowexporter/go.mod +++ b/exporter/otelarrowexporter/go.mod @@ -3,9 +3,9 @@ module github.com/open-telemetry/opentelemetry-collector-contrib/exporter/otelar go 1.21.0 require ( - github.com/apache/arrow/go/v14 v14.0.2 + github.com/apache/arrow/go/v16 v16.1.0 github.com/open-telemetry/otel-arrow v0.24.0 - github.com/open-telemetry/otel-arrow/collector v0.23.0 + github.com/open-telemetry/otel-arrow/collector v0.24.0 github.com/stretchr/testify v1.9.0 go.opentelemetry.io/collector v0.102.2-0.20240611143128-7dfb57b9ad1c go.opentelemetry.io/collector/component v0.102.2-0.20240611143128-7dfb57b9ad1c @@ -36,7 +36,6 @@ require ( require ( github.com/HdrHistogram/hdrhistogram-go v1.1.2 // indirect - github.com/apache/arrow/go/v16 v16.1.0 // indirect github.com/axiomhq/hyperloglog v0.0.0-20230201085229-3ddf4bad03dc // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect diff --git a/exporter/otelarrowexporter/go.sum b/exporter/otelarrowexporter/go.sum index 576b63750d98..62bc951ca84f 100644 --- a/exporter/otelarrowexporter/go.sum +++ b/exporter/otelarrowexporter/go.sum @@ -3,8 +3,6 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/HdrHistogram/hdrhistogram-go v1.1.2 h1:5IcZpTvzydCQeHzK4Ef/D5rrSqwxob0t8PQPMybUNFM= github.com/HdrHistogram/hdrhistogram-go v1.1.2/go.mod h1:yDgFjdqOqDEKOvasDdhWNXYg9BVp4O+o5f6V/ehm6Oo= github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= -github.com/apache/arrow/go/v14 v14.0.2 h1:N8OkaJEOfI3mEZt07BIkvo4sC6XDbL+48MBPWO5IONw= -github.com/apache/arrow/go/v14 v14.0.2/go.mod h1:u3fgh3EdgN/YQ8cVQRguVW3R+seMybFg8QBQ5LU+eBY= github.com/apache/arrow/go/v16 v16.1.0 h1:dwgfOya6s03CzH9JrjCBx6bkVb4yPD4ma3haj9p7FXI= github.com/apache/arrow/go/v16 v16.1.0/go.mod h1:9wnc9mn6vEDTRIm4+27pEjQpRKuTvBaessPoEXQzxWA= github.com/axiomhq/hyperloglog v0.0.0-20230201085229-3ddf4bad03dc h1:Keo7wQ7UODUaHcEi7ltENhbAK2VgZjfat6mLy03tQzo= @@ -88,8 +86,8 @@ github.com/mostynb/go-grpc-compression v1.2.3/go.mod h1:AghIxF3P57umzqM9yz795+y1 github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/open-telemetry/otel-arrow v0.24.0 h1:hNUEbwHW/1gEOUiN+HoI+ITiXe2vSBaPWlE9FRwJwDE= github.com/open-telemetry/otel-arrow v0.24.0/go.mod h1:uzoHixEh6CUBZkP+vkRvyiHYUnYsAOUwCcfByQkSMM0= -github.com/open-telemetry/otel-arrow/collector v0.23.0 h1:ztmq1ipJBhm4xWjHDbmKOtgP3Nl/ZDoLX+3ThhzFs6k= -github.com/open-telemetry/otel-arrow/collector v0.23.0/go.mod h1:SLgLEhhcfR9MjG1taK8RPuwiuIoAPW7IpCjFBobwIUM= +github.com/open-telemetry/otel-arrow/collector v0.24.0 
h1:NYTcgtwG0lQnoGcEomTTtueZxzk03xt+XEXN4L5kqHA= +github.com/open-telemetry/otel-arrow/collector v0.24.0/go.mod h1:+jJ3Vfhh685hXSw2Z1P1wl/rTqEKlSaJ4FocZI+xs+0= github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/exporter/otelarrowexporter/internal/arrow/bestofn.go b/exporter/otelarrowexporter/internal/arrow/bestofn.go index ae4bce633643..443713cda815 100644 --- a/exporter/otelarrowexporter/internal/arrow/bestofn.go +++ b/exporter/otelarrowexporter/internal/arrow/bestofn.go @@ -8,6 +8,10 @@ import ( "math/rand" "runtime" "sort" + "time" + + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) // bestOfNPrioritizer is a prioritizer that selects a less-loaded stream to write. @@ -42,7 +46,7 @@ type streamSorter struct { var _ streamPrioritizer = &bestOfNPrioritizer{} -func newBestOfNPrioritizer(dc doneCancel, numChoices, numStreams int, lf loadFunc) (*bestOfNPrioritizer, []*streamWorkState) { +func newBestOfNPrioritizer(dc doneCancel, numChoices, numStreams int, lf loadFunc, maxLifetime time.Duration) (*bestOfNPrioritizer, []*streamWorkState) { var state []*streamWorkState // Limit numChoices to the number of streams. @@ -50,8 +54,9 @@ func newBestOfNPrioritizer(dc doneCancel, numChoices, numStreams int, lf loadFun for i := 0; i < numStreams; i++ { ws := &streamWorkState{ - waiters: map[int64]chan<- error{}, - toWrite: make(chan writeItem, 1), + maxStreamLifetime: addJitter(maxLifetime), + waiters: map[int64]chan<- error{}, + toWrite: make(chan writeItem, 1), } state = append(state, ws) @@ -112,7 +117,7 @@ func (lp *bestOfNPrioritizer) sendAndWait(ctx context.Context, errCh <-chan erro case <-lp.done: return ErrStreamRestarting case <-ctx.Done(): - return context.Canceled + return status.Errorf(codes.Canceled, "stream wait: %v", ctx.Err()) case lp.input <- wri: return waitForWrite(ctx, errCh, lp.done) } diff --git a/exporter/otelarrowexporter/internal/arrow/exporter.go b/exporter/otelarrowexporter/internal/arrow/exporter.go index 18b3259d3b4a..8c08bf588a51 100644 --- a/exporter/otelarrowexporter/internal/arrow/exporter.go +++ b/exporter/otelarrowexporter/internal/arrow/exporter.go @@ -20,7 +20,9 @@ import ( "go.opentelemetry.io/collector/pdata/ptrace" "go.uber.org/zap" "google.golang.org/grpc" + "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials" + "google.golang.org/grpc/status" ) // Exporter is 1:1 with exporter, isolates arrow-specific @@ -32,9 +34,7 @@ type Exporter struct { // prioritizerName the name of a balancer policy. prioritizerName PrioritizerName - // maxStreamLifetime is a limit on duration for streams. A - // slight "jitter" is applied relative to this value on a - // per-stream basis. + // maxStreamLifetime is a limit on duration for streams. 
maxStreamLifetime time.Duration // disableDowngrade prevents downgrade from occurring, supports @@ -156,7 +156,7 @@ func (e *Exporter) Start(ctx context.Context) error { downCtx, downDc := newDoneCancel(ctx) var sws []*streamWorkState - e.ready, sws = newStreamPrioritizer(downDc, e.prioritizerName, e.numStreams) + e.ready, sws = newStreamPrioritizer(downDc, e.prioritizerName, e.numStreams, e.maxStreamLifetime) for _, ws := range sws { e.startArrowStream(downCtx, ws) @@ -236,7 +236,6 @@ func (e *Exporter) runArrowStream(ctx context.Context, dc doneCancel, state *str producer := e.newProducer() stream := newStream(producer, e.ready, e.telemetry, e.netReporter, state) - stream.maxStreamLifetime = addJitter(e.maxStreamLifetime) defer func() { if err := producer.Close(); err != nil { @@ -258,6 +257,14 @@ func (e *Exporter) runArrowStream(ctx context.Context, dc doneCancel, state *str // // consumer should fall back to standard OTLP, (true, nil) func (e *Exporter) SendAndWait(ctx context.Context, data any) (bool, error) { + // If the incoming context is already canceled, return the + // same error condition a unary gRPC or HTTP exporter would do. + select { + case <-ctx.Done(): + return false, status.Errorf(codes.Canceled, "context done before send: %v", ctx.Err()) + default: + } + errCh := make(chan error, 1) // Note that if the OTLP exporter's gRPC Headers field was @@ -343,7 +350,7 @@ func waitForWrite(ctx context.Context, errCh <-chan error, down <-chan struct{}) select { case <-ctx.Done(): // This caller's context timed out. - return ctx.Err() + return status.Errorf(codes.Canceled, "send wait: %v", ctx.Err()) case <-down: return ErrStreamRestarting case err := <-errCh: diff --git a/exporter/otelarrowexporter/internal/arrow/exporter_test.go b/exporter/otelarrowexporter/internal/arrow/exporter_test.go index 276e5f3fa437..40de3857c5af 100644 --- a/exporter/otelarrowexporter/internal/arrow/exporter_test.go +++ b/exporter/otelarrowexporter/internal/arrow/exporter_test.go @@ -6,7 +6,6 @@ package arrow import ( "context" "encoding/json" - "errors" "fmt" "sync" "sync/atomic" @@ -31,7 +30,9 @@ import ( "go.uber.org/zap/zaptest" "golang.org/x/net/http2/hpack" "google.golang.org/grpc" + "google.golang.org/grpc/codes" "google.golang.org/grpc/metadata" + "google.golang.org/grpc/status" ) var AllPrioritizers = []PrioritizerName{LeastLoadedPrioritizer, LeastLoadedTwoPrioritizer} @@ -278,7 +279,18 @@ func TestArrowExporterTimeout(t *testing.T) { sent, err := tc.exporter.SendAndWait(ctx, twoTraces) require.True(t, sent) require.Error(t, err) - require.True(t, errors.Is(err, context.Canceled)) + + stat, is := status.FromError(err) + require.True(t, is, "is a gRPC status") + require.Equal(t, codes.Canceled, stat.Code()) + + // Repeat the request, will get immediate timeout. 
+ sent, err = tc.exporter.SendAndWait(ctx, twoTraces) + require.False(t, sent) + stat, is = status.FromError(err) + require.True(t, is, "is a gRPC status error: %v", err) + require.Equal(t, "context done before send: context canceled", stat.Message()) + require.Equal(t, codes.Canceled, stat.Code()) require.NoError(t, tc.exporter.Shutdown(ctx)) }) @@ -406,7 +418,10 @@ func TestArrowExporterConnectTimeout(t *testing.T) { }() _, err := tc.exporter.SendAndWait(ctx, twoTraces) require.Error(t, err) - require.True(t, errors.Is(err, context.Canceled)) + + stat, is := status.FromError(err) + require.True(t, is, "is a gRPC status error: %v", err) + require.Equal(t, codes.Canceled, stat.Code()) require.NoError(t, tc.exporter.Shutdown(bg)) }) @@ -489,7 +504,10 @@ func TestArrowExporterStreamRace(t *testing.T) { // This blocks until the cancelation. _, err := tc.exporter.SendAndWait(callctx, twoTraces) require.Error(t, err) - require.True(t, errors.Is(err, context.Canceled)) + + stat, is := status.FromError(err) + require.True(t, is, "is a gRPC status error: %v", err) + require.Equal(t, codes.Canceled, stat.Code()) }() } diff --git a/exporter/otelarrowexporter/internal/arrow/prioritizer.go b/exporter/otelarrowexporter/internal/arrow/prioritizer.go index 84220338348f..551b9f781fc3 100644 --- a/exporter/otelarrowexporter/internal/arrow/prioritizer.go +++ b/exporter/otelarrowexporter/internal/arrow/prioritizer.go @@ -8,6 +8,7 @@ import ( "fmt" "strconv" "strings" + "time" "go.opentelemetry.io/collector/component" "google.golang.org/grpc/codes" @@ -50,7 +51,7 @@ type streamWriter interface { sendAndWait(context.Context, <-chan error, writeItem) error } -func newStreamPrioritizer(dc doneCancel, name PrioritizerName, numStreams int) (streamPrioritizer, []*streamWorkState) { +func newStreamPrioritizer(dc doneCancel, name PrioritizerName, numStreams int, maxLifetime time.Duration) (streamPrioritizer, []*streamWorkState) { if name == unsetPrioritizer { name = DefaultPrioritizer } @@ -58,10 +59,10 @@ func newStreamPrioritizer(dc doneCancel, name PrioritizerName, numStreams int) ( // error was checked and reported in Validate n, err := strconv.Atoi(string(name[len(llPrefix):])) if err == nil { - return newBestOfNPrioritizer(dc, n, numStreams, pendingRequests) + return newBestOfNPrioritizer(dc, n, numStreams, pendingRequests, maxLifetime) } } - return newBestOfNPrioritizer(dc, numStreams, numStreams, pendingRequests) + return newBestOfNPrioritizer(dc, numStreams, numStreams, pendingRequests, maxLifetime) } // pendingRequests is the load function used by leastloadedN. diff --git a/exporter/otelarrowexporter/internal/arrow/stream.go b/exporter/otelarrowexporter/internal/arrow/stream.go index 7070d8c6ea42..ecedd88ae105 100644 --- a/exporter/otelarrowexporter/internal/arrow/stream.go +++ b/exporter/otelarrowexporter/internal/arrow/stream.go @@ -7,7 +7,6 @@ import ( "bytes" "context" "errors" - "fmt" "io" "sync" "time" @@ -16,7 +15,6 @@ import ( "github.com/open-telemetry/otel-arrow/collector/netstats" arrowRecord "github.com/open-telemetry/otel-arrow/pkg/otel/arrow_record" "go.opentelemetry.io/collector/component" - "go.opentelemetry.io/collector/consumer/consumererror" "go.opentelemetry.io/collector/pdata/plog" "go.opentelemetry.io/collector/pdata/pmetric" "go.opentelemetry.io/collector/pdata/ptrace" @@ -34,12 +32,6 @@ import ( // Stream is 1:1 with gRPC stream. type Stream struct { - // maxStreamLifetime is the max timeout before stream - // should be closed on the client side. 
This ensures a - // graceful shutdown before max_connection_age is reached - // on the server side. - maxStreamLifetime time.Duration - // producer is exclusive to the holder of the stream. producer arrowRecord.ProducerAPI @@ -77,6 +69,11 @@ type streamWorkState struct { // prioritizer and a stream. toWrite chan writeItem + // maxStreamLifetime is a limit on duration for streams. A + // slight "jitter" is applied relative to this value on a + // per-stream basis. + maxStreamLifetime time.Duration + // lock protects waiters lock sync.Mutex @@ -130,9 +127,9 @@ func (s *Stream) setBatchChannel(batchID int64, errCh chan<- error) { s.workState.waiters[batchID] = errCh } -// logStreamError decides how to log an error. `which` indicates the -// stream direction, will be "reader" or "writer". -func (s *Stream) logStreamError(which string, err error) { +// logStreamError decides how to log an error. `where` indicates the +// error location, will be "reader" or "writer". +func (s *Stream) logStreamError(where string, err error) { var code codes.Code var msg string // gRPC tends to supply status-wrapped errors, so we always @@ -151,9 +148,9 @@ func (s *Stream) logStreamError(which string, err error) { msg = err.Error() } if code == codes.Canceled { - s.telemetry.Logger.Debug("arrow stream shutdown", zap.String("which", which), zap.String("message", msg)) + s.telemetry.Logger.Debug("arrow stream shutdown", zap.String("message", msg), zap.String("where", where)) } else { - s.telemetry.Logger.Error("arrow stream error", zap.String("which", which), zap.String("message", msg), zap.Int("code", int(code))) + s.telemetry.Logger.Error("arrow stream error", zap.Int("code", int(code)), zap.String("message", msg), zap.String("where", where)) } } @@ -254,8 +251,8 @@ func (s *Stream) write(ctx context.Context) (retErr error) { hdrsEnc := hpack.NewEncoder(&hdrsBuf) var timerCh <-chan time.Time - if s.maxStreamLifetime != 0 { - timer := time.NewTimer(s.maxStreamLifetime) + if s.workState.maxStreamLifetime != 0 { + timer := time.NewTimer(s.workState.maxStreamLifetime) timerCh = timer.C defer timer.Stop() } @@ -269,7 +266,7 @@ func (s *Stream) write(ctx context.Context) (retErr error) { return nil case wri = <-s.workState.toWrite: case <-ctx.Done(): - return ctx.Err() + return status.Errorf(codes.Canceled, "stream input: %v", ctx.Err()) } err := s.encodeAndSend(wri, &hdrsBuf, hdrsEnc) @@ -314,8 +311,8 @@ func (s *Stream) encodeAndSend(wri writeItem, hdrsBuf *bytes.Buffer, hdrsEnc *hp if err != nil { // This is some kind of internal error. We will restart the // stream and mark this record as a permanent one. - err = fmt.Errorf("encode: %w", err) - wri.errCh <- consumererror.NewPermanent(err) + err = status.Errorf(codes.Internal, "encode: %v", err) + wri.errCh <- err return err } @@ -331,8 +328,8 @@ func (s *Stream) encodeAndSend(wri writeItem, hdrsBuf *bytes.Buffer, hdrsEnc *hp // This case is like the encode-failure case // above, we will restart the stream but consider // this a permenent error. - err = fmt.Errorf("hpack: %w", err) - wri.errCh <- consumererror.NewPermanent(err) + err = status.Errorf(codes.Internal, "hpack: %v", err) + wri.errCh <- err return err } } @@ -346,13 +343,10 @@ func (s *Stream) encodeAndSend(wri writeItem, hdrsBuf *bytes.Buffer, hdrsEnc *hp // unreliable for arrow transport, so we instrument it // directly here. Only the primary direction of transport // is instrumented this way. 
- if wri.uncompSize != 0 { - var sized netstats.SizesStruct - sized.Method = s.method - sized.Length = int64(wri.uncompSize) - s.netReporter.CountSend(ctx, sized) - s.netReporter.SetSpanSizeAttributes(ctx, sized) - } + var sized netstats.SizesStruct + sized.Method = s.method + sized.Length = int64(wri.uncompSize) + s.netReporter.CountSend(ctx, sized) if err := s.client.Send(batch); err != nil { // The error will be sent to errCh during cleanup for this stream. @@ -380,24 +374,24 @@ func (s *Stream) read(_ context.Context) error { } if err = s.processBatchStatus(resp); err != nil { - return fmt.Errorf("process: %w", err) + return err } } } // getSenderChannel takes the stream lock and removes the corresonding // sender channel. -func (sws *streamWorkState) getSenderChannel(status *arrowpb.BatchStatus) (chan<- error, error) { +func (sws *streamWorkState) getSenderChannel(bstat *arrowpb.BatchStatus) (chan<- error, error) { sws.lock.Lock() defer sws.lock.Unlock() - ch, ok := sws.waiters[status.BatchId] + ch, ok := sws.waiters[bstat.BatchId] if !ok { // Will break the stream. - return nil, fmt.Errorf("unrecognized batch ID: %d", status.BatchId) + return nil, status.Errorf(codes.Internal, "unrecognized batch ID: %d", bstat.BatchId) } - delete(sws.waiters, status.BatchId) + delete(sws.waiters, bstat.BatchId) return ch, nil } @@ -458,7 +452,7 @@ func (s *Stream) encode(records any) (_ *arrowpb.BatchArrowRecords, retErr error zap.Reflect("recovered", err), zap.Stack("stacktrace"), ) - retErr = fmt.Errorf("panic in otel-arrow-adapter: %v", err) + retErr = status.Errorf(codes.Internal, "panic in otel-arrow-adapter: %v", err) } }() var batch *arrowpb.BatchArrowRecords @@ -471,7 +465,7 @@ func (s *Stream) encode(records any) (_ *arrowpb.BatchArrowRecords, retErr error case pmetric.Metrics: batch, err = s.producer.BatchArrowRecordsFromMetrics(data) default: - return nil, fmt.Errorf("unsupported OTLP type: %T", records) + return nil, status.Errorf(codes.Unimplemented, "unsupported OTel-Arrow signal type: %T", records) } return batch, err } diff --git a/exporter/otelarrowexporter/internal/arrow/stream_test.go b/exporter/otelarrowexporter/internal/arrow/stream_test.go index e916667c455c..416f80864b17 100644 --- a/exporter/otelarrowexporter/internal/arrow/stream_test.go +++ b/exporter/otelarrowexporter/internal/arrow/stream_test.go @@ -15,9 +15,10 @@ import ( "github.com/open-telemetry/otel-arrow/collector/netstats" arrowRecordMock "github.com/open-telemetry/otel-arrow/pkg/otel/arrow_record/mock" "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/consumer/consumererror" "go.uber.org/mock/gomock" "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) var oneBatch = &arrowpb.BatchArrowRecords{ @@ -44,7 +45,7 @@ func newStreamTestCase(t *testing.T, pname PrioritizerName) *streamTestCase { producer := arrowRecordMock.NewMockProducerAPI(ctrl) bg, dc := newDoneCancel(context.Background()) - prio, state := newStreamPrioritizer(dc, pname, 1) + prio, state := newStreamPrioritizer(dc, pname, 1, 10*time.Second) ctc := newCommonTestCase(t, NotNoisy) cts := ctc.newMockStream(bg) @@ -53,7 +54,6 @@ func newStreamTestCase(t *testing.T, pname PrioritizerName) *streamTestCase { ctc.requestMetadataCall.AnyTimes().Return(nil, nil) stream := newStream(producer, prio, ctc.telset, netstats.Noop{}, state[0]) - stream.maxStreamLifetime = 10 * time.Second fromTracesCall := producer.EXPECT().BatchArrowRecordsFromTraces(gomock.Any()).Times(0) fromMetricsCall := 
producer.EXPECT().BatchArrowRecordsFromMetrics(gomock.Any()).Times(0) @@ -143,7 +143,6 @@ func TestStreamNoMaxLifetime(t *testing.T) { t.Run(string(pname), func(t *testing.T) { tc := newStreamTestCase(t, pname) - tc.stream.maxStreamLifetime = 0 tc.fromTracesCall.Times(1).Return(oneBatch, nil) tc.closeSendCall.Times(0) @@ -182,8 +181,12 @@ func TestStreamEncodeError(t *testing.T) { // sender should get a permanent testErr err := tc.mustSendAndWait() require.Error(t, err) - require.True(t, errors.Is(err, testErr)) - require.True(t, consumererror.IsPermanent(err)) + + stat, is := status.FromError(err) + require.True(t, is, "is a gRPC status error: %v", err) + require.Equal(t, codes.Internal, stat.Code()) + + require.Contains(t, stat.Message(), testErr.Error()) }) } } diff --git a/exporter/otelarrowexporter/otelarrow.go b/exporter/otelarrowexporter/otelarrow.go index e3f0a76fe6b3..67beb5dee75e 100644 --- a/exporter/otelarrowexporter/otelarrow.go +++ b/exporter/otelarrowexporter/otelarrow.go @@ -10,7 +10,7 @@ import ( "runtime" "time" - arrowPkg "github.com/apache/arrow/go/v14/arrow" + arrowPkg "github.com/apache/arrow/go/v16/arrow" "github.com/open-telemetry/otel-arrow/collector/compression/zstd" "github.com/open-telemetry/otel-arrow/collector/netstats" arrowRecord "github.com/open-telemetry/otel-arrow/pkg/otel/arrow_record" diff --git a/exporter/otelarrowexporter/otelarrow_test.go b/exporter/otelarrowexporter/otelarrow_test.go index 558f45b43b8b..15b813550b82 100644 --- a/exporter/otelarrowexporter/otelarrow_test.go +++ b/exporter/otelarrowexporter/otelarrow_test.go @@ -935,10 +935,6 @@ func testSendArrowTraces(t *testing.T, clientWaitForReady, streamServiceAvailabl require.NoError(t, err) require.NotNil(t, exp) - defer func() { - assert.NoError(t, exp.Shutdown(context.Background())) - }() - type isUserCall struct{} host := newHostWithExtensions( @@ -956,6 +952,15 @@ func testSendArrowTraces(t *testing.T, clientWaitForReady, streamServiceAvailabl assert.NoError(t, exp.Start(context.Background(), host)) rcv, _ := otelArrowTracesReceiverOnGRPCServer(ln, false) + + defer func() { + // Shutdown before GracefulStop, because otherwise we + // wait for a full stream lifetime instead of closing + // after requests are served. + assert.NoError(t, exp.Shutdown(context.Background())) + rcv.srv.GracefulStop() + }() + if streamServiceAvailable { rcv.startStreamMockArrowTraces(t, okStatusFor) } @@ -988,8 +993,6 @@ func testSendArrowTraces(t *testing.T, clientWaitForReady, streamServiceAvailabl md := rcv.getMetadata() require.EqualValues(t, []string{"arrow"}, md.Get("callerid")) require.EqualValues(t, expectedHeader, md.Get("header")) - - rcv.srv.GracefulStop() } func okStatusFor(id int64) *arrowpb.BatchStatus { @@ -1102,16 +1105,17 @@ func TestSendArrowFailedTraces(t *testing.T) { require.NoError(t, err) require.NotNil(t, exp) - defer func() { - assert.NoError(t, exp.Shutdown(context.Background())) - }() - host := componenttest.NewNopHost() assert.NoError(t, exp.Start(context.Background(), host)) rcv, _ := otelArrowTracesReceiverOnGRPCServer(ln, false) rcv.startStreamMockArrowTraces(t, failedStatusFor) + defer func() { + assert.NoError(t, exp.Shutdown(context.Background())) + rcv.srv.GracefulStop() + }() + // Delay the server start, slightly. 
go func() { time.Sleep(100 * time.Millisecond) @@ -1133,8 +1137,6 @@ func TestSendArrowFailedTraces(t *testing.T) { assert.EqualValues(t, int32(2), rcv.totalItems.Load()) assert.EqualValues(t, int32(1), rcv.requestCount.Load()) assert.EqualValues(t, td, rcv.getLastRequest()) - - rcv.srv.GracefulStop() } func TestUserDialOptions(t *testing.T) { diff --git a/receiver/otelarrowreceiver/README.md b/receiver/otelarrowreceiver/README.md index f0fee43036c4..043a2a8100a5 100644 --- a/receiver/otelarrowreceiver/README.md +++ b/receiver/otelarrowreceiver/README.md @@ -87,6 +87,13 @@ When the limit is reached, the receiver will return RESOURCE_EXHAUSTED error codes to the receiver, which are [conditionally retryable, see exporter retry configuration](https://github.com/open-telemetry/opentelemetry-collector/blob/main/exporter/exporterhelper/README.md). +- `admission_limit_mib` (default: 64): limits the number of requests that are received by the stream based on request size information available. This should not be confused with `memory_limit_mib` which limits allocations made by the consumer when translating arrow records into pdata objects. i.e. request size is used to control how much traffic we admit, but does not control how much memory is used during request processing. + +- `waiter_limit` (default: 1000): limits the number of requests waiting on admission once `admission_limit_mib` is reached. This is another dimension of memory limiting that ensures waiters are not holding onto a significant amount of memory while waiting to be processed. + +`admission_limit_mib` and `waiter_limit` are arguments supplied to [admission.BoundedQueue](https://github.com/open-telemetry/otel-arrow/tree/main/collector/admission). This custom semaphore is meant to be used within receivers to help limit memory within the collector pipeline. + + ### Compression Configuration In the `arrow` configuration block, `zstd` sub-section applies to all diff --git a/receiver/otelarrowreceiver/config.go b/receiver/otelarrowreceiver/config.go index 38b8e2a1aa2c..8361c0ba7bc8 100644 --- a/receiver/otelarrowreceiver/config.go +++ b/receiver/otelarrowreceiver/config.go @@ -24,6 +24,16 @@ type ArrowConfig struct { // passing through, they will see ResourceExhausted errors. MemoryLimitMiB uint64 `mapstructure:"memory_limit_mib"` + // AdmissionLimitMiB limits the number of requests that are received by the stream based on + // request size information available. Request size is used to control how much traffic we admit + // for processing, but does not control how much memory is used during request processing. + AdmissionLimitMiB uint64 `mapstructure:"admission_limit_mib"` + + // WaiterLimit is the limit on the number of waiters waiting to be processed and consumed. + // This is a dimension of memory limiting to ensure waiters are not consuming an + // unexpectedly large amount of memory in the arrow receiver. + WaiterLimit int64 `mapstructure:"waiter_limit"` + // Zstd settings apply to OTel-Arrow use of gRPC specifically. 
Zstd zstd.DecoderConfig `mapstructure:"zstd"` } diff --git a/receiver/otelarrowreceiver/config_test.go b/receiver/otelarrowreceiver/config_test.go index f756e1b6b1a5..fc25f24abd89 100644 --- a/receiver/otelarrowreceiver/config_test.go +++ b/receiver/otelarrowreceiver/config_test.go @@ -77,7 +77,9 @@ func TestUnmarshalConfig(t *testing.T) { }, }, Arrow: ArrowConfig{ - MemoryLimitMiB: 123, + MemoryLimitMiB: 123, + AdmissionLimitMiB: 80, + WaiterLimit: 100, }, }, }, cfg) @@ -101,7 +103,9 @@ func TestUnmarshalConfigUnix(t *testing.T) { ReadBufferSize: 512 * 1024, }, Arrow: ArrowConfig{ - MemoryLimitMiB: defaultMemoryLimitMiB, + MemoryLimitMiB: defaultMemoryLimitMiB, + AdmissionLimitMiB: defaultAdmissionLimitMiB, + WaiterLimit: defaultWaiterLimit, }, }, }, cfg) diff --git a/receiver/otelarrowreceiver/factory.go b/receiver/otelarrowreceiver/factory.go index e59c8fdc5d77..48279d76cc3a 100644 --- a/receiver/otelarrowreceiver/factory.go +++ b/receiver/otelarrowreceiver/factory.go @@ -19,7 +19,9 @@ import ( const ( defaultGRPCEndpoint = "0.0.0.0:4317" - defaultMemoryLimitMiB = 128 + defaultMemoryLimitMiB = 128 + defaultAdmissionLimitMiB = defaultMemoryLimitMiB / 2 + defaultWaiterLimit = 1000 ) // NewFactory creates a new OTel-Arrow receiver factory. @@ -45,7 +47,9 @@ func createDefaultConfig() component.Config { ReadBufferSize: 512 * 1024, }, Arrow: ArrowConfig{ - MemoryLimitMiB: defaultMemoryLimitMiB, + MemoryLimitMiB: defaultMemoryLimitMiB, + AdmissionLimitMiB: defaultAdmissionLimitMiB, + WaiterLimit: defaultWaiterLimit, }, }, } @@ -108,7 +112,7 @@ func createLog( return r, nil } -// This is the map of already created OTLP receivers for particular configurations. +// This is the map of already created OTel-Arrow receivers for particular configurations. // We maintain this map because the Factory is asked trace and metric receivers separately // when it gets CreateTracesReceiver() and CreateMetricsReceiver() but they must not // create separate objects, they must use one otelArrowReceiver object per configuration. 
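As an illustrative sketch of how the new admission settings documented in the README and `config.go` hunks above would be configured (an assumption-laden example, not part of this patch: it uses the factory defaults of 128 MiB memory limit, 64 MiB admission limit, and 1000 waiters from `factory.go`, and assumes the `arrow` block remains nested under `protocols` alongside `grpc` as in the receiver's existing testdata):

```yaml
receivers:
  otelarrow:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      arrow:
        # Bounds allocations made while translating Arrow records into pdata.
        memory_limit_mib: 128
        # Bounds the total request size admitted for processing
        # (enforced via admission.BoundedQueue, per the README hunk above).
        admission_limit_mib: 64
        # Bounds how many requests may wait once the admission limit is reached.
        waiter_limit: 1000
```
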
diff --git a/receiver/otelarrowreceiver/go.mod b/receiver/otelarrowreceiver/go.mod index a3023565ad41..fc0d932e293b 100644 --- a/receiver/otelarrowreceiver/go.mod +++ b/receiver/otelarrowreceiver/go.mod @@ -4,7 +4,7 @@ go 1.21.0 require ( github.com/open-telemetry/otel-arrow v0.24.0 - github.com/open-telemetry/otel-arrow/collector v0.23.0 + github.com/open-telemetry/otel-arrow/collector v0.24.0 github.com/stretchr/testify v1.9.0 go.opentelemetry.io/collector v0.102.2-0.20240611143128-7dfb57b9ad1c go.opentelemetry.io/collector/component v0.102.2-0.20240611143128-7dfb57b9ad1c @@ -28,13 +28,16 @@ require ( go.uber.org/zap v1.27.0 golang.org/x/net v0.26.0 google.golang.org/grpc v1.64.0 + google.golang.org/protobuf v1.34.2 ) require ( github.com/HdrHistogram/hdrhistogram-go v1.1.2 // indirect github.com/apache/arrow/go/v16 v16.1.0 // indirect github.com/axiomhq/hyperloglog v0.0.0-20230201085229-3ddf4bad03dc // indirect + github.com/bahlo/generic-list-go v0.2.0 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/buger/jsonparser v1.1.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc // indirect @@ -55,6 +58,7 @@ require ( github.com/knadh/koanf/maps v0.1.1 // indirect github.com/knadh/koanf/providers/confmap v0.1.0 // indirect github.com/knadh/koanf/v2 v2.1.1 // indirect + github.com/mailru/easyjson v0.7.7 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect @@ -66,6 +70,7 @@ require ( github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.54.0 // indirect github.com/prometheus/procfs v0.15.0 // indirect + github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect github.com/x448/float16 v0.8.4 // indirect github.com/zeebo/xxh3 v1.0.2 // indirect go.opentelemetry.io/collector/config/configcompression v1.9.1-0.20240611143128-7dfb57b9ad1c // indirect @@ -85,6 +90,5 @@ require ( golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240520151616-dc85e6b867a5 // indirect - google.golang.org/protobuf v1.34.2 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/receiver/otelarrowreceiver/go.sum b/receiver/otelarrowreceiver/go.sum index db2d6d9f1446..1e920d341328 100644 --- a/receiver/otelarrowreceiver/go.sum +++ b/receiver/otelarrowreceiver/go.sum @@ -7,10 +7,14 @@ github.com/apache/arrow/go/v16 v16.1.0 h1:dwgfOya6s03CzH9JrjCBx6bkVb4yPD4ma3haj9 github.com/apache/arrow/go/v16 v16.1.0/go.mod h1:9wnc9mn6vEDTRIm4+27pEjQpRKuTvBaessPoEXQzxWA= github.com/axiomhq/hyperloglog v0.0.0-20230201085229-3ddf4bad03dc h1:Keo7wQ7UODUaHcEi7ltENhbAK2VgZjfat6mLy03tQzo= github.com/axiomhq/hyperloglog v0.0.0-20230201085229-3ddf4bad03dc/go.mod h1:k08r+Yj1PRAmuayFiRK6MYuR5Ve4IuZtTfxErMIh0+c= +github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk= +github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/brianvoe/gofakeit/v6 v6.17.0 h1:obbQTJeHfktJtiZzq0Q1bEpsNUs+yHrYlPVWt7BtmJ4= github.com/brianvoe/gofakeit/v6 v6.17.0/go.mod 
h1:Ow6qC71xtwm79anlwKRlWZW6zVq9D2XHE4QSSMP/rU8= +github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs= +github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= @@ -49,6 +53,7 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/go-version v1.7.0 h1:5tqGy27NaOTB8yJKUZELlFAS/LTKJkrmONwQKeRZfjY= github.com/hashicorp/go-version v1.7.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= @@ -70,6 +75,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= @@ -84,8 +91,8 @@ github.com/mostynb/go-grpc-compression v1.2.3/go.mod h1:AghIxF3P57umzqM9yz795+y1 github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/open-telemetry/otel-arrow v0.24.0 h1:hNUEbwHW/1gEOUiN+HoI+ITiXe2vSBaPWlE9FRwJwDE= github.com/open-telemetry/otel-arrow v0.24.0/go.mod h1:uzoHixEh6CUBZkP+vkRvyiHYUnYsAOUwCcfByQkSMM0= -github.com/open-telemetry/otel-arrow/collector v0.23.0 h1:ztmq1ipJBhm4xWjHDbmKOtgP3Nl/ZDoLX+3ThhzFs6k= -github.com/open-telemetry/otel-arrow/collector v0.23.0/go.mod h1:SLgLEhhcfR9MjG1taK8RPuwiuIoAPW7IpCjFBobwIUM= +github.com/open-telemetry/otel-arrow/collector v0.24.0 h1:NYTcgtwG0lQnoGcEomTTtueZxzk03xt+XEXN4L5kqHA= +github.com/open-telemetry/otel-arrow/collector v0.24.0/go.mod h1:+jJ3Vfhh685hXSw2Z1P1wl/rTqEKlSaJ4FocZI+xs+0= github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -105,6 +112,8 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/wk8/go-ordered-map/v2 v2.1.8 
h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/fJgbpc= +github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= diff --git a/receiver/otelarrowreceiver/internal/arrow/arrow.go b/receiver/otelarrowreceiver/internal/arrow/arrow.go index c3808c749f41..1aee357d9df1 100644 --- a/receiver/otelarrowreceiver/internal/arrow/arrow.go +++ b/receiver/otelarrowreceiver/internal/arrow/arrow.go @@ -8,15 +8,20 @@ import ( "errors" "fmt" "io" + "net" + "runtime" + "strconv" "strings" + "sync" + "sync/atomic" arrowpb "github.com/open-telemetry/otel-arrow/api/experimental/arrow/v1" + "github.com/open-telemetry/otel-arrow/collector/admission" "github.com/open-telemetry/otel-arrow/collector/netstats" arrowRecord "github.com/open-telemetry/otel-arrow/pkg/otel/arrow_record" "go.opentelemetry.io/collector/client" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/config/configgrpc" - "go.opentelemetry.io/collector/config/configtelemetry" "go.opentelemetry.io/collector/consumer" "go.opentelemetry.io/collector/consumer/consumererror" "go.opentelemetry.io/collector/extension/auth" @@ -27,6 +32,7 @@ import ( "go.opentelemetry.io/collector/receiver/receiverhelper" "go.opentelemetry.io/otel" otelcodes "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" "go.opentelemetry.io/otel/trace" "go.uber.org/multierr" @@ -36,20 +42,20 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/metadata" "google.golang.org/grpc/status" - - md "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelarrowreceiver/internal/metadata" + "google.golang.org/protobuf/proto" ) const ( streamFormat = "arrow" hpackMaxDynamicSize = 4096 + scopeName = "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelarrowreceiver" ) var ( ErrNoMetricsConsumer = fmt.Errorf("no metrics consumer") ErrNoLogsConsumer = fmt.Errorf("no logs consumer") ErrNoTracesConsumer = fmt.Errorf("no traces consumer") - ErrUnrecognizedPayload = fmt.Errorf("unrecognized OTLP payload") + ErrUnrecognizedPayload = consumererror.NewPermanent(fmt.Errorf("unrecognized OTel-Arrow payload")) ) type Consumers interface { @@ -65,14 +71,23 @@ type Receiver struct { arrowpb.UnsafeArrowLogsServiceServer arrowpb.UnsafeArrowMetricsServiceServer - telemetry component.TelemetrySettings - tracer trace.Tracer - obsrecv *receiverhelper.ObsReport - gsettings configgrpc.ServerConfig - authServer auth.Server - newConsumer func() arrowRecord.ConsumerAPI - netReporter netstats.Interface - telemetryBuilder *md.TelemetryBuilder + telemetry component.TelemetrySettings + tracer trace.Tracer + obsrecv *receiverhelper.ObsReport + gsettings configgrpc.ServerConfig + authServer auth.Server + newConsumer func() arrowRecord.ConsumerAPI + netReporter netstats.Interface + recvInFlightBytes metric.Int64UpDownCounter + recvInFlightItems metric.Int64UpDownCounter + recvInFlightRequests metric.Int64UpDownCounter + boundedQueue *admission.BoundedQueue +} + +// receiverStream holds the inFlightWG for a single stream. +type receiverStream struct { + *Receiver + inFlightWG sync.WaitGroup } // New creates a new Receiver reference. 
@@ -83,23 +98,48 @@ func New( gsettings configgrpc.ServerConfig, authServer auth.Server, newConsumer func() arrowRecord.ConsumerAPI, + bq *admission.BoundedQueue, netReporter netstats.Interface, ) (*Receiver, error) { - telemetryBuilder, err := md.NewTelemetryBuilder(set.TelemetrySettings) - if err != nil { - return nil, err + tracer := set.TelemetrySettings.TracerProvider.Tracer("otel-arrow-receiver") + var errors, err error + recv := &Receiver{ + Consumers: cs, + obsrecv: obsrecv, + telemetry: set.TelemetrySettings, + tracer: tracer, + authServer: authServer, + newConsumer: newConsumer, + gsettings: gsettings, + netReporter: netReporter, + boundedQueue: bq, + } + + meter := recv.telemetry.MeterProvider.Meter(scopeName) + recv.recvInFlightBytes, err = meter.Int64UpDownCounter( + "otel_arrow_receiver_in_flight_bytes", + metric.WithDescription("Number of bytes in flight"), + metric.WithUnit("By"), + ) + errors = multierr.Append(errors, err) + + recv.recvInFlightItems, err = meter.Int64UpDownCounter( + "otel_arrow_receiver_in_flight_items", + metric.WithDescription("Number of items in flight"), + ) + errors = multierr.Append(errors, err) + + recv.recvInFlightRequests, err = meter.Int64UpDownCounter( + "otel_arrow_receiver_in_flight_requests", + metric.WithDescription("Number of requests in flight"), + ) + errors = multierr.Append(errors, err) + + if errors != nil { + return nil, errors } - return &Receiver{ - Consumers: cs, - obsrecv: obsrecv, - telemetry: set.TelemetrySettings, - tracer: md.Tracer(set.TelemetrySettings), - authServer: authServer, - newConsumer: newConsumer, - gsettings: gsettings, - netReporter: netReporter, - telemetryBuilder: telemetryBuilder, - }, nil + + return recv, nil } // headerReceiver contains the state necessary to decode per-request metadata @@ -250,7 +290,7 @@ func (h *headerReceiver) newContext(ctx context.Context, hdrs map[string][]strin } // logStreamError decides how to log an error. -func (r *Receiver) logStreamError(err error) { +func (r *Receiver) logStreamError(err error, where string) { var code codes.Code var msg string // gRPC tends to supply status-wrapped errors, so we always @@ -270,9 +310,9 @@ func (r *Receiver) logStreamError(err error) { } if code == codes.Canceled { - r.telemetry.Logger.Debug("arrow stream shutdown", zap.String("message", msg)) + r.telemetry.Logger.Debug("arrow stream shutdown", zap.String("message", msg), zap.String("where", where)) } else { - r.telemetry.Logger.Error("arrow stream error", zap.String("message", msg), zap.Int("code", int(code))) + r.telemetry.Logger.Error("arrow stream error", zap.Int("code", int(code)), zap.String("message", msg), zap.String("where", where)) } } @@ -304,247 +344,555 @@ type anyStreamServer interface { grpc.ServerStream } +type batchResp struct { + id int64 + err error +} + +func (r *Receiver) recoverErr(retErr *error) { + if err := recover(); err != nil { + // When this happens, the stacktrace is + // important and lost if we don't capture it + // here. 
+ r.telemetry.Logger.Error("panic detail in otel-arrow-adapter", + zap.Reflect("recovered", err), + zap.Stack("stacktrace"), + ) + *retErr = status.Errorf(codes.Internal, "panic in otel-arrow-adapter: %v", err) + } +} + func (r *Receiver) anyStream(serverStream anyStreamServer, method string) (retErr error) { streamCtx := serverStream.Context() ac := r.newConsumer() - hrcv := newHeaderReceiver(serverStream.Context(), r.authServer, r.gsettings.IncludeMetadata) defer func() { - if err := recover(); err != nil { - // When this happens, the stacktrace is - // important and lost if we don't capture it - // here. - r.telemetry.Logger.Debug("panic detail in otel-arrow-adapter", - zap.Reflect("recovered", err), - zap.Stack("stacktrace"), - ) - retErr = fmt.Errorf("panic in otel-arrow-adapter: %v", err) - } if err := ac.Close(); err != nil { r.telemetry.Logger.Error("arrow stream close", zap.Error(err)) } }() + defer r.recoverErr(&retErr) + + // doneCancel allows an error in the sender/receiver to + // interrupt the corresponding thread. + doneCtx, doneCancel := context.WithCancel(streamCtx) + defer doneCancel() + + // streamErrCh returns up to two errors from the sender and + // receiver threads started below. + streamErrCh := make(chan error, 2) + pendingCh := make(chan batchResp, runtime.NumCPU()) + + // wg is used to ensure this thread returns after both + // sender and recevier threads return. + var sendWG sync.WaitGroup + var recvWG sync.WaitGroup + sendWG.Add(1) + recvWG.Add(1) + + rstream := &receiverStream{ + Receiver: r, + } - for { - // Receive a batch corresponding with one ptrace.Traces, pmetric.Metrics, - // or plog.Logs item. - req, err := serverStream.Recv() - if err != nil { - // This includes the case where a client called CloseSend(), in - // which case we see an EOF error here. - r.logStreamError(err) - - if errors.Is(err, io.EOF) { - return status.Error(codes.Canceled, "client stream shutdown") - } else if errors.Is(err, context.Canceled) { - return status.Error(codes.Canceled, "server stream shutdown") - } - return err + go func() { + var err error + defer recvWG.Done() + defer r.recoverErr(&err) + err = rstream.srvReceiveLoop(doneCtx, serverStream, pendingCh, method, ac) + streamErrCh <- err + }() + + go func() { + var err error + defer sendWG.Done() + defer r.recoverErr(&err) + err = rstream.srvSendLoop(doneCtx, serverStream, &recvWG, pendingCh) + streamErrCh <- err + }() + + // Wait for sender/receiver threads to return before returning. + defer recvWG.Wait() + defer sendWG.Wait() + + select { + case <-doneCtx.Done(): + return status.Error(codes.Canceled, "server stream shutdown") + case retErr = <-streamErrCh: + doneCancel() + return + } +} + +func (r *receiverStream) newInFlightData(ctx context.Context, method string, batchID int64, pendingCh chan<- batchResp) (context.Context, *inFlightData) { + ctx, span := r.tracer.Start(ctx, "otel_arrow_stream_inflight") + + r.inFlightWG.Add(1) + r.recvInFlightRequests.Add(ctx, 1) + id := &inFlightData{ + receiverStream: r, + method: method, + batchID: batchID, + pendingCh: pendingCh, + span: span, + } + id.refs.Add(1) + return ctx, id +} + +// inFlightData is responsible for storing the resources held by one request. +type inFlightData struct { + // Receiver is the owner of the resources held by this object. + *receiverStream + + method string + batchID int64 + pendingCh chan<- batchResp + span trace.Span + + // refs counts the number of goroutines holding this object. 
+ // initially the recvOne() body, on success the + // consumeAndRespond() function. + refs atomic.Int32 + + numAcquired int64 // how many bytes held in the semaphore + numItems int // how many items + uncompSize int64 // uncompressed data size +} + +func (id *inFlightData) recvDone(ctx context.Context, recvErrPtr *error) { + retErr := *recvErrPtr + + if retErr != nil { + // logStreamError because this response will break the stream. + id.logStreamError(retErr, "recv") + id.span.SetStatus(otelcodes.Error, retErr.Error()) + } + + id.anyDone(ctx) +} + +func (id *inFlightData) consumeDone(ctx context.Context, consumeErrPtr *error) { + retErr := *consumeErrPtr + + if retErr != nil { + // debug-level because the error was external from the pipeline. + id.telemetry.Logger.Debug("otel-arrow consume", zap.Error(retErr)) + id.span.SetStatus(otelcodes.Error, retErr.Error()) + } + + id.replyToCaller(retErr) + id.anyDone(ctx) +} + +func (id *inFlightData) replyToCaller(callerErr error) { + id.pendingCh <- batchResp{ + id: id.batchID, + err: callerErr, + } +} + +func (id *inFlightData) anyDone(ctx context.Context) { + // check if there are still refs, in which case leave the in-flight + // counts where they are. + if id.refs.Add(-1) != 0 { + return + } + + id.span.End() + + if id.numAcquired != 0 { + if err := id.boundedQueue.Release(id.numAcquired); err != nil { + id.telemetry.Logger.Error("release error", zap.Error(err)) } + } - // Check for optional headers and set the incoming context. - thisCtx, authHdrs, err := hrcv.combineHeaders(streamCtx, req.GetHeaders()) - if err != nil { - // Failing to parse the incoming headers breaks the stream. - r.telemetry.Logger.Error("arrow metadata error", zap.Error(err)) - return err + if id.uncompSize != 0 { + id.recvInFlightBytes.Add(ctx, -id.uncompSize) + } + if id.numItems != 0 { + id.recvInFlightItems.Add(ctx, int64(-id.numItems)) + } + + // The netstats code knows that uncompressed size is + // unreliable for arrow transport, so we instrument it + // directly here. Only the primary direction of transport + // is instrumented this way. + var sized netstats.SizesStruct + sized.Method = id.method + sized.Length = id.uncompSize + id.netReporter.CountReceive(ctx, sized) + + id.recvInFlightRequests.Add(ctx, -1) + id.inFlightWG.Done() +} + +// recvOne begins processing a single Arrow batch. +// +// If an error is encountered before Arrow data is successfully consumed, +// the stream will break and the error will be returned immediately. +// +// If the error is due to authorization, the stream remains unbroken +// and the request fails. +// +// If not enough resources are available, the stream will block (if +// waiting permitted) or break (insufficient waiters). +// +// Assuming success, a new goroutine is created to handle consuming the +// data. +// +// This handles constructing an inFlightData object, which itself +// tracks everything that needs to be used by instrumention when the +// batch finishes. +func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStreamServer, hrcv *headerReceiver, pendingCh chan<- batchResp, method string, ac arrowRecord.ConsumerAPI) (retErr error) { + + // Receive a batch corresponding with one ptrace.Traces, pmetric.Metrics, + // or plog.Logs item. + req, err := serverStream.Recv() + + // inflightCtx is carried through into consumeAndProcess on the success path. 
+ inflightCtx, flight := r.newInFlightData(streamCtx, method, req.GetBatchId(), pendingCh) + defer flight.recvDone(inflightCtx, &retErr) + + if err != nil { + if errors.Is(err, io.EOF) { + return status.Error(codes.Canceled, "client stream shutdown") + } else if errors.Is(err, context.Canceled) { + return status.Error(codes.Canceled, "server stream shutdown") } + // Note: err is directly from gRPC, should already have status. + return err + } + // Check for optional headers and set the incoming context. + inflightCtx, authHdrs, err := hrcv.combineHeaders(inflightCtx, req.GetHeaders()) + if err != nil { + // Failing to parse the incoming headers breaks the stream. + return status.Errorf(codes.Internal, "arrow metadata error: %v", err) + } + + // start this span after hrcv.combineHeaders returns extracted context. This will allow this span + // to be a part of the data path trace instead of only being included as a child of the stream inflight trace. + inflightCtx, span := r.tracer.Start(inflightCtx, "otel_arrow_stream_recv") + defer span.End() + + // Authorize the request, if configured, prior to acquiring resources. + if r.authServer != nil { var authErr error - if r.authServer != nil { - var newCtx context.Context - if newCtx, err = r.authServer.Authenticate(thisCtx, authHdrs); err != nil { - authErr = err - } else { - thisCtx = newCtx - } + inflightCtx, authErr = r.authServer.Authenticate(inflightCtx, authHdrs) + if authErr != nil { + flight.replyToCaller(status.Error(codes.Unauthenticated, authErr.Error())) + return nil + } + } + + var prevAcquiredBytes int64 + uncompSizeHeaderStr, uncompSizeHeaderFound := authHdrs["otlp-pdata-size"] + if !uncompSizeHeaderFound || len(uncompSizeHeaderStr) == 0 { + // This is a compressed size so make sure to acquire the difference when request is decompressed. + prevAcquiredBytes = int64(proto.Size(req)) + } else { + prevAcquiredBytes, err = strconv.ParseInt(uncompSizeHeaderStr[0], 10, 64) + if err != nil { + return status.Errorf(codes.Internal, "failed to convert string to request size: %v", err) } + } - if err := r.processAndConsume(thisCtx, method, ac, req, serverStream, authErr); err != nil { - return err + // Use the bounded queue to memory limit based on incoming + // uncompressed request size and waiters. Acquire will fail + // immediately if there are too many waiters, or will + // otherwise block until timeout or enough memory becomes + // available. 
+ err = r.boundedQueue.Acquire(inflightCtx, prevAcquiredBytes) + if err != nil { + return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue: %v", err) + } + flight.numAcquired = prevAcquiredBytes + + data, numItems, uncompSize, err := r.consumeBatch(ac, req) + + if err != nil { + if errors.Is(err, arrowRecord.ErrConsumerMemoryLimit) { + return status.Errorf(codes.ResourceExhausted, "otel-arrow decode: %v", err) } + return status.Errorf(codes.Internal, "otel-arrow decode: %v", err) + } + + flight.uncompSize = uncompSize + flight.numItems = numItems + + r.recvInFlightBytes.Add(inflightCtx, uncompSize) + r.recvInFlightItems.Add(inflightCtx, int64(numItems)) + + numAcquired, err := r.acquireAdditionalBytes(inflightCtx, prevAcquiredBytes, uncompSize, hrcv.connInfo.Addr, uncompSizeHeaderFound) + + flight.numAcquired = numAcquired + if err != nil { + return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue re-acquire: %v", err) } + + // Recognize that the request is still in-flight via consumeAndRespond() + flight.refs.Add(1) + + // consumeAndRespond consumes the data and returns control to the sender loop. + go r.consumeAndRespond(inflightCtx, data, flight) + + return nil } -func (r *Receiver) processAndConsume(ctx context.Context, method string, arrowConsumer arrowRecord.ConsumerAPI, req *arrowpb.BatchArrowRecords, serverStream anyStreamServer, authErr error) (retErr error) { +// consumeAndRespond finishes the span started in recvOne and logs the +// result after invoking the pipeline to consume the data. +func (r *Receiver) consumeAndRespond(ctx context.Context, data any, flight *inFlightData) { var err error + defer flight.consumeDone(ctx, &err) - ctx, span := r.tracer.Start(ctx, "otel_arrow_stream_recv") - defer span.End() + // recoverErr is a special function because it recovers panics, so we + // keep it in a separate defer than the processing above, which will + // run after the panic is recovered into an ordinary error. + defer r.recoverErr(&err) - r.telemetryBuilder.OtelArrowReceiverInFlightRequests.Add(ctx, 1) - defer func() { - r.telemetryBuilder.OtelArrowReceiverInFlightRequests.Add(ctx, -1) - // Set span status if an error is returned. - if retErr != nil { - span := trace.SpanFromContext(ctx) - span.SetStatus(otelcodes.Error, retErr.Error()) - } - }() + err = r.consumeData(ctx, data, flight) +} - // Process records: an error in this code path does - // not necessarily break the stream. - if authErr != nil { - err = authErr - } else { - err = r.processRecords(ctx, method, arrowConsumer, req) +// srvReceiveLoop repeatedly receives one batch of data. +func (r *receiverStream) srvReceiveLoop(ctx context.Context, serverStream anyStreamServer, pendingCh chan<- batchResp, method string, ac arrowRecord.ConsumerAPI) (retErr error) { + hrcv := newHeaderReceiver(ctx, r.authServer, r.gsettings.IncludeMetadata) + for { + select { + case <-ctx.Done(): + return status.Error(codes.Canceled, "server stream shutdown") + default: + if err := r.recvOne(ctx, serverStream, hrcv, pendingCh, method, ac); err != nil { + return err + } + } } +} +// srvReceiveLoop repeatedly sends one batch data response. +func (r *receiverStream) sendOne(serverStream anyStreamServer, resp batchResp) error { // Note: Statuses can be batched, but we do not take // advantage of this feature. 
- status := &arrowpb.BatchStatus{ - BatchId: req.GetBatchId(), + bs := &arrowpb.BatchStatus{ + BatchId: resp.id, } - if err == nil { - status.StatusCode = arrowpb.StatusCode_OK + if resp.err == nil { + bs.StatusCode = arrowpb.StatusCode_OK } else { - status.StatusMessage = err.Error() - switch { - case errors.Is(err, arrowRecord.ErrConsumerMemoryLimit): - r.telemetry.Logger.Error("arrow resource exhausted", zap.Error(err)) - status.StatusCode = arrowpb.StatusCode_RESOURCE_EXHAUSTED - case consumererror.IsPermanent(err): - r.telemetry.Logger.Error("arrow data error", zap.Error(err)) - status.StatusCode = arrowpb.StatusCode_INVALID_ARGUMENT - default: - r.telemetry.Logger.Debug("arrow consumer error", zap.Error(err)) - status.StatusCode = arrowpb.StatusCode_UNAVAILABLE + // Generally, code in the receiver should use + // status.Errorf(codes.XXX, ...) so that we take the + // first branch. + if gsc, ok := status.FromError(resp.err); ok { + bs.StatusCode = arrowpb.StatusCode(gsc.Code()) + bs.StatusMessage = gsc.Message() + } else { + // Ideally, we don't take this branch because all code uses + // gRPC status constructors and we've taken the branch above. + // + // This is a fallback for several broad categories of error. + bs.StatusMessage = resp.err.Error() + + switch { + case consumererror.IsPermanent(resp.err): + // Some kind of pipeline error, somewhere downstream. + r.telemetry.Logger.Error("arrow data error", zap.Error(resp.err)) + bs.StatusCode = arrowpb.StatusCode_INVALID_ARGUMENT + default: + // Probably a pipeline error, retryable. + r.telemetry.Logger.Debug("arrow consumer error", zap.Error(resp.err)) + bs.StatusCode = arrowpb.StatusCode_UNAVAILABLE + } } } - err = serverStream.Send(status) - if err != nil { - r.logStreamError(err) + if err := serverStream.Send(bs); err != nil { + // logStreamError because this response will break the stream. + r.logStreamError(err, "send") return err } + return nil } -// processRecords returns an error and a boolean indicating whether -// the error (true) was from processing the data (i.e., invalid -// argument) or (false) from the consuming pipeline. The boolean is -// not used when success (nil error) is returned. -func (r *Receiver) processRecords(ctx context.Context, method string, arrowConsumer arrowRecord.ConsumerAPI, records *arrowpb.BatchArrowRecords) error { +func (r *receiverStream) flushSender(serverStream anyStreamServer, recvWG *sync.WaitGroup, pendingCh <-chan batchResp) error { + // wait to ensure no more items are accepted + recvWG.Wait() + + // wait for all responses to be sent + r.inFlightWG.Wait() + + for { + select { + case resp := <-pendingCh: + if err := r.sendOne(serverStream, resp); err != nil { + return err + } + default: + // Currently nothing left in pendingCh. + return nil + } + } +} + +func (r *receiverStream) srvSendLoop(ctx context.Context, serverStream anyStreamServer, recvWG *sync.WaitGroup, pendingCh <-chan batchResp) error { + for { + select { + case <-ctx.Done(): + return r.flushSender(serverStream, recvWG, pendingCh) + case resp := <-pendingCh: + if err := r.sendOne(serverStream, resp); err != nil { + return err + } + } + } +} + +// consumeBatch applies the batch to the Arrow Consumer, returns a +// slice of pdata objects of the corresponding data type as `any`. +// along with the number of items and true uncompressed size. 
+func (r *Receiver) consumeBatch(arrowConsumer arrowRecord.ConsumerAPI, records *arrowpb.BatchArrowRecords) (retData any, numItems int, uncompSize int64, retErr error) { + payloads := records.GetArrowPayloads() if len(payloads) == 0 { - return nil - } - var uncompSize int64 - if r.telemetry.MetricsLevel > configtelemetry.LevelNormal { - defer func() { - // The netstats code knows that uncompressed size is - // unreliable for arrow transport, so we instrument it - // directly here. Only the primary direction of transport - // is instrumented this way. - var sized netstats.SizesStruct - sized.Method = method - if r.telemetry.MetricsLevel > configtelemetry.LevelNormal { - sized.Length = uncompSize - } - r.netReporter.CountReceive(ctx, sized) - r.netReporter.SetSpanSizeAttributes(ctx, sized) - }() + return nil, 0, 0, nil } switch payloads[0].Type { case arrowpb.ArrowPayloadType_UNIVARIATE_METRICS: if r.Metrics() == nil { - return status.Error(codes.Unimplemented, "metrics service not available") + return nil, 0, 0, status.Error(codes.Unimplemented, "metrics service not available") } var sizer pmetric.ProtoMarshaler - var numPts int - - ctx = r.obsrecv.StartMetricsOp(ctx) data, err := arrowConsumer.MetricsFrom(records) - if err != nil { - err = consumererror.NewPermanent(err) - } else { + if err == nil { for _, metrics := range data { - items := metrics.DataPointCount() - sz := int64(sizer.MetricsSize(metrics)) - - r.telemetryBuilder.OtelArrowReceiverInFlightBytes.Add(ctx, sz) - r.telemetryBuilder.OtelArrowReceiverInFlightItems.Add(ctx, int64(items)) - - numPts += items - uncompSize += sz - err = multierr.Append(err, - r.Metrics().ConsumeMetrics(ctx, metrics), - ) + numItems += metrics.DataPointCount() + uncompSize += int64(sizer.MetricsSize(metrics)) } - // entire request has been processed, decrement counter. - r.telemetryBuilder.OtelArrowReceiverInFlightBytes.Add(ctx, -uncompSize) - r.telemetryBuilder.OtelArrowReceiverInFlightItems.Add(ctx, int64(-numPts)) } - r.obsrecv.EndMetricsOp(ctx, streamFormat, numPts, err) - return err + retData = data + retErr = err case arrowpb.ArrowPayloadType_LOGS: if r.Logs() == nil { - return status.Error(codes.Unimplemented, "logs service not available") + return nil, 0, 0, status.Error(codes.Unimplemented, "logs service not available") } var sizer plog.ProtoMarshaler - var numLogs int - ctx = r.obsrecv.StartLogsOp(ctx) data, err := arrowConsumer.LogsFrom(records) - if err != nil { - err = consumererror.NewPermanent(err) - } else { + if err == nil { for _, logs := range data { - items := logs.LogRecordCount() - sz := int64(sizer.LogsSize(logs)) - - r.telemetryBuilder.OtelArrowReceiverInFlightBytes.Add(ctx, sz) - r.telemetryBuilder.OtelArrowReceiverInFlightItems.Add(ctx, int64(items)) - numLogs += items - uncompSize += sz - err = multierr.Append(err, - r.Logs().ConsumeLogs(ctx, logs), - ) + numItems += logs.LogRecordCount() + uncompSize += int64(sizer.LogsSize(logs)) } - // entire request has been processed, decrement counter. 
- r.telemetryBuilder.OtelArrowReceiverInFlightBytes.Add(ctx, -uncompSize) - r.telemetryBuilder.OtelArrowReceiverInFlightItems.Add(ctx, int64(-numLogs)) } - r.obsrecv.EndLogsOp(ctx, streamFormat, numLogs, err) - return err + retData = data + retErr = err case arrowpb.ArrowPayloadType_SPANS: if r.Traces() == nil { - return status.Error(codes.Unimplemented, "traces service not available") + return nil, 0, 0, status.Error(codes.Unimplemented, "traces service not available") } var sizer ptrace.ProtoMarshaler - var numSpans int - ctx = r.obsrecv.StartTracesOp(ctx) data, err := arrowConsumer.TracesFrom(records) - if err != nil { - err = consumererror.NewPermanent(err) - } else { + if err == nil { for _, traces := range data { - items := traces.SpanCount() - sz := int64(sizer.TracesSize(traces)) + numItems += traces.SpanCount() + uncompSize += int64(sizer.TracesSize(traces)) + } + } + retData = data + retErr = err - r.telemetryBuilder.OtelArrowReceiverInFlightBytes.Add(ctx, sz) - r.telemetryBuilder.OtelArrowReceiverInFlightItems.Add(ctx, int64(items)) + default: + retErr = ErrUnrecognizedPayload + } - numSpans += items - uncompSize += sz - err = multierr.Append(err, - r.Traces().ConsumeTraces(ctx, traces), - ) - } + return retData, numItems, uncompSize, retErr +} + +// consumeData invokes the next pipeline consumer for a received batch of data. +// it uses the standard OTel collector instrumentation (receiverhelper.ObsReport). +// +// if any errors are permanent, returns a permanent error. +func (r *Receiver) consumeData(ctx context.Context, data any, flight *inFlightData) (retErr error) { + oneOp := func(err error) { + retErr = multierr.Append(retErr, err) + } + var final func(context.Context, string, int, error) - // entire request has been processed, decrement counter. - r.telemetryBuilder.OtelArrowReceiverInFlightBytes.Add(ctx, -uncompSize) - r.telemetryBuilder.OtelArrowReceiverInFlightItems.Add(ctx, int64(-numSpans)) + switch items := data.(type) { + case []pmetric.Metrics: + ctx = r.obsrecv.StartMetricsOp(ctx) + for _, metrics := range items { + oneOp(r.Metrics().ConsumeMetrics(ctx, metrics)) } - r.obsrecv.EndTracesOp(ctx, streamFormat, numSpans, err) - return err + final = r.obsrecv.EndMetricsOp + + case []plog.Logs: + ctx = r.obsrecv.StartLogsOp(ctx) + for _, logs := range items { + oneOp(r.Logs().ConsumeLogs(ctx, logs)) + } + final = r.obsrecv.EndLogsOp + + case []ptrace.Traces: + ctx = r.obsrecv.StartTracesOp(ctx) + for _, traces := range items { + oneOp(r.Traces().ConsumeTraces(ctx, traces)) + } + final = r.obsrecv.EndTracesOp default: - return ErrUnrecognizedPayload + retErr = ErrUnrecognizedPayload + } + if final != nil { + final(ctx, streamFormat, flight.numItems, retErr) + } + return retErr +} + +func (r *Receiver) acquireAdditionalBytes(ctx context.Context, prevAcquired, uncompSize int64, addr net.Addr, uncompSizeHeaderFound bool) (int64, error) { + diff := uncompSize - prevAcquired + + if diff == 0 { + return uncompSize, nil + } + + if uncompSizeHeaderFound { + var clientAddr string + if addr != nil { + clientAddr = addr.String() + } + // a mismatch between header set by exporter and the uncompSize just calculated. + r.telemetry.Logger.Debug("mismatch between uncompressed size in receiver and otlp-pdata-size header", + zap.String("client-address", clientAddr), + zap.Int("uncompsize", int(uncompSize)), + zap.Int("otlp-pdata-size", int(prevAcquired)), + ) + } else if diff < 0 { + // proto.Size() on compressed request was greater than pdata uncompressed size. 
+ r.telemetry.Logger.Debug("uncompressed size is less than compressed size", + zap.Int("uncompressed", int(uncompSize)), + zap.Int("compressed", int(prevAcquired)), + ) + } + + if diff < 0 { + // If the difference is negative, release the overage. + if err := r.boundedQueue.Release(-diff); err != nil { + return 0, err + } + } else { + // Release previously acquired bytes to prevent deadlock and + // reacquire the uncompressed size we just calculated. + if err := r.boundedQueue.Release(prevAcquired); err != nil { + return 0, err + } + if err := r.boundedQueue.Acquire(ctx, uncompSize); err != nil { + return 0, err + } } + return uncompSize, nil } diff --git a/receiver/otelarrowreceiver/internal/arrow/arrow_test.go b/receiver/otelarrowreceiver/internal/arrow/arrow_test.go index 62f40f976296..a11362789b01 100644 --- a/receiver/otelarrowreceiver/internal/arrow/arrow_test.go +++ b/receiver/otelarrowreceiver/internal/arrow/arrow_test.go @@ -9,12 +9,15 @@ import ( "encoding/json" "fmt" "io" + "strconv" "strings" "sync" "testing" + "time" arrowpb "github.com/open-telemetry/otel-arrow/api/experimental/arrow/v1" arrowCollectorMock "github.com/open-telemetry/otel-arrow/api/experimental/arrow/v1/mock" + "github.com/open-telemetry/otel-arrow/collector/admission" "github.com/open-telemetry/otel-arrow/collector/netstats" "github.com/open-telemetry/otel-arrow/collector/testdata" arrowRecord "github.com/open-telemetry/otel-arrow/pkg/otel/arrow_record" @@ -46,6 +49,10 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelarrowreceiver/internal/arrow/mock" ) +func defaultBQ() *admission.BoundedQueue { + return admission.NewBoundedQueue(int64(100000), int64(10)) +} + type compareJSONTraces struct{ ptrace.Traces } type compareJSONMetrics struct{ pmetric.Metrics } type compareJSONLogs struct{ plog.Logs } @@ -102,7 +109,7 @@ func (healthyTestChannel) onConsume() error { type unhealthyTestChannel struct{} func (unhealthyTestChannel) onConsume() error { - return fmt.Errorf("consumer unhealthy") + return status.Errorf(codes.Unavailable, "consumer unhealthy") } type recvResult struct { @@ -270,22 +277,6 @@ func statusUnavailableFor(batchID int64, msg string) *arrowpb.BatchStatus { } } -func statusInvalidFor(batchID int64, msg string) *arrowpb.BatchStatus { - return &arrowpb.BatchStatus{ - BatchId: batchID, - StatusCode: arrowpb.StatusCode_INVALID_ARGUMENT, - StatusMessage: msg, - } -} - -func statusExhaustedFor(batchID int64, msg string) *arrowpb.BatchStatus { - return &arrowpb.BatchStatus{ - BatchId: batchID, - StatusCode: arrowpb.StatusCode_RESOURCE_EXHAUSTED, - StatusMessage: msg, - } -} - func (ctc *commonTestCase) newRealConsumer() arrowRecord.ConsumerAPI { mock := arrowRecordMock.NewMockConsumerAPI(ctc.ctrl) cons := arrowRecord.NewConsumer() @@ -320,7 +311,7 @@ func (ctc *commonTestCase) newOOMConsumer() arrowRecord.ConsumerAPI { return mock } -func (ctc *commonTestCase) start(newConsumer func() arrowRecord.ConsumerAPI, opts ...func(*configgrpc.ServerConfig, *auth.Server)) { +func (ctc *commonTestCase) start(newConsumer func() arrowRecord.ConsumerAPI, bq *admission.BoundedQueue, opts ...func(*configgrpc.ServerConfig, *auth.Server)) { var authServer auth.Server var gsettings configgrpc.ServerConfig for _, gf := range opts { @@ -344,6 +335,7 @@ func (ctc *commonTestCase) start(newConsumer func() arrowRecord.ConsumerAPI, opt gsettings, authServer, newConsumer, + bq, netstats.Noop{}, ) require.NoError(ctc.T, err) @@ -353,13 +345,129 @@ func (ctc *commonTestCase) start(newConsumer 
func() arrowRecord.ConsumerAPI, opt } func requireCanceledStatus(t *testing.T, err error) { + requireStatus(t, codes.Canceled, err) +} + +func requireUnavailableStatus(t *testing.T, err error) { + requireStatus(t, codes.Unavailable, err) +} + +func requireInternalStatus(t *testing.T, err error) { + requireStatus(t, codes.Internal, err) +} + +func requireExhaustedStatus(t *testing.T, err error) { + requireStatus(t, codes.ResourceExhausted, err) +} + +func requireStatus(t *testing.T, code codes.Code, err error) { require.Error(t, err) status, ok := status.FromError(err) require.True(t, ok, "is status-wrapped %v", err) - require.Equal(t, codes.Canceled, status.Code()) + require.Equal(t, code, status.Code()) +} + +func TestBoundedQueueWithPdataHeaders(t *testing.T) { + var sizer ptrace.ProtoMarshaler + stdTesting := otelAssert.NewStdUnitTest(t) + pdataSizeTenTraces := sizer.TracesSize(testdata.GenerateTraces(10)) + defaultBoundedQueueLimit := int64(100000) + tests := []struct { + name string + numTraces int + includePdataHeader bool + pdataSize string + rejected bool + }{ + { + name: "no header compressed greater than uncompressed", + numTraces: 10, + }, + { + name: "no header compressed less than uncompressed", + numTraces: 100, + }, + { + name: "pdata header less than uncompressedSize", + numTraces: 10, + pdataSize: strconv.Itoa(pdataSizeTenTraces / 2), + includePdataHeader: true, + }, + { + name: "pdata header equal uncompressedSize", + numTraces: 10, + pdataSize: strconv.Itoa(pdataSizeTenTraces), + includePdataHeader: true, + }, + { + name: "pdata header greater than uncompressedSize", + numTraces: 10, + pdataSize: strconv.Itoa(pdataSizeTenTraces * 2), + includePdataHeader: true, + }, + { + name: "no header compressed accepted uncompressed rejected", + numTraces: 100, + rejected: true, + }, + { + name: "pdata header accepted uncompressed rejected", + numTraces: 100, + rejected: true, + pdataSize: strconv.Itoa(pdataSizeTenTraces), + includePdataHeader: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tc := healthyTestChannel{} + ctc := newCommonTestCase(t, tc) + + td := testdata.GenerateTraces(tt.numTraces) + batch, err := ctc.testProducer.BatchArrowRecordsFromTraces(td) + require.NoError(t, err) + if tt.includePdataHeader { + var hpb bytes.Buffer + hpe := hpack.NewEncoder(&hpb) + err = hpe.WriteField(hpack.HeaderField{ + Name: "otlp-pdata-size", + Value: tt.pdataSize, + }) + assert.NoError(t, err) + batch.Headers = make([]byte, hpb.Len()) + copy(batch.Headers, hpb.Bytes()) + } + + var bq *admission.BoundedQueue + if tt.rejected { + ctc.stream.EXPECT().Send(statusOKFor(batch.BatchId)).Times(0) + bq = admission.NewBoundedQueue(int64(sizer.TracesSize(td)-100), 10) + } else { + ctc.stream.EXPECT().Send(statusOKFor(batch.BatchId)).Times(1).Return(nil) + bq = admission.NewBoundedQueue(defaultBoundedQueueLimit, 10) + } + + ctc.start(ctc.newRealConsumer, bq) + ctc.putBatch(batch, nil) + + if tt.rejected { + requireExhaustedStatus(t, ctc.wait()) + } else { + data := <-ctc.consume + actualTD := data.Data.(ptrace.Traces) + otelAssert.Equiv(stdTesting, []json.Marshaler{ + compareJSONTraces{td}, + }, []json.Marshaler{ + compareJSONTraces{actualTD}, + }) + requireCanceledStatus(t, ctc.cancelAndWait()) + } + }) + } } func TestReceiverTraces(t *testing.T) { + stdTesting := otelAssert.NewStdUnitTest(t) tc := healthyTestChannel{} ctc := newCommonTestCase(t, tc) @@ -369,10 +477,14 @@ func TestReceiverTraces(t *testing.T) { 
ctc.stream.EXPECT().Send(statusOKFor(batch.BatchId)).Times(1).Return(nil) - ctc.start(ctc.newRealConsumer) + ctc.start(ctc.newRealConsumer, defaultBQ()) ctc.putBatch(batch, nil) - assert.EqualValues(t, td, (<-ctc.consume).Data) + otelAssert.Equiv(stdTesting, []json.Marshaler{ + compareJSONTraces{td}, + }, []json.Marshaler{ + compareJSONTraces{(<-ctc.consume).Data.(ptrace.Traces)}, + }) err = ctc.cancelAndWait() requireCanceledStatus(t, err) @@ -388,7 +500,7 @@ func TestReceiverLogs(t *testing.T) { ctc.stream.EXPECT().Send(statusOKFor(batch.BatchId)).Times(1).Return(nil) - ctc.start(ctc.newRealConsumer) + ctc.start(ctc.newRealConsumer, defaultBQ()) ctc.putBatch(batch, nil) assert.EqualValues(t, []json.Marshaler{compareJSONLogs{ld}}, []json.Marshaler{compareJSONLogs{(<-ctc.consume).Data.(plog.Logs)}}) @@ -408,7 +520,7 @@ func TestReceiverMetrics(t *testing.T) { ctc.stream.EXPECT().Send(statusOKFor(batch.BatchId)).Times(1).Return(nil) - ctc.start(ctc.newRealConsumer) + ctc.start(ctc.newRealConsumer, defaultBQ()) ctc.putBatch(batch, nil) otelAssert.Equiv(stdTesting, []json.Marshaler{ @@ -425,7 +537,7 @@ func TestReceiverRecvError(t *testing.T) { tc := healthyTestChannel{} ctc := newCommonTestCase(t, tc) - ctc.start(ctc.newRealConsumer) + ctc.start(ctc.newRealConsumer, defaultBQ()) ctc.putBatch(nil, fmt.Errorf("test recv error")) @@ -442,16 +554,27 @@ func TestReceiverSendError(t *testing.T) { batch, err := ctc.testProducer.BatchArrowRecordsFromLogs(ld) require.NoError(t, err) - ctc.stream.EXPECT().Send(statusOKFor(batch.BatchId)).Times(1).Return(fmt.Errorf("test send error")) + ctc.stream.EXPECT().Send(statusOKFor(batch.BatchId)).Times(1).Return(status.Errorf(codes.Unavailable, "test send error")) - ctc.start(ctc.newRealConsumer) + ctc.start(ctc.newRealConsumer, defaultBQ()) ctc.putBatch(batch, nil) assert.EqualValues(t, ld, (<-ctc.consume).Data) + start := time.Now() + for time.Since(start) < 10*time.Second { + if ctc.ctrl.Satisfied() { + break + } + time.Sleep(time.Second) + } + + // Release the receiver -- the sender has seen an error by + // now and should return the stream. (Oddly, gRPC has no way + // to signal the receive call to fail using context.) 
+ close(ctc.receive) err = ctc.wait() - require.Error(t, err) - require.Contains(t, err.Error(), "test send error") + requireUnavailableStatus(t, err) } func TestReceiverConsumeError(t *testing.T) { @@ -485,7 +608,7 @@ func TestReceiverConsumeError(t *testing.T) { ctc.stream.EXPECT().Send(statusUnavailableFor(batch.BatchId, "consumer unhealthy")).Times(1).Return(nil) - ctc.start(ctc.newRealConsumer) + ctc.start(ctc.newRealConsumer, defaultBQ()) ctc.putBatch(batch, nil) @@ -523,7 +646,7 @@ func TestReceiverInvalidData(t *testing.T) { } for _, item := range data { - tc := unhealthyTestChannel{} + tc := healthyTestChannel{} ctc := newCommonTestCase(t, tc) var batch *arrowpb.BatchArrowRecords @@ -542,13 +665,12 @@ func TestReceiverInvalidData(t *testing.T) { batch = copyBatch(batch) - ctc.stream.EXPECT().Send(statusInvalidFor(batch.BatchId, "Permanent error: test invalid error")).Times(1).Return(nil) - - ctc.start(ctc.newErrorConsumer) + // newErrorConsumer determines the internal error in decoding above + ctc.start(ctc.newErrorConsumer, defaultBQ()) ctc.putBatch(batch, nil) - err = ctc.cancelAndWait() - requireCanceledStatus(t, err) + err = ctc.wait() + requireInternalStatus(t, err) } } @@ -579,13 +701,13 @@ func TestReceiverMemoryLimit(t *testing.T) { batch = copyBatch(batch) - ctc.stream.EXPECT().Send(statusExhaustedFor(batch.BatchId, "Permanent error: test oom error "+arrowRecord.ErrConsumerMemoryLimit.Error())).Times(1).Return(nil) + // The Recv() returns an error, there are no Send() calls. - ctc.start(ctc.newOOMConsumer) + ctc.start(ctc.newOOMConsumer, defaultBQ()) ctc.putBatch(batch, nil) - err = ctc.cancelAndWait() - requireCanceledStatus(t, err) + err = ctc.wait() + requireExhaustedStatus(t, err) } } @@ -618,6 +740,7 @@ func copyBatch(in *arrowpb.BatchArrowRecords) *arrowpb.BatchArrowRecords { func TestReceiverEOF(t *testing.T) { tc := healthyTestChannel{} ctc := newCommonTestCase(t, tc) + stdTesting := otelAssert.NewStdUnitTest(t) // send a sequence of data then simulate closing the connection. 
const times = 10 @@ -627,7 +750,7 @@ func TestReceiverEOF(t *testing.T) { ctc.stream.EXPECT().Send(gomock.Any()).Times(times).Return(nil) - ctc.start(ctc.newRealConsumer) + ctc.start(ctc.newRealConsumer, defaultBQ()) go func() { for i := 0; i < times; i++ { @@ -658,7 +781,15 @@ func TestReceiverEOF(t *testing.T) { actualData = append(actualData, (<-ctc.consume).Data.(ptrace.Traces)) } - assert.EqualValues(t, expectData, actualData) + assert.Equal(t, len(expectData), len(actualData)) + + for i := 0; i < len(expectData); i++ { + otelAssert.Equiv(stdTesting, []json.Marshaler{ + compareJSONTraces{expectData[i]}, + }, []json.Marshaler{ + compareJSONTraces{actualData[i]}, + }) + } wg.Wait() } @@ -684,7 +815,7 @@ func testReceiverHeaders(t *testing.T, includeMeta bool) { ctc.stream.EXPECT().Send(gomock.Any()).Times(len(expectData)).Return(nil) - ctc.start(ctc.newRealConsumer, func(gsettings *configgrpc.ServerConfig, _ *auth.Server) { + ctc.start(ctc.newRealConsumer, defaultBQ(), func(gsettings *configgrpc.ServerConfig, _ *auth.Server) { gsettings.IncludeMetadata = includeMeta }) @@ -756,7 +887,7 @@ func TestReceiverCancel(t *testing.T) { ctc := newCommonTestCase(t, tc) ctc.cancel() - ctc.start(ctc.newRealConsumer) + ctc.start(ctc.newRealConsumer, defaultBQ()) err := ctc.wait() requireCanceledStatus(t, err) @@ -1046,7 +1177,7 @@ func testReceiverAuthHeaders(t *testing.T, includeMeta bool, dataAuth bool) { }) var authCall *gomock.Call - ctc.start(ctc.newRealConsumer, func(gsettings *configgrpc.ServerConfig, authPtr *auth.Server) { + ctc.start(ctc.newRealConsumer, defaultBQ(), func(gsettings *configgrpc.ServerConfig, authPtr *auth.Server) { gsettings.IncludeMetadata = includeMeta as := mock.NewMockServer(ctc.ctrl) diff --git a/receiver/otelarrowreceiver/otelarrow.go b/receiver/otelarrowreceiver/otelarrow.go index 84a4ebb6487e..9a1ef1cbd4f7 100644 --- a/receiver/otelarrowreceiver/otelarrow.go +++ b/receiver/otelarrowreceiver/otelarrow.go @@ -9,6 +9,7 @@ import ( "sync" arrowpb "github.com/open-telemetry/otel-arrow/api/experimental/arrow/v1" + "github.com/open-telemetry/otel-arrow/collector/admission" "github.com/open-telemetry/otel-arrow/collector/compression/zstd" "github.com/open-telemetry/otel-arrow/collector/netstats" arrowRecord "github.com/open-telemetry/otel-arrow/pkg/otel/arrow_record" @@ -94,25 +95,26 @@ func (r *otelArrowReceiver) startGRPCServer(cfg configgrpc.ServerConfig, _ compo return nil } -func (r *otelArrowReceiver) startProtocolServers(host component.Host) error { +func (r *otelArrowReceiver) startProtocolServers(ctx context.Context, host component.Host) error { var err error var serverOpts []grpc.ServerOption if r.netReporter != nil { serverOpts = append(serverOpts, grpc.StatsHandler(r.netReporter.Handler())) } - r.serverGRPC, err = r.cfg.GRPC.ToServer(context.Background(), host, r.settings.TelemetrySettings, serverOpts...) + r.serverGRPC, err = r.cfg.GRPC.ToServer(ctx, host, r.settings.TelemetrySettings, serverOpts...) 
if err != nil { return err } var authServer auth.Server if r.cfg.GRPC.Auth != nil { - authServer, err = r.cfg.GRPC.Auth.GetServerAuthenticatorContext(context.Background(), host.GetExtensions()) + authServer, err = r.cfg.GRPC.Auth.GetServerAuthenticatorContext(ctx, host.GetExtensions()) if err != nil { return err } } + bq := admission.NewBoundedQueue(int64(r.cfg.Arrow.AdmissionLimitMiB<<20), r.cfg.Arrow.WaiterLimit) r.arrowReceiver, err = arrow.New(arrow.Consumers(r), r.settings, r.obsrepGRPC, r.cfg.GRPC, authServer, func() arrowRecord.ConsumerAPI { var opts []arrowRecord.Option @@ -124,7 +126,7 @@ func (r *otelArrowReceiver) startProtocolServers(host component.Host) error { opts = append(opts, arrowRecord.WithMeterProvider(r.settings.TelemetrySettings.MeterProvider, r.settings.TelemetrySettings.MetricsLevel)) } return arrowRecord.NewConsumer(opts...) - }, r.netReporter) + }, bq, r.netReporter) if err != nil { return err @@ -158,8 +160,8 @@ func (r *otelArrowReceiver) startProtocolServers(host component.Host) error { // Start runs the trace receiver on the gRPC server. Currently // it also enables the metrics receiver too. -func (r *otelArrowReceiver) Start(_ context.Context, host component.Host) error { - return r.startProtocolServers(host) +func (r *otelArrowReceiver) Start(ctx context.Context, host component.Host) error { + return r.startProtocolServers(ctx, host) } // Shutdown is a method to turn off receiving. diff --git a/receiver/otelarrowreceiver/otelarrow_test.go b/receiver/otelarrowreceiver/otelarrow_test.go index 38e2c5c643a3..6d2130ca4123 100644 --- a/receiver/otelarrowreceiver/otelarrow_test.go +++ b/receiver/otelarrowreceiver/otelarrow_test.go @@ -9,6 +9,7 @@ import ( "errors" "fmt" "net" + "strconv" "sync" "testing" "time" @@ -65,13 +66,13 @@ func TestGRPCNewPortAlreadyUsed(t *testing.T) { require.Error(t, r.Start(context.Background(), componenttest.NewNopHost())) } -// TestOTLPReceiverGRPCTracesIngestTest checks that the gRPC trace receiver +// TestOTelArrowReceiverGRPCTracesIngestTest checks that the gRPC trace receiver // is returning the proper response (return and metrics) when the next consumer // in the pipeline reports error. The test changes the responses returned by the // next trace consumer, checks if data was passed down the pipeline and if // proper metrics were recorded. It also uses all endpoints supported by the // trace receiver. 
-func TestOTLPReceiverGRPCTracesIngestTest(t *testing.T) { +func TestOTelArrowReceiverGRPCTracesIngestTest(t *testing.T) { type ingestionStateTest struct { okToIngest bool expectedCode codes.Code @@ -236,7 +237,7 @@ func TestShutdown(t *testing.T) { nextSink := new(consumertest.TracesSink) - // Create OTLP receiver + // Create OTelArrow receiver factory := NewFactory() cfg := factory.CreateDefaultConfig().(*Config) cfg.GRPC.NetAddr.Endpoint = endpointGrpc @@ -380,15 +381,25 @@ func (esc *errOrSinkConsumer) Reset() { type tracesSinkWithMetadata struct { consumertest.TracesSink - MDs []client.Metadata + + lock sync.Mutex + mds []client.Metadata } func (ts *tracesSinkWithMetadata) ConsumeTraces(ctx context.Context, td ptrace.Traces) error { info := client.FromContext(ctx) - ts.MDs = append(ts.MDs, info.Metadata) + ts.lock.Lock() + defer ts.lock.Unlock() + ts.mds = append(ts.mds, info.Metadata) return ts.TracesSink.ConsumeTraces(ctx, td) } +func (ts *tracesSinkWithMetadata) Metadatas() []client.Metadata { + ts.lock.Lock() + defer ts.lock.Unlock() + return ts.mds +} + type anyStreamClient interface { Send(*arrowpb.BatchArrowRecords) error Recv() (*arrowpb.BatchStatus, error) @@ -470,12 +481,12 @@ func TestGRPCArrowReceiver(t *testing.T) { assert.Equal(t, expectTraces, sink.AllTraces()) - assert.Equal(t, len(expectMDs), len(sink.MDs)) + assert.Equal(t, len(expectMDs), len(sink.Metadatas())) // gRPC adds its own metadata keys, so we check for only the // expected ones below: for idx := range expectMDs { for key, vals := range expectMDs[idx] { - require.Equal(t, vals, sink.MDs[idx].Get(key), "for key %s", key) + require.Equal(t, vals, sink.Metadatas()[idx].Get(key), "for key %s", key) } } } @@ -565,8 +576,8 @@ func TestGRPCArrowReceiverAuth(t *testing.T) { // The stream has to be successful to get this far. The // authenticator fails every data item: require.Equal(t, batch.BatchId, resp.BatchId) - require.Equal(t, arrowpb.StatusCode_UNAVAILABLE, resp.StatusCode) - require.Equal(t, errorString, resp.StatusMessage) + require.Equal(t, arrowpb.StatusCode_UNAUTHENTICATED, resp.StatusCode) + require.Contains(t, resp.StatusMessage, errorString) } assert.NoError(t, cc.Close()) @@ -574,3 +585,92 @@ func TestGRPCArrowReceiverAuth(t *testing.T) { assert.Equal(t, 0, len(sink.AllTraces())) } + +func TestConcurrentArrowReceiver(t *testing.T) { + addr := testutil.GetAvailableLocalAddress(t) + sink := new(tracesSinkWithMetadata) + + factory := NewFactory() + cfg := factory.CreateDefaultConfig().(*Config) + cfg.GRPC.NetAddr.Endpoint = addr + cfg.GRPC.IncludeMetadata = true + id := component.NewID(component.MustNewType("arrow")) + tt := componenttest.NewNopTelemetrySettings() + ocr := newReceiver(t, factory, tt, cfg, id, sink, nil) + + require.NotNil(t, ocr) + require.NoError(t, ocr.Start(context.Background(), componenttest.NewNopHost())) + + cc, err := grpc.NewClient(addr, grpc.WithTransportCredentials(insecure.NewCredentials())) + require.NoError(t, err) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + const itemsPerStream = 10 + const numStreams = 5 + + var wg sync.WaitGroup + wg.Add(numStreams) + + for j := 0; j < numStreams; j++ { + go func() { + defer wg.Done() + + client := arrowpb.NewArrowTracesServiceClient(cc) + stream, err := client.ArrowTraces(ctx, grpc.WaitForReady(true)) + require.NoError(t, err) + producer := arrowRecord.NewProducer() + + var headerBuf bytes.Buffer + hpd := hpack.NewEncoder(&headerBuf) + + // Repeatedly send traces via arrow. 
Set the expected traces + // metadata to receive. + for i := 0; i < itemsPerStream; i++ { + td := testdata.GenerateTraces(2) + + headerBuf.Reset() + err := hpd.WriteField(hpack.HeaderField{ + Name: "seq", + Value: fmt.Sprint(i), + }) + require.NoError(t, err) + + batch, err := producer.BatchArrowRecordsFromTraces(td) + require.NoError(t, err) + + batch.Headers = headerBuf.Bytes() + + err = stream.Send(batch) + + require.NoError(t, err) + + resp, err := stream.Recv() + require.NoError(t, err) + require.Equal(t, batch.BatchId, resp.BatchId) + require.Equal(t, arrowpb.StatusCode_OK, resp.StatusCode) + } + }() + } + wg.Wait() + + assert.NoError(t, cc.Close()) + require.NoError(t, ocr.Shutdown(context.Background())) + + counts := make([]int, itemsPerStream) + + // Two spans per stream/item. + require.Equal(t, itemsPerStream*numStreams*2, sink.SpanCount()) + require.Equal(t, itemsPerStream*numStreams, len(sink.Metadatas())) + + for _, md := range sink.Metadatas() { + val, err := strconv.Atoi(md.Get("seq")[0]) + require.NoError(t, err) + counts[val]++ + } + + for i := 0; i < itemsPerStream; i++ { + require.Equal(t, numStreams, counts[i]) + } +} diff --git a/receiver/otelarrowreceiver/testdata/config.yaml b/receiver/otelarrowreceiver/testdata/config.yaml index 0db443736428..726263f82f9f 100644 --- a/receiver/otelarrowreceiver/testdata/config.yaml +++ b/receiver/otelarrowreceiver/testdata/config.yaml @@ -27,3 +27,5 @@ protocols: permit_without_stream: true arrow: memory_limit_mib: 123 + admission_limit_mib: 80 + waiter_limit: 100 From 69e914151e74435361a4ec0ebe3a01043c96a5b6 Mon Sep 17 00:00:00 2001 From: Michal Pristas Date: Fri, 14 Jun 2024 04:33:05 +0200 Subject: [PATCH 007/138] [pkg/ottl] Added 'Adding new functions/converters' guidelines (#33117) **Description:** As discussed [here](https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/32433#issuecomment-2117931670) we should agree on some guidelines when it comes to creating proposals for new functions and converters cc @TylerHelmuth **Link to tracking Issue:** - **Testing:** - **Documentation:** - --------- Co-authored-by: Evan Bradley <11745660+evan-bradley@users.noreply.github.com> Co-authored-by: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com> Co-authored-by: Daniel Jaglowski --- pkg/ottl/ottlfuncs/README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pkg/ottl/ottlfuncs/README.md b/pkg/ottl/ottlfuncs/README.md index cf39931f489e..993f9d34ce2f 100644 --- a/pkg/ottl/ottlfuncs/README.md +++ b/pkg/ottl/ottlfuncs/README.md @@ -1400,3 +1400,24 @@ Functions should be named and formatted according to the following standards. - Functions that interact with multiple items MUST have plurality in the name. Ex: `truncate_all`, `keep_keys`, `replace_all_matches`. - Functions that interact with a single item MUST NOT have plurality in the name. If a function would interact with multiple items due to a condition, like `where`, it is still considered singular. Ex: `set`, `delete`, `replace_match`. - Functions that change a specific target MUST set the target as the first parameter. + +## Adding New Editors/Converters + +Before raising a PR with a new Editor or Converter, raise an issue to verify its acceptance. While acceptance is strongly specific to a specific use case, consider these guidelines for early assessment. 
+ +Your proposal likely will be accepted if: +- The proposed functionality is missing, +- The proposed solution significantly improves user experience and readability for very common use cases, +- The proposed solution is more performant in cases where it is possible to achieve the same result with existing options. + +It will be up for discussion if your proposal solves an issue that can be achieved in another way but does not improve user experience or performance. + +Your proposal likely won't be accepted if: +- User experience is worse and assumes a highly technical user, +- The performance of your proposal very negatively affects the processing pipeline. + +As with code, OTTL aims for readability first. This means: +- Using short, meaningful, and descriptive names, +- Ensuring naming consistency across Editors and Converters, +- Avoiding deep nesting to achieve desired transformations, +- Ensuring Editors and Converters have a single responsibility. From 01efffd2847aaa4c162103a1ac361b9b5a4516fc Mon Sep 17 00:00:00 2001 From: Florian Bacher Date: Fri, 14 Jun 2024 15:00:47 +0200 Subject: [PATCH 008/138] [pkg/stanza] support gzip compressed log files for file log receiver (#33406) **Description:** This PR adds support for reading gzip compressed log files for the file log receiver. This is done by, if enabled via the `gzip_file_suffix` parameter, creating a `gzip.Reader` on top of the file handle of a compressed file. **Link to tracking Issue:** #2328 **Testing:** Added unit tests for the new functionality. Manually tested using the following configuration for the filelog receiver: ``` filelog: include: [ ./simple.log*.gz ] start_at: beginning gzip_file_suffix: ".gz" operators: - type: regex_parser regex: '^(?P