From e2d5dee2f6c27cb89c69ab8aa90ad0faff222d65 Mon Sep 17 00:00:00 2001
From: Antoine Toulme
Date: Thu, 7 Apr 2022 23:55:43 -0700
Subject: [PATCH 1/5] Add ability to sample logs

---
 CHANGELOG.md                                      |   1 +
 .../probabilisticsamplerprocessor/README.md       |  36 +++-
 .../probabilisticsamplerprocessor/config.go       |  56 ++++++-
 .../config_test.go                                |  23 ++-
 .../probabilisticsamplerprocessor/factory.go      |  13 +-
 .../factory_test.go                               |   8 +
 .../logprobabilisticsampler.go                    |  95 +++++++++++
 .../logprobabilisticsampler_test.go               | 155 ++++++++++++++++++
 .../testdata/config.yaml                          |  29 ++++
 .../testdata/invalid.yaml                         |  24 +++
 10 files changed, 433 insertions(+), 7 deletions(-)
 create mode 100644 processor/probabilisticsamplerprocessor/logprobabilisticsampler.go
 create mode 100644 processor/probabilisticsamplerprocessor/logprobabilisticsampler_test.go
 create mode 100644 processor/probabilisticsamplerprocessor/testdata/invalid.yaml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3f1e29b1ea7c..6fdba5d1e48e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@
 - `prometheusremotewriteexporter`: Translate resource attributes to the target info metric (#8493)
 - `podmanreceiver`: Add API timeout configuration option (#9014)
 - `cmd/mdatagen`: Add `sem_conv_version` field to metadata.yaml that is used to set metrics SchemaURL (#9010)
+- `probabilistic_sampler`: Add ability to sample logs (#9118)
 
 ### 🛑 Breaking changes 🛑
 
diff --git a/processor/probabilisticsamplerprocessor/README.md b/processor/probabilisticsamplerprocessor/README.md
index 31e593a1fb74..8f046af95e7b 100644
--- a/processor/probabilisticsamplerprocessor/README.md
+++ b/processor/probabilisticsamplerprocessor/README.md
@@ -1,8 +1,39 @@
 # Probabilistic Sampling Processor
 
-Supported pipeline types: traces
+Supported pipeline types: traces, logs
 
-The probabilistic sampler supports two types of sampling:
+The probabilistic sampler supports sampling logs by associating sampling rates with log severity levels.
+
+A default sampling rate is mandatory. Additional sampling rates can then be associated with individual log severity levels.
+Any log record whose severity is equal to or higher than a configured severity level adopts that level's sampling rate.
+
+The probabilistic sampler may optionally use a `hash_seed` to compute the hash of a log record.
+This sampler samples based on hash values computed from log records. In order for
+log record hashing to work, all collectors for a given tier (e.g. behind the same load balancer)
+must have the same `hash_seed`. It is also possible to leverage a different `hash_seed` at
+different collector tiers to support additional sampling requirements. Please refer to
+[config.go](./config.go) for the config spec.
+
+The following configuration options can be modified:
+- `hash_seed` (no default): An integer used to compute the hash algorithm. Note that all collectors for a given tier (e.g. behind the same load balancer) should have the same hash_seed.
+- `sampling_percentage` (default = 0): Percentage at which logs are sampled; >= 100 samples all logs
+- `severity/severity_level`: `SeverityText` associated with a [severity level](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/logs/data-model.md#displaying-severity)
+- `severity/sampling_percentage` (default = 0): Percentage at which logs are sampled when the record's severity is equal to or higher than the configured `severity_level`; >= 100 samples all logs
+Examples:
+
+```yaml
+processors:
+  probabilistic_sampler:
+    hash_seed: 22
+    sampling_percentage: 15
+    severity:
+      - severity_level: error
+        sampling_percentage: 100
+      - severity_level: warn
+        sampling_percentage: 75
+```
+
+The probabilistic sampler supports two types of sampling for traces:
 
 1. `sampling.priority` [semantic
 convention](https://github.com/opentracing/specification/blob/master/semantic_conventions.md#span-tags-table)
 as defined by OpenTracing
 2. Trace ID hashing
@@ -31,3 +62,4 @@ processors:
 
 Refer to [config.yaml](./testdata/config.yaml) for detailed
 examples on using the processor.
+
diff --git a/processor/probabilisticsamplerprocessor/config.go b/processor/probabilisticsamplerprocessor/config.go
index d7dd5dada8c0..226818ee8eb1 100644
--- a/processor/probabilisticsamplerprocessor/config.go
+++ b/processor/probabilisticsamplerprocessor/config.go
@@ -15,26 +15,76 @@ package probabilisticsamplerprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/probabilisticsamplerprocessor"
 
 import (
+	"fmt"
+
 	"go.opentelemetry.io/collector/config"
+	"go.opentelemetry.io/collector/model/pdata"
 )
 
-// Config has the configuration guiding the trace sampler processor.
+var severityTextToNum = map[string]pdata.SeverityNumber{
+	"default": pdata.SeverityNumberUNDEFINED,
+	"trace":   pdata.SeverityNumberTRACE,
+	"trace2":  pdata.SeverityNumberTRACE2,
+	"trace3":  pdata.SeverityNumberTRACE3,
+	"trace4":  pdata.SeverityNumberTRACE4,
+	"debug":   pdata.SeverityNumberDEBUG,
+	"debug2":  pdata.SeverityNumberDEBUG2,
+	"debug3":  pdata.SeverityNumberDEBUG3,
+	"debug4":  pdata.SeverityNumberDEBUG4,
+	"info":    pdata.SeverityNumberINFO,
+	"info2":   pdata.SeverityNumberINFO2,
+	"info3":   pdata.SeverityNumberINFO3,
+	"info4":   pdata.SeverityNumberINFO4,
+	"warn":    pdata.SeverityNumberWARN,
+	"warn2":   pdata.SeverityNumberWARN2,
+	"warn3":   pdata.SeverityNumberWARN3,
+	"warn4":   pdata.SeverityNumberWARN4,
+	"error":   pdata.SeverityNumberERROR,
+	"error2":  pdata.SeverityNumberERROR2,
+	"error3":  pdata.SeverityNumberERROR3,
+	"error4":  pdata.SeverityNumberERROR4,
+	"fatal":   pdata.SeverityNumberFATAL,
+	"fatal2":  pdata.SeverityNumberFATAL2,
+	"fatal3":  pdata.SeverityNumberFATAL3,
+	"fatal4":  pdata.SeverityNumberFATAL4,
+}
+
+type severityPair struct {
+	Level              string  `mapstructure:"severity_level"`
+	SamplingPercentage float32 `mapstructure:"sampling_percentage"`
+}
+
+// Config has the configuration guiding the sampler processor.
 type Config struct {
 	config.ProcessorSettings `mapstructure:",squash"` // squash ensures fields are correctly decoded in embedded struct
 
-	// SamplingPercentage is the percentage rate at which traces are going to be sampled. Defaults to zero, i.e.: no sample.
-	// Values greater or equal 100 are treated as "sample all traces".
+	// SamplingPercentage is the percentage rate at which traces or logs are going to be sampled. Defaults to zero, i.e.: no sample.
+	// Values greater or equal 100 are treated as "sample all traces/logs".
 	SamplingPercentage float32 `mapstructure:"sampling_percentage"`
 
 	// HashSeed allows one to configure the hashing seed. This is important in scenarios where multiple layers of collectors
 	// have different sampling rates: if they use the same seed all passing one layer may pass the other even if they have
 	// different sampling rates, configuring different seeds avoids that.
 	HashSeed uint32 `mapstructure:"hash_seed"`
+
+	// Severity is an array of severity and sampling percentage pairs allocating a specific sampling percentage
+	// to a given severity level.
+	Severity []severityPair `mapstructure:"severity"`
 }
 
 var _ config.Processor = (*Config)(nil)
 
 // Validate checks if the processor configuration is valid
 func (cfg *Config) Validate() error {
+	keys := map[string]bool{}
+	for _, pair := range cfg.Severity {
+		if _, ok := severityTextToNum[pair.Level]; !ok {
+			return fmt.Errorf("unrecognized severity level: %s", pair.Level)
+		}
+		if keys[pair.Level] {
+			return fmt.Errorf("severity already used: %s", pair.Level)
+		}
+		keys[pair.Level] = true
+	}
 	return nil
 }
diff --git a/processor/probabilisticsamplerprocessor/config_test.go b/processor/probabilisticsamplerprocessor/config_test.go
index 09e7eeedf934..96f7c4a1344d 100644
--- a/processor/probabilisticsamplerprocessor/config_test.go
+++ b/processor/probabilisticsamplerprocessor/config_test.go
@@ -42,7 +42,17 @@ func TestLoadConfig(t *testing.T) {
 			SamplingPercentage: 15.3,
 			HashSeed:           22,
 		})
-
+	p1 := cfg.Processors[config.NewComponentIDWithName(typeStr, "logs")]
+	assert.Equal(t,
+		&Config{
+			ProcessorSettings:  config.NewProcessorSettings(config.NewComponentIDWithName(typeStr, "logs")),
+			SamplingPercentage: 15.3,
+			HashSeed:           22,
+			Severity: []severityPair{
+				{Level: "error", SamplingPercentage: 100},
+				{Level: "warn", SamplingPercentage: 80},
+			},
+		}, p1)
 }
 
 func TestLoadConfigEmpty(t *testing.T) {
@@ -59,3 +69,14 @@ func TestLoadConfigEmpty(t *testing.T) {
 	p0 := cfg.Processors[config.NewComponentID(typeStr)]
 	assert.Equal(t, p0, createDefaultConfig())
 }
+
+func TestLoadInvalidConfig(t *testing.T) {
+	factories, err := componenttest.NopFactories()
+	require.NoError(t, err)
+
+	factory := NewFactory()
+	factories.Processors[typeStr] = factory
+
+	_, err = servicetest.LoadConfigAndValidate(filepath.Join("testdata", "invalid.yaml"), factories)
+	require.ErrorContains(t, err, "severity already used: error")
+}
diff --git a/processor/probabilisticsamplerprocessor/factory.go b/processor/probabilisticsamplerprocessor/factory.go
index 71f16f192c03..2109739e18ec 100644
--- a/processor/probabilisticsamplerprocessor/factory.go
+++ b/processor/probabilisticsamplerprocessor/factory.go
@@ -32,7 +32,8 @@ func NewFactory() component.ProcessorFactory {
 	return component.NewProcessorFactory(
 		typeStr,
 		createDefaultConfig,
-		component.WithTracesProcessor(createTracesProcessor))
+		component.WithTracesProcessor(createTracesProcessor),
+		component.WithLogsProcessor(createLogsProcessor))
 }
 
 func createDefaultConfig() config.Processor {
@@ -50,3 +51,13 @@ func createTracesProcessor(
 ) (component.TracesProcessor, error) {
 	return newTracesProcessor(nextConsumer, cfg.(*Config))
 }
+
+// createLogsProcessor creates a log processor based on this config.
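+// The same Config drives both the traces and the logs processors.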
+func createLogsProcessor( + _ context.Context, + _ component.ProcessorCreateSettings, + cfg config.Processor, + nextConsumer consumer.Logs, +) (component.LogsProcessor, error) { + return newLogsProcessor(nextConsumer, cfg.(*Config)) +} diff --git a/processor/probabilisticsamplerprocessor/factory_test.go b/processor/probabilisticsamplerprocessor/factory_test.go index e5e0a9f154f0..72fb15e454d8 100644 --- a/processor/probabilisticsamplerprocessor/factory_test.go +++ b/processor/probabilisticsamplerprocessor/factory_test.go @@ -37,3 +37,11 @@ func TestCreateProcessor(t *testing.T) { assert.NotNil(t, tp) assert.NoError(t, err, "cannot create trace processor") } + +func TestCreateProcessorLogs(t *testing.T) { + cfg := createDefaultConfig() + set := componenttest.NewNopProcessorCreateSettings() + tp, err := createLogsProcessor(context.Background(), set, cfg, consumertest.NewNop()) + assert.NotNil(t, tp) + assert.NoError(t, err, "cannot create logs processor") +} diff --git a/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go b/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go new file mode 100644 index 000000000000..1fcb9a5e546c --- /dev/null +++ b/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go @@ -0,0 +1,95 @@ +// Copyright The OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package probabilisticsamplerprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/probabilisticsamplerprocessor" + +import ( + "context" + "sort" + + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/consumer" + "go.opentelemetry.io/collector/model/pdata" + "go.opentelemetry.io/collector/processor/processorhelper" +) + +type severitySamplingRate struct { + level pdata.SeverityNumber + scaledSamplingRate uint32 +} + +type logsamplerprocessor struct { + samplingRates []*severitySamplingRate + hashSeed uint32 +} + +// newLogsProcessor returns a processor.LogsProcessor that will perform head sampling according to the given +// configuration. 
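+// Severity-specific rates are kept sorted by ascending severity number so that
+// processing selects the highest configured level at or below a record's severity.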
+func newLogsProcessor(nextConsumer consumer.Logs, cfg *Config) (component.LogsProcessor, error) { + + severitySamplingRates := []*severitySamplingRate{ + {level: pdata.SeverityNumberUNDEFINED, scaledSamplingRate: uint32(cfg.SamplingPercentage * percentageScaleFactor)}, + } + sort.SliceStable(cfg.Severity, func(i, j int) bool { + return severityTextToNum[cfg.Severity[i].Level] < severityTextToNum[cfg.Severity[j].Level] + }) + for _, pair := range cfg.Severity { + newRate := &severitySamplingRate{level: severityTextToNum[pair.Level], + scaledSamplingRate: uint32(pair.SamplingPercentage * percentageScaleFactor), + } + severitySamplingRates = append(severitySamplingRates, newRate) + } + + lsp := &logsamplerprocessor{ + samplingRates: severitySamplingRates, + hashSeed: cfg.HashSeed, + } + + return processorhelper.NewLogsProcessor( + cfg, + nextConsumer, + lsp.processLogs, + processorhelper.WithCapabilities(consumer.Capabilities{MutatesData: true})) +} + +func (lsp *logsamplerprocessor) processLogs(_ context.Context, ld pdata.Logs) (pdata.Logs, error) { + ld.ResourceLogs().RemoveIf(func(rl pdata.ResourceLogs) bool { + rl.ScopeLogs().RemoveIf(func(ill pdata.ScopeLogs) bool { + ill.LogRecords().RemoveIf(func(l pdata.LogRecord) bool { + + // find the correct severity sampling level. + var selectedSamplingRate *severitySamplingRate + for _, ssr := range lsp.samplingRates { + if ssr.level > l.SeverityNumber() { + break + } + selectedSamplingRate = ssr + } + + // Create an id for the log record by combining the timestamp and severity text. + lidBytes := []byte(l.Timestamp().String() + l.SeverityText()) + sampled := hash(lidBytes[:], lsp.hashSeed)&bitMaskHashBuckets < selectedSamplingRate.scaledSamplingRate + return !sampled + }) + // Filter out empty ScopeLogs + return ill.LogRecords().Len() == 0 + }) + // Filter out empty ResourceLogs + return rl.ScopeLogs().Len() == 0 + }) + if ld.ResourceLogs().Len() == 0 { + return ld, processorhelper.ErrSkipProcessingData + } + return ld, nil +} diff --git a/processor/probabilisticsamplerprocessor/logprobabilisticsampler_test.go b/processor/probabilisticsamplerprocessor/logprobabilisticsampler_test.go new file mode 100644 index 000000000000..2c340560bbd2 --- /dev/null +++ b/processor/probabilisticsamplerprocessor/logprobabilisticsampler_test.go @@ -0,0 +1,155 @@ +// Copyright The OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package probabilisticsamplerprocessor + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/config" + "go.opentelemetry.io/collector/consumer" + "go.opentelemetry.io/collector/consumer/consumertest" + "go.opentelemetry.io/collector/model/pdata" +) + +func TestNewLogsProcessor(t *testing.T) { + tests := []struct { + name string + nextConsumer consumer.Logs + cfg *Config + wantErr bool + }{ + { + name: "nil_nextConsumer", + cfg: &Config{ + ProcessorSettings: config.NewProcessorSettings(config.NewComponentID(typeStr)), + SamplingPercentage: 15.5, + }, + wantErr: true, + }, + { + name: "happy_path", + nextConsumer: consumertest.NewNop(), + cfg: &Config{ + ProcessorSettings: config.NewProcessorSettings(config.NewComponentID(typeStr)), + SamplingPercentage: 15.5, + }, + }, + { + name: "happy_path_hash_seed", + nextConsumer: consumertest.NewNop(), + cfg: &Config{ + ProcessorSettings: config.NewProcessorSettings(config.NewComponentID(typeStr)), + SamplingPercentage: 13.33, + HashSeed: 4321, + }, + }, + { + name: "with_severity", + nextConsumer: consumertest.NewNop(), + cfg: &Config{ + ProcessorSettings: config.NewProcessorSettings(config.NewComponentID(typeStr)), + SamplingPercentage: 13.33, + HashSeed: 4321, + Severity: []severityPair{ + {"error", 90}, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := newLogsProcessor(tt.nextConsumer, tt.cfg) + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.NotNil(t, got) + } + }) + } +} + +func TestLogsSampling(t *testing.T) { + tests := []struct { + name string + cfg *Config + received int + }{ + { + name: "happy_path", + cfg: &Config{ + ProcessorSettings: config.NewProcessorSettings(config.NewComponentID(typeStr)), + SamplingPercentage: 100, + }, + received: 2, + }, + { + name: "nothing", + cfg: &Config{ + ProcessorSettings: config.NewProcessorSettings(config.NewComponentID(typeStr)), + SamplingPercentage: 0, + }, + received: 0, + }, + { + name: "half", + cfg: &Config{ + ProcessorSettings: config.NewProcessorSettings(config.NewComponentID(typeStr)), + SamplingPercentage: 49, + }, + received: 1, + }, + { + name: "nothing_except_errors", + cfg: &Config{ + ProcessorSettings: config.NewProcessorSettings(config.NewComponentID(typeStr)), + SamplingPercentage: 0, + Severity: []severityPair{ + {"error", 100}, + }, + }, + received: 1, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sink := new(consumertest.LogsSink) + processor, err := newLogsProcessor(sink, tt.cfg) + require.NoError(t, err) + logs := pdata.NewLogs() + lr := logs.ResourceLogs().AppendEmpty().ScopeLogs().AppendEmpty().LogRecords() + for i := 0; i < 5; i++ { + first := lr.AppendEmpty() + first.SetTimestamp(pdata.Timestamp(time.Unix(1649400860, 0).Unix())) + first.SetSeverityNumber(pdata.SeverityNumberDEBUG) + second := lr.AppendEmpty() + second.SetTimestamp(pdata.Timestamp(time.Unix(12345555432, 0).Unix())) + second.SetSeverityNumber(pdata.SeverityNumberERROR) + } + err = processor.ConsumeLogs(context.Background(), logs) + require.NoError(t, err) + sunk := sink.AllLogs() + numReceived := 0 + if len(sunk) > 0 && sunk[0].ResourceLogs().Len() > 0 { + numReceived = sunk[0].ResourceLogs().At(0).ScopeLogs().At(0).LogRecords().Len() + } + assert.Equal(t, tt.received*5, numReceived) + }) + } +} diff --git a/processor/probabilisticsamplerprocessor/testdata/config.yaml 
b/processor/probabilisticsamplerprocessor/testdata/config.yaml index a421c8258bbf..5e8f19233d55 100644 --- a/processor/probabilisticsamplerprocessor/testdata/config.yaml +++ b/processor/probabilisticsamplerprocessor/testdata/config.yaml @@ -24,6 +24,31 @@ processors: # intended. hash_seed: 22 + probabilistic_sampler/logs: + # the percentage rate at which logs are going to be sampled. Defaults to + # zero, i.e.: no sample. Values greater or equal 100 are treated as + # "sample all logs". + sampling_percentage: 15.3 + # hash_seed allows one to configure the hashing seed. This is important in + # scenarios where multiple layers of collectors are used to achieve the + # desired sampling rate, eg.: 10% on first layer and 10% on the + # second, resulting in an overall sampling rate of 1% (10% x 10%). + # If all layers use the same seed, all data passing one layer will also pass + # the next one, independent of the configured sampling rate. Having different + # seeds at different layers ensures that sampling rate in each layer work as + # intended. + hash_seed: 22 + # Severity filters override the default sampling_percentage sampling rate. + severity: + # Override the default sampling_percentage for all log records of severity of error or higher + # to keep all logs. + - sampling_percentage: 100 + severity_level: error + # Override the default sampling_percentage for all log records of severity of warn or higher + # to keep 80% of logs. Note this doesn't override the error logs sampling rate. + - sampling_percentage: 80 + severity_level: warn + exporters: nop: @@ -33,3 +58,7 @@ service: receivers: [nop] processors: [probabilistic_sampler] exporters: [nop] + logs: + receivers: [ nop ] + processors: [ probabilistic_sampler/logs ] + exporters: [ nop ] diff --git a/processor/probabilisticsamplerprocessor/testdata/invalid.yaml b/processor/probabilisticsamplerprocessor/testdata/invalid.yaml new file mode 100644 index 000000000000..f252ae919cdb --- /dev/null +++ b/processor/probabilisticsamplerprocessor/testdata/invalid.yaml @@ -0,0 +1,24 @@ +receivers: + nop: + +processors: + + probabilistic_sampler/logs: + sampling_percentage: 15.3 + hash_seed: 22 + severity: + - sampling_percentage: 100 + severity_level: error + # Duplicate severity level sampling rate! 
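+      # Validation is expected to fail with "severity already used: error".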
+      - sampling_percentage: 80
+        severity_level: error
+
+exporters:
+  nop:
+
+service:
+  pipelines:
+    logs:
+      receivers: [ nop ]
+      processors: [ probabilistic_sampler/logs ]
+      exporters: [ nop ]

From e64b06aaa097c0a61d237e724e7d8df69941c329 Mon Sep 17 00:00:00 2001
From: Antoine Toulme
Date: Fri, 8 Apr 2022 16:33:29 -0700
Subject: [PATCH 2/5] code review: validator for negative rates

---
 processor/probabilisticsamplerprocessor/config.go |  6 ++++++
 .../probabilisticsamplerprocessor/config_test.go  | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/processor/probabilisticsamplerprocessor/config.go b/processor/probabilisticsamplerprocessor/config.go
index 226818ee8eb1..845d705d6b95 100644
--- a/processor/probabilisticsamplerprocessor/config.go
+++ b/processor/probabilisticsamplerprocessor/config.go
@@ -76,11 +76,17 @@ var _ config.Processor = (*Config)(nil)
 
 // Validate checks if the processor configuration is valid
 func (cfg *Config) Validate() error {
+	if cfg.SamplingPercentage < 0 {
+		return fmt.Errorf("negative sampling rate: %.2f", cfg.SamplingPercentage)
+	}
 	keys := map[string]bool{}
 	for _, pair := range cfg.Severity {
 		if _, ok := severityTextToNum[pair.Level]; !ok {
 			return fmt.Errorf("unrecognized severity level: %s", pair.Level)
 		}
+		if pair.SamplingPercentage < 0 {
+			return fmt.Errorf("negative sampling rate: %.2f [%s]", pair.SamplingPercentage, pair.Level)
+		}
 		if keys[pair.Level] {
 			return fmt.Errorf("severity already used: %s", pair.Level)
 		}
diff --git a/processor/probabilisticsamplerprocessor/config_test.go b/processor/probabilisticsamplerprocessor/config_test.go
index 96f7c4a1344d..0300b3a7e37a 100644
--- a/processor/probabilisticsamplerprocessor/config_test.go
+++ b/processor/probabilisticsamplerprocessor/config_test.go
@@ -80,3 +80,17 @@ func TestLoadInvalidConfig(t *testing.T) {
 	_, err = servicetest.LoadConfigAndValidate(filepath.Join("testdata", "invalid.yaml"), factories)
 	require.ErrorContains(t, err, "severity already used: error")
 }
+
+func TestNegativeSamplingRate(t *testing.T) {
+	cfg := createDefaultConfig()
+	cfg.(*Config).SamplingPercentage = -5
+	err := cfg.Validate()
+	require.ErrorContains(t, err, "negative sampling rate: -5.00")
+
+	cfg = createDefaultConfig()
+	cfg.(*Config).Severity = []severityPair{
+		{Level: "error", SamplingPercentage: -4.344},
+	}
+	err = cfg.Validate()
+	require.ErrorContains(t, err, "negative sampling rate: -4.34 [error]")
+}

From fefae2bb3afb306c28ba1a867bcdbad133daae26 Mon Sep 17 00:00:00 2001
From: Antoine Toulme
Date: Sat, 9 Apr 2022 15:38:14 -0700
Subject: [PATCH 3/5] Redo with a different approach, using attributes

---
 .../probabilisticsamplerprocessor/README.md   | 70 +++++++------
 .../probabilisticsamplerprocessor/config.go   | 59 ++---------
 .../config_test.go                            | 20 +---
 .../logprobabilisticsampler.go                | 58 +++++------
 .../logprobabilisticsampler_test.go           | 97 +++++++++++++------
 .../testdata/config.yaml                      | 16 ++-
 .../testdata/invalid.yaml                     |  8 +-
 7 files changed, 153 insertions(+), 175 deletions(-)

diff --git a/processor/probabilisticsamplerprocessor/README.md b/processor/probabilisticsamplerprocessor/README.md
index 8f046af95e7b..d84490d5d8b8 100644
--- a/processor/probabilisticsamplerprocessor/README.md
+++ b/processor/probabilisticsamplerprocessor/README.md
@@ -2,64 +2,78 @@
 
 Supported pipeline types: traces, logs
 
-The probabilistic sampler supports sampling logs by associating sampling rates with log severity levels.
+The probabilistic sampler supports two types of sampling for traces:
 
-A default sampling rate is mandatory. Additional sampling rates can then be associated with individual log severity levels.
-Any log record whose severity is equal to or higher than a configured severity level adopts that level's sampling rate.
+1. `sampling.priority` [semantic
+convention](https://github.com/opentracing/specification/blob/master/semantic_conventions.md#span-tags-table)
+as defined by OpenTracing
+2. Trace ID hashing
 
-The probabilistic sampler may optionally use a `hash_seed` to compute the hash of a log record.
-This sampler samples based on hash values computed from log records. In order for
-log record hashing to work, all collectors for a given tier (e.g. behind the same load balancer)
+The `sampling.priority` semantic convention takes priority over trace ID hashing. As the name
+implies, trace ID hashing samples based on hash values determined by trace IDs. In order for
+trace ID hashing to work, all collectors for a given tier (e.g. behind the same load balancer)
 must have the same `hash_seed`. It is also possible to leverage a different `hash_seed` at
 different collector tiers to support additional sampling requirements. Please refer to
 [config.go](./config.go) for the config spec.
 
 The following configuration options can be modified:
 - `hash_seed` (no default): An integer used to compute the hash algorithm. Note that all collectors for a given tier (e.g. behind the same load balancer) should have the same hash_seed.
-- `sampling_percentage` (default = 0): Percentage at which logs are sampled; >= 100 samples all logs
-- `severity/severity_level`: `SeverityText` associated with a [severity level](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/logs/data-model.md#displaying-severity)
-- `severity/sampling_percentage` (default = 0): Percentage at which logs are sampled when the record's severity is equal to or higher than the configured `severity_level`; >= 100 samples all logs
+- `sampling_percentage` (default = 0): Percentage at which traces are sampled; >= 100 samples all traces
+
 Examples:
 
 ```yaml
 processors:
   probabilistic_sampler:
     hash_seed: 22
-    sampling_percentage: 15
-    severity:
-      - severity_level: error
-        sampling_percentage: 100
-      - severity_level: warn
-        sampling_percentage: 75
+    sampling_percentage: 15.3
 ```
 
+The probabilistic sampler supports sampling logs according to their trace ID, or by a specific log record attribute.
+
+The probabilistic sampler may optionally use a `hash_seed` to compute the hash of a log record.
+This sampler samples based on hash values computed from log records. In order for
+log record hashing to work, all collectors for a given tier (e.g. behind the same load balancer)
+must have the same `hash_seed`. It is also possible to leverage a different `hash_seed` at
+different collector tiers to support additional sampling requirements. Please refer to
+[config.go](./config.go) for the config spec.
+
+The following configuration options can be modified:
+- `hash_seed` (no default, optional): An integer used to compute the hash algorithm. Note that all collectors for a given tier (e.g. behind the same load balancer) should have the same hash_seed.
+- `sampling_percentage` (default = 0, required): Percentage at which logs are sampled; >= 100 samples all logs
+- `trace_id_sampling` (default = true, optional): Whether to use the log record's trace ID when sampling the log record.
+- `sampling_source` (default = null, optional): The name of a log record attribute used for sampling purposes, such as a unique log record ID. The attribute value is only used if the trace ID is absent or if `trace_id_sampling` is set to `false`.
+- `sampling_priority` (default = null, optional): The name of a log record attribute used to set a different sampling priority from the `sampling_percentage` setting. 0 means to never sample the log record, and >= 100 means to always sample the log record.
+
+Examples:
+
+Sample 15% of the logs:
+
+```yaml
+processors:
+  probabilistic_sampler:
+    sampling_percentage: 15
+```
+
+Sample logs according to their `logID` attribute:
+
+```yaml
+processors:
+  probabilistic_sampler:
+    sampling_percentage: 15
+    trace_id_sampling: false
+    sampling_source: logID
+```
+
+Sample logs according to the attribute `priority`:
+
+```yaml
+processors:
+  probabilistic_sampler:
+    sampling_percentage: 15
+    sampling_priority: priority
+```
+
 
 Refer to [config.yaml](./testdata/config.yaml) for detailed
 examples on using the processor.
diff --git a/processor/probabilisticsamplerprocessor/config.go b/processor/probabilisticsamplerprocessor/config.go
index 845d705d6b95..3e5d586361bc 100644
--- a/processor/probabilisticsamplerprocessor/config.go
+++ b/processor/probabilisticsamplerprocessor/config.go
@@ -18,42 +18,8 @@ import (
 	"fmt"
 
 	"go.opentelemetry.io/collector/config"
-	"go.opentelemetry.io/collector/model/pdata"
 )
 
-var severityTextToNum = map[string]pdata.SeverityNumber{
-	"default": pdata.SeverityNumberUNDEFINED,
-	"trace":   pdata.SeverityNumberTRACE,
-	"trace2":  pdata.SeverityNumberTRACE2,
-	"trace3":  pdata.SeverityNumberTRACE3,
-	"trace4":  pdata.SeverityNumberTRACE4,
-	"debug":   pdata.SeverityNumberDEBUG,
-	"debug2":  pdata.SeverityNumberDEBUG2,
-	"debug3":  pdata.SeverityNumberDEBUG3,
-	"debug4":  pdata.SeverityNumberDEBUG4,
-	"info":    pdata.SeverityNumberINFO,
-	"info2":   pdata.SeverityNumberINFO2,
-	"info3":   pdata.SeverityNumberINFO3,
-	"info4":   pdata.SeverityNumberINFO4,
-	"warn":    pdata.SeverityNumberWARN,
-	"warn2":   pdata.SeverityNumberWARN2,
-	"warn3":   pdata.SeverityNumberWARN3,
-	"warn4":   pdata.SeverityNumberWARN4,
-	"error":   pdata.SeverityNumberERROR,
-	"error2":  pdata.SeverityNumberERROR2,
-	"error3":  pdata.SeverityNumberERROR3,
-	"error4":  pdata.SeverityNumberERROR4,
-	"fatal":   pdata.SeverityNumberFATAL,
-	"fatal2":  pdata.SeverityNumberFATAL2,
-	"fatal3":  pdata.SeverityNumberFATAL3,
-	"fatal4":  pdata.SeverityNumberFATAL4,
-}
-
-type severityPair struct {
-	Level              string  `mapstructure:"severity_level"`
-	SamplingPercentage float32 `mapstructure:"sampling_percentage"`
-}
-
 // Config has the configuration guiding the sampler processor.
 type Config struct {
 	config.ProcessorSettings `mapstructure:",squash"` // squash ensures fields are correctly decoded in embedded struct
 
@@ -67,9 +33,15 @@ type Config struct {
 	// different sampling rates, configuring different seeds avoids that.
 	HashSeed uint32 `mapstructure:"hash_seed"`
 
-	// Severity is an array of severity and sampling percentage pairs allocating a specific sampling percentage
-	// to a given severity level.
-	Severity []severityPair `mapstructure:"severity"`
+	// TraceIDEnabled (logs only) controls whether the log record's trace ID is used to sample the record,
+	// rather than a specific log record attribute. By default, this option is true.
+	TraceIDEnabled *bool `mapstructure:"trace_id_sampling"`
+	// SamplingSource (logs only) allows using a log record attribute, designated by the `sampling_source` key,
+	// to compute the sampling hash of the log record instead of the trace ID, if the trace ID is absent or trace ID sampling is disabled.
+	SamplingSource string `mapstructure:"sampling_source"`
+	// SamplingPriority (logs only) allows using a log record attribute, designated by the `sampling_priority` key,
+	// as the sampling priority of the log record.
+	SamplingPriority string `mapstructure:"sampling_priority"`
 }
 
 var _ config.Processor = (*Config)(nil)
@@ -79,18 +51,5 @@ func (cfg *Config) Validate() error {
 	if cfg.SamplingPercentage < 0 {
 		return fmt.Errorf("negative sampling rate: %.2f", cfg.SamplingPercentage)
 	}
-	keys := map[string]bool{}
-	for _, pair := range cfg.Severity {
-		if _, ok := severityTextToNum[pair.Level]; !ok {
-			return fmt.Errorf("unrecognized severity level: %s", pair.Level)
-		}
-		if pair.SamplingPercentage < 0 {
-			return fmt.Errorf("negative sampling rate: %.2f [%s]", pair.SamplingPercentage, pair.Level)
-		}
-		if keys[pair.Level] {
-			return fmt.Errorf("severity already used: %s", pair.Level)
-		}
-		keys[pair.Level] = true
-	}
 	return nil
 }
diff --git a/processor/probabilisticsamplerprocessor/config_test.go b/processor/probabilisticsamplerprocessor/config_test.go
index 0300b3a7e37a..1727a6d536ac 100644
--- a/processor/probabilisticsamplerprocessor/config_test.go
+++ b/processor/probabilisticsamplerprocessor/config_test.go
@@ -48,10 +48,6 @@ func TestLoadConfig(t *testing.T) {
 			ProcessorSettings:  config.NewProcessorSettings(config.NewComponentIDWithName(typeStr, "logs")),
 			SamplingPercentage: 15.3,
 			HashSeed:           22,
-			Severity: []severityPair{
-				{Level: "error", SamplingPercentage: 100},
-				{Level: "warn", SamplingPercentage: 80},
-			},
 		}, p1)
 }
 
@@ -78,19 +74,5 @@ func TestLoadInvalidConfig(t *testing.T) {
 	factories.Processors[typeStr] = factory
 
 	_, err = servicetest.LoadConfigAndValidate(filepath.Join("testdata", "invalid.yaml"), factories)
-	require.ErrorContains(t, err, "severity already used: error")
-}
-
-func TestNegativeSamplingRate(t *testing.T) {
-	cfg := createDefaultConfig()
-	cfg.(*Config).SamplingPercentage = -5
-	err := cfg.Validate()
-	require.ErrorContains(t, err, "negative sampling rate: -5.00")
-
-	cfg = createDefaultConfig()
-	cfg.(*Config).Severity = []severityPair{
-		{Level: "error", SamplingPercentage: -4.344},
-	}
-	err = cfg.Validate()
-	require.ErrorContains(t, err, "negative sampling rate: -4.34 [error]")
+	require.ErrorContains(t, err, "negative sampling rate: -15.30")
 }
diff --git a/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go b/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go
index 1fcb9a5e546c..b4f609c84232 100644
--- a/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go
+++ b/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go
@@ -16,7 +16,6 @@ package probabilisticsamplerprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/probabilisticsamplerprocessor"
 
 import (
 	"context"
-	"sort"
 
 	"go.opentelemetry.io/collector/component"
 	"go.opentelemetry.io/collector/consumer"
@@ -24,36 +23,24 @@ import (
 	"go.opentelemetry.io/collector/processor/processorhelper"
 )
 
-type severitySamplingRate struct {
-	level              pdata.SeverityNumber
-	scaledSamplingRate uint32
-}
-
 type logsamplerprocessor struct {
-	samplingRates []*severitySamplingRate
-	hashSeed      uint32
+	scaledSamplingRate uint32
+	hashSeed           uint32
+	traceIdEnabled     bool
+	samplingSource     string
+	samplingPriority   string
 }
 
 // newLogsProcessor returns a processor.LogsProcessor that will perform head sampling according to the given
 // configuration.
-// Severity-specific rates are kept sorted by ascending severity number so that
-// processing selects the highest configured level at or below a record's severity.
 func newLogsProcessor(nextConsumer consumer.Logs, cfg *Config) (component.LogsProcessor, error) {
 
-	severitySamplingRates := []*severitySamplingRate{
-		{level: pdata.SeverityNumberUNDEFINED, scaledSamplingRate: uint32(cfg.SamplingPercentage * percentageScaleFactor)},
-	}
-	sort.SliceStable(cfg.Severity, func(i, j int) bool {
-		return severityTextToNum[cfg.Severity[i].Level] < severityTextToNum[cfg.Severity[j].Level]
-	})
-	for _, pair := range cfg.Severity {
-		newRate := &severitySamplingRate{level: severityTextToNum[pair.Level],
-			scaledSamplingRate: uint32(pair.SamplingPercentage * percentageScaleFactor),
-		}
-		severitySamplingRates = append(severitySamplingRates, newRate)
-	}
-
 	lsp := &logsamplerprocessor{
-		samplingRates: severitySamplingRates,
-		hashSeed:      cfg.HashSeed,
+		scaledSamplingRate: uint32(cfg.SamplingPercentage * percentageScaleFactor),
+		hashSeed:           cfg.HashSeed,
+		traceIdEnabled:     cfg.TraceIDEnabled == nil || *cfg.TraceIDEnabled,
+		samplingPriority:   cfg.SamplingPriority,
+		samplingSource:     cfg.SamplingSource,
 	}
 
 	return processorhelper.NewLogsProcessor(
@@ -68,18 +55,25 @@ func (lsp *logsamplerprocessor) processLogs(_ context.Context, ld pdata.Logs) (pdata.Logs, error) {
 		rl.ScopeLogs().RemoveIf(func(ill pdata.ScopeLogs) bool {
 			ill.LogRecords().RemoveIf(func(l pdata.LogRecord) bool {
 
-				// find the correct severity sampling level.
-				var selectedSamplingRate *severitySamplingRate
-				for _, ssr := range lsp.samplingRates {
-					if ssr.level > l.SeverityNumber() {
-						break
+				// pick the sampling source.
+				var lidBytes []byte
+				if lsp.traceIdEnabled && !l.TraceID().IsEmpty() {
+					value := l.TraceID().Bytes()
+					lidBytes = value[:]
+				}
+				if lidBytes == nil && lsp.samplingSource != "" {
+					if value, ok := l.Attributes().Get(lsp.samplingSource); ok {
+						lidBytes = value.BytesVal()
+					}
+				}
+				priority := lsp.scaledSamplingRate
+				if lsp.samplingPriority != "" {
+					if localPriority, ok := l.Attributes().Get(lsp.samplingPriority); ok {
+						priority = uint32(localPriority.DoubleVal() * percentageScaleFactor)
 					}
-					selectedSamplingRate = ssr
 				}
 
-				// Create an id for the log record by combining the timestamp and severity text.
-				lidBytes := []byte(l.Timestamp().String() + l.SeverityText())
-				sampled := hash(lidBytes[:], lsp.hashSeed)&bitMaskHashBuckets < selectedSamplingRate.scaledSamplingRate
+				sampled := hash(lidBytes, lsp.hashSeed)&bitMaskHashBuckets < priority
 				return !sampled
 			})
 			// Filter out empty ScopeLogs
diff --git a/processor/probabilisticsamplerprocessor/logprobabilisticsampler_test.go b/processor/probabilisticsamplerprocessor/logprobabilisticsampler_test.go
index 2c340560bbd2..e99cb70d2f77 100644
--- a/processor/probabilisticsamplerprocessor/logprobabilisticsampler_test.go
+++ b/processor/probabilisticsamplerprocessor/logprobabilisticsampler_test.go
@@ -59,18 +59,6 @@ func TestNewLogsProcessor(t *testing.T) {
 				HashSeed:           4321,
 			},
 		},
-		{
-			name:         "with_severity",
-			nextConsumer: consumertest.NewNop(),
-			cfg: &Config{
-				ProcessorSettings:  config.NewProcessorSettings(config.NewComponentID(typeStr)),
-				SamplingPercentage: 13.33,
-				HashSeed:           4321,
-				Severity: []severityPair{
-					{"error", 90},
-				},
-			},
-		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -97,7 +85,7 @@ func TestLogsSampling(t *testing.T) {
 				ProcessorSettings:  config.NewProcessorSettings(config.NewComponentID(typeStr)),
 				SamplingPercentage: 100,
 			},
-			received: 2,
+			received: 100,
 		},
 		{
 			name: "nothing",
@@ -108,23 +96,62 @@ func TestLogsSampling(t *testing.T) {
 			received: 0,
 		},
 		{
-			name: "half",
+			name: "roughly half",
+			cfg: &Config{
+				ProcessorSettings:  config.NewProcessorSettings(config.NewComponentID(typeStr)),
+				SamplingPercentage: 50,
+			},
+			received: 52,
+		},
+		{
+			name: "sampling_source no sampling",
+			cfg: &Config{
+				ProcessorSettings:  config.NewProcessorSettings(config.NewComponentID(typeStr)),
+				SamplingPercentage: 0,
+				TraceIDEnabled:     boolPtr(false),
+				SamplingSource:     "foo",
+			},
+			received: 0,
+		},
+		{
+			name: "sampling_source all sampling",
+			cfg: &Config{
+				ProcessorSettings:  config.NewProcessorSettings(config.NewComponentID(typeStr)),
+				SamplingPercentage: 100,
+				TraceIDEnabled:     boolPtr(false),
+				SamplingSource:     "foo",
+			},
+			received: 100,
+		},
+		{
+			name: "sampling_source sampling",
 			cfg: &Config{
 				ProcessorSettings:  config.NewProcessorSettings(config.NewComponentID(typeStr)),
-				SamplingPercentage: 49,
+				SamplingPercentage: 50,
+				TraceIDEnabled:     boolPtr(false),
+				SamplingSource:     "foo",
 			},
-			received: 1,
+			received: 79,
 		},
 		{
-			name: "nothing_except_errors",
+			name: "sampling_priority",
 			cfg: &Config{
 				ProcessorSettings:  config.NewProcessorSettings(config.NewComponentID(typeStr)),
 				SamplingPercentage: 0,
-				Severity: []severityPair{
-					{"error", 100},
-				},
+				SamplingPriority:   "priority",
 			},
-			received: 1,
+			received: 25,
+		},
+		{
+			name: "sampling_priority with sampling field",
+			cfg: &Config{
+				ProcessorSettings:  config.NewProcessorSettings(config.NewComponentID(typeStr)),
+				SamplingPercentage: 0,
+				TraceIDEnabled:     boolPtr(false),
+				SamplingSource:     "foo",
+				SamplingPriority:   "priority",
+			},
+			received: 25,
 		},
 	}
 	for _, tt := range tests {
@@ -134,13 +161,21 @@ func TestLogsSampling(t *testing.T) {
 			require.NoError(t, err)
 			logs := pdata.NewLogs()
 			lr := logs.ResourceLogs().AppendEmpty().ScopeLogs().AppendEmpty().LogRecords()
-			for i := 0; i < 5; i++ {
-				first := lr.AppendEmpty()
-				first.SetTimestamp(pdata.Timestamp(time.Unix(1649400860, 0).Unix()))
-				first.SetSeverityNumber(pdata.SeverityNumberDEBUG)
-				second := lr.AppendEmpty()
-				second.SetTimestamp(pdata.Timestamp(time.Unix(12345555432, 0).Unix()))
-				second.SetSeverityNumber(pdata.SeverityNumberERROR)
+			for i := 0; i < 100; i++ {
+				record := lr.AppendEmpty()
+				record.SetTimestamp(pdata.Timestamp(time.Unix(1649400860, 0).Unix()))
+				record.SetSeverityNumber(pdata.SeverityNumberDEBUG)
+				ib := byte(i)
+				traceID := [16]byte{0, 0, 0, 0, 0, 0, 0, 0, ib, ib, ib, ib, ib, ib, ib, ib}
+				record.SetTraceID(pdata.NewTraceID(traceID))
+				// set half of the records with a foo attribute
+				if i%2 == 0 {
+					record.Attributes().InsertBytes("foo", traceID[:])
+				}
+				// set a fourth of the records with a priority attribute
+				if i%4 == 0 {
+					record.Attributes().InsertDouble("priority", 100)
+				}
 			}
 			err = processor.ConsumeLogs(context.Background(), logs)
 			require.NoError(t, err)
@@ -149,7 +184,11 @@ func TestLogsSampling(t *testing.T) {
 			if len(sunk) > 0 && sunk[0].ResourceLogs().Len() > 0 {
 				numReceived = sunk[0].ResourceLogs().At(0).ScopeLogs().At(0).LogRecords().Len()
 			}
-			assert.Equal(t, tt.received*5, numReceived)
+			assert.Equal(t, tt.received, numReceived)
 		})
 	}
 }
+
+func boolPtr(b bool) *bool {
+	return &b
+}
diff --git a/processor/probabilisticsamplerprocessor/testdata/config.yaml b/processor/probabilisticsamplerprocessor/testdata/config.yaml
index 5e8f19233d55..86568d5029cd 100644
--- a/processor/probabilisticsamplerprocessor/testdata/config.yaml
+++ b/processor/probabilisticsamplerprocessor/testdata/config.yaml
@@ -38,16 +38,12 @@ processors:
     # seeds at different layers ensures that sampling rate in each layer work as
     # intended.
     hash_seed: 22
-    # Severity filters override the default sampling_percentage sampling rate.
-    severity:
-      # Override the default sampling_percentage for all log records of severity of error or higher
-      # to keep all logs.
-      - sampling_percentage: 100
-        severity_level: error
-      # Override the default sampling_percentage for all log records of severity of warn or higher
-      # to keep 80% of logs. Note this doesn't override the error logs sampling rate.
-      - sampling_percentage: 80
-        severity_level: warn
+    # sampling_source allows using a log record attribute, designated here by the `foo` key,
+    # to compute the sampling hash of the log record instead of the trace ID, if the trace ID
+    # is absent or trace ID sampling is disabled.
+    sampling_source: "foo"
+    # sampling_priority allows using a log record attribute, designated here by the `bar` key,
+    # as the sampling priority of the log record.
+    sampling_priority: "bar"
 
 exporters:
   nop:
diff --git a/processor/probabilisticsamplerprocessor/testdata/invalid.yaml b/processor/probabilisticsamplerprocessor/testdata/invalid.yaml
index f252ae919cdb..ffd9b1e07d16 100644
--- a/processor/probabilisticsamplerprocessor/testdata/invalid.yaml
+++ b/processor/probabilisticsamplerprocessor/testdata/invalid.yaml
@@ -4,14 +4,8 @@ receivers:
 
 processors:
 
   probabilistic_sampler/logs:
-    sampling_percentage: 15.3
+    sampling_percentage: -15.3
     hash_seed: 22
-    severity:
-      - sampling_percentage: 100
-        severity_level: error
-      # Duplicate severity level sampling rate!
-      # Validation is expected to fail with "severity already used: error".
- - sampling_percentage: 80 - severity_level: error exporters: nop: From b139384765110dab7f42b90f2987796554259d48 Mon Sep 17 00:00:00 2001 From: Antoine Toulme Date: Mon, 11 Apr 2022 14:38:40 -0700 Subject: [PATCH 4/5] fix lint --- .../logprobabilisticsampler.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go b/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go index b4f609c84232..5142509ff2aa 100644 --- a/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go +++ b/processor/probabilisticsamplerprocessor/logprobabilisticsampler.go @@ -26,7 +26,7 @@ import ( type logsamplerprocessor struct { scaledSamplingRate uint32 hashSeed uint32 - traceIdEnabled bool + traceIDEnabled bool samplingSource string samplingPriority string } @@ -38,7 +38,7 @@ func newLogsProcessor(nextConsumer consumer.Logs, cfg *Config) (component.LogsPr lsp := &logsamplerprocessor{ scaledSamplingRate: uint32(cfg.SamplingPercentage * percentageScaleFactor), hashSeed: cfg.HashSeed, - traceIdEnabled: cfg.TraceIDEnabled == nil || *cfg.TraceIDEnabled, + traceIDEnabled: cfg.TraceIDEnabled == nil || *cfg.TraceIDEnabled, samplingPriority: cfg.SamplingPriority, samplingSource: cfg.SamplingSource, } @@ -57,7 +57,7 @@ func (lsp *logsamplerprocessor) processLogs(_ context.Context, ld pdata.Logs) (p // pick the sampling source. var lidBytes []byte - if lsp.traceIdEnabled && !l.TraceID().IsEmpty() { + if lsp.traceIDEnabled && !l.TraceID().IsEmpty() { value := l.TraceID().Bytes() lidBytes = value[:] } From 990212c7776f52c3ee60421eb5bb66083e2cedf1 Mon Sep 17 00:00:00 2001 From: Alex Boten Date: Thu, 14 Apr 2022 08:45:19 -0700 Subject: [PATCH 5/5] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8af241c60ced..095c974726b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,6 @@ ### 💡 Enhancements 💡 -- `cmd/mdatagen`: Update documentation generated for attributes to list enumerated values and show the "value" that will be visible on metrics when it is different from the attribute key in metadata.yaml (#8983) - `probabilistic_sampler`: Add ability to sample logs (#9118) ### 🧰 Bug fixes 🧰
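
A note for reviewers: the sampling decision in `processLogs` relies on helpers that already exist in this package's trace sampler and therefore never appear in these diffs: `hash`, `percentageScaleFactor`, and `bitMaskHashBuckets`. Below is a minimal, self-contained sketch of the bucketed decision for reference only. The bucket constants are assumptions chosen for illustration, and FNV-1a stands in for the processor's internal hash function, so the numbers it produces will not match the collector's.

```go
package main

import (
	"encoding/binary"
	"fmt"
	"hash/fnv"
)

// Bucket constants assumed to mirror the existing trace sampler's scheme:
// percentages are scaled into a fixed number of hash buckets.
const (
	numHashBuckets        = 0x4000 // 16384 buckets
	bitMaskHashBuckets    = numHashBuckets - 1
	percentageScaleFactor = numHashBuckets / 100.0
)

// keep reports whether a record identified by id survives sampling at the
// given percentage. FNV-1a is a stand-in for the processor's own hash.
func keep(id []byte, seed uint32, percentage float32) bool {
	var seedBytes [4]byte
	binary.BigEndian.PutUint32(seedBytes[:], seed)
	h := fnv.New32a()
	h.Write(seedBytes[:]) // the seed shifts bucket assignment between collector tiers
	h.Write(id)
	scaled := uint32(percentage * percentageScaleFactor)
	return h.Sum32()&bitMaskHashBuckets < scaled
}

func main() {
	kept := 0
	for i := 0; i < 10000; i++ {
		if keep([]byte(fmt.Sprintf("record-%d", i)), 22, 15.3) {
			kept++
		}
	}
	// Expect roughly 15.3% of the records to be kept.
	fmt.Printf("kept %d of 10000\n", kept)
}
```

The point of the scheme is determinism: for a fixed seed and percentage, a given ID always lands in the same bucket, so every collector behind the same load balancer makes the same keep-or-drop decision without coordination, while a different seed at another tier reshuffles records into new buckets.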