This repository was archived by the owner on Aug 23, 2023. It is now read-only.

Commit b88c3b8

Merge pull request #1572 from grafana/limit_datapoints_with_future_ts
Reject datapoints with timestamp too far in the future
2 parents f099822 + 66dc512

13 files changed: +296 -10 lines
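
The change, in essence: each AggMetric derives a tolerance window from the raw TTL of its first retention and the new future-tolerance-ratio setting, then counts (and, when enforcement is enabled, rejects) incoming points whose timestamp lies beyond now plus that window. A minimal standalone sketch of that logic, using illustrative names rather than metrictank's own types:

package main

import (
	"fmt"
	"time"
)

// toleranceSeconds mirrors how the commit derives the window:
// a percentage of the raw (first) retention's TTL.
func toleranceSeconds(rawTTLSeconds, ratioPercent uint32) uint32 {
	return rawTTLSeconds * ratioPercent / 100
}

// tooFarAhead reports whether ts lies beyond now + tolerance.
// the ts > tolerance guard avoids a uint32 wrap-around when tolerance > ts.
func tooFarAhead(ts, tolerance uint32, now int64) bool {
	return ts > tolerance && int64(ts-tolerance) > now
}

func main() {
	tol := toleranceSeconds(600, 10) // raw TTL of 10 minutes, ratio 10% -> 60s window
	now := time.Now().Unix()
	fmt.Println(tooFarAhead(uint32(now)+30, tol, now))  // false: within the window, accepted
	fmt.Println(tooFarAhead(uint32(now)+120, tol, now)) // true: counted, and dropped if enforcement is on
}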

docker/docker-chaos/metrictank.ini (+4)

@@ -180,6 +180,10 @@ create-cf = true
 schemas-file = /etc/metrictank/storage-schemas.conf
 # path to storage-aggregation.conf file
 aggregations-file = /etc/metrictank/storage-aggregation.conf
+# enables/disables the enforcement of the future tolerance limitation
+enforce-future-tolerance = true
+# defines how far into the future we accept datapoints, as a percentage of the raw TTL of the matching retention storage schema
+future-tolerance-ratio = 10
 
 ## instrumentation stats ##
 [stats]

docker/docker-cluster-query/metrictank.ini (+4)

@@ -180,6 +180,10 @@ create-cf = true
 schemas-file = /etc/metrictank/storage-schemas.conf
 # path to storage-aggregation.conf file
 aggregations-file = /etc/metrictank/storage-aggregation.conf
+# enables/disables the enforcement of the future tolerance limitation
+enforce-future-tolerance = true
+# defines how far into the future we accept datapoints, as a percentage of the raw TTL of the matching retention storage schema
+future-tolerance-ratio = 10
 
 ## instrumentation stats ##
 [stats]

docker/docker-cluster/metrictank.ini (+4)

@@ -180,6 +180,10 @@ create-cf = true
 schemas-file = /etc/metrictank/storage-schemas.conf
 # path to storage-aggregation.conf file
 aggregations-file = /etc/metrictank/storage-aggregation.conf
+# enables/disables the enforcement of the future tolerance limitation
+enforce-future-tolerance = true
+# defines how far into the future we accept datapoints, as a percentage of the raw TTL of the matching retention storage schema
+future-tolerance-ratio = 10
 
 ## instrumentation stats ##
 [stats]

docker/docker-dev-custom-cfg-kafka/metrictank.ini (+4)

@@ -180,6 +180,10 @@ create-cf = true
 schemas-file = /etc/metrictank/storage-schemas.conf
 # path to storage-aggregation.conf file
 aggregations-file = /etc/metrictank/storage-aggregation.conf
+# enables/disables the enforcement of the future tolerance limitation
+enforce-future-tolerance = true
+# defines how far into the future we accept datapoints, as a percentage of the raw TTL of the matching retention storage schema
+future-tolerance-ratio = 10
 
 ## instrumentation stats ##
 [stats]

docs/config.md (+4)

@@ -220,6 +220,10 @@ create-cf = true
 schemas-file = /etc/metrictank/storage-schemas.conf
 # path to storage-aggregation.conf file
 aggregations-file = /etc/metrictank/storage-aggregation.conf
+# enables/disables the enforcement of the future tolerance limitation
+enforce-future-tolerance = true
+# defines how far into the future we accept datapoints, as a percentage of the raw TTL of the matching retention storage schema
+future-tolerance-ratio = 10
 ```
 
 ## instrumentation stats ##
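
For a concrete sense of scale: the window is raw TTL * future-tolerance-ratio / 100, evaluated per metric against the first (raw) retention of its matching storage schema. With the retention string used in the new unit test, 1s:10m:6h:5:true, the raw TTL is 600s, so the default ratio of 10 yields a 60-second window; the new aggmetrics test instead uses a one-year raw TTL with a ratio of 50, which yields roughly half a year.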

docs/metrics.md (+9)

@@ -370,6 +370,10 @@ your (infrequent) updates. Any points revcieved for a chunk that has already be
 * `tank.discarded.sample-out-of-order`:
 points that go back in time beyond the scope of the optional reorder window.
 these points will end up being dropped and lost.
+* `tank.discarded.sample-too-far-ahead`:
+count of points which got discarded because their timestamp
+is too far in the future, beyond the limitation of the future tolerance window defined via the
+retention.future-tolerance-ratio parameter.
 * `tank.discarded.unknown`:
 points that have been discarded for unknown reasons.
 * `tank.gc_metric`:
@@ -384,6 +388,11 @@ ts is not older than the 60th datapoint counting from the newest.
 * `tank.persist`:
 how long it takes to persist a chunk (and chunks preceding it)
 this is subject to backpressure from the store when the store's queue runs full
+* `tank.sample-too-far-ahead`:
+count of points with a timestamp which is too far in the future,
+beyond the limitation of the future tolerance window defined via the retention.future-tolerance-ratio
+parameter. it also gets increased if the enforcement of the future tolerance is disabled; this is
+useful for predicting whether data points would get rejected once enforcement gets turned on.
 * `tank.total_points`:
 the number of points currently held in the in-memory ringbuffer
 * `version.%s`:
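
The relationship between these two counters, made explicit: tank.sample-too-far-ahead increments for every offending point regardless of configuration, while tank.discarded.sample-too-far-ahead increments only when enforcement is enabled and the point is actually dropped. A rough sketch of that bookkeeping, with plain integers standing in for the stats counters (names here are hypothetical, not metrictank's API):

package main

import "fmt"

// sketch only: plain ints standing in for metrictank's stats counters
var sampleTooFarAheadCount, discardedTooFarAheadCount int

// recordTooFarAhead does the bookkeeping for one offending point and
// returns true when the point should actually be dropped.
func recordTooFarAhead(enforce bool) bool {
	sampleTooFarAheadCount++ // always counted, so the impact can be predicted before enabling enforcement
	if enforce {
		discardedTooFarAheadCount++
		return true
	}
	return false
}

func main() {
	recordTooFarAhead(false) // enforcement off: counted, not dropped
	recordTooFarAhead(true)  // enforcement on: counted and dropped
	fmt.Println(sampleTooFarAheadCount, discardedTooFarAheadCount) // 2 1
}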

mdata/aggmetric.go (+26 -8)

@@ -43,6 +43,7 @@ type AggMetric struct {
 	aggregators []*Aggregator
 	dropFirstChunk bool
 	ingestFromT0 uint32
+	futureTolerance uint32
 	ttl uint32
 	lastSaveStart uint32 // last chunk T0 that was added to the write Queue.
 	lastWrite uint32 // wall clock time of when last point was successfully added (possibly to the ROB)
@@ -61,14 +62,15 @@ func NewAggMetric(store Store, cachePusher cache.CachePusher, key schema.AMKey,
 	ret := retentions.Rets[0]
 
 	m := AggMetric{
-		cachePusher:    cachePusher,
-		store:          store,
-		key:            key,
-		chunkSpan:      ret.ChunkSpan,
-		numChunks:      ret.NumChunks,
-		chunks:         make([]*chunk.Chunk, 0, ret.NumChunks),
-		dropFirstChunk: dropFirstChunk,
-		ttl:            uint32(ret.MaxRetention()),
+		cachePusher:     cachePusher,
+		store:           store,
+		key:             key,
+		chunkSpan:       ret.ChunkSpan,
+		numChunks:       ret.NumChunks,
+		chunks:          make([]*chunk.Chunk, 0, ret.NumChunks),
+		dropFirstChunk:  dropFirstChunk,
+		futureTolerance: uint32(ret.MaxRetention()) * uint32(futureToleranceRatio) / 100,
+		ttl:             uint32(ret.MaxRetention()),
 		// we set LastWrite here to make sure a new Chunk doesn't get immediately
 		// garbage collected right after creating it, before we can push to it.
 		lastWrite: uint32(time.Now().Unix()),
@@ -442,6 +444,22 @@ func (a *AggMetric) Add(ts uint32, val float64) {
 		return
 	}
 
+	// check ts > futureTolerance first so we don't reject a datapoint just because
+	// ts - futureTolerance has wrapped around the uint32 boundary
+	if ts > a.futureTolerance && int64(ts-a.futureTolerance) > time.Now().Unix() {
+		sampleTooFarAhead.Inc()
+
+		if enforceFutureTolerance {
+			if log.IsLevelEnabled(log.DebugLevel) {
+				log.Debugf("AM: discarding metric <%d,%f>: timestamp is too far in the future, accepting timestamps up to %d seconds into the future", ts, val, a.futureTolerance)
+			}
+
+			discardedSampleTooFarAhead.Inc()
+			PromDiscardedSamples.WithLabelValues(tooFarAhead, strconv.Itoa(int(a.key.MKey.Org))).Inc()
+			return
+		}
+	}
+
 	a.Lock()
 	defer a.Unlock()
 
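One detail in that condition worth spelling out: ts and futureTolerance are both uint32, so if futureTolerance exceeded ts, the subtraction would wrap around to a huge value and a point from the distant past could be mistaken for one far in the future. A small illustration of the wrap-around that the ts > a.futureTolerance guard prevents (numbers chosen to match the new aggmetrics test, where the tolerance is half a year):

package main

import "fmt"

func main() {
	var ts uint32 = 1000            // a bogus, very old timestamp
	var tolerance uint32 = 15768000 // 50% of a one-year raw TTL

	// without the guard, ts - tolerance wraps around the uint32 boundary:
	fmt.Println(ts - tolerance)        // 4279200296
	fmt.Println(int64(ts - tolerance)) // still 4279200296, far larger than any current unix timestamp
}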
mdata/aggmetric_test.go

+103
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,109 @@ func TestAggMetricIngestFrom(t *testing.T) {
334334
}
335335
}
336336

337+
// TestAggMetricFutureTolerance tests whether the future tolerance limit works correctly
338+
// there is a race condition because it depends on the return value of time.Now().Unix(),
339+
// realistically it should never fail due to that race condition unless it executes
340+
// unreasonably slow.
341+
func TestAggMetricFutureTolerance(t *testing.T) {
342+
cluster.Init("default", "test", time.Now(), "http", 6060)
343+
cluster.Manager.SetPrimary(true)
344+
mockstore.Reset()
345+
ret := conf.MustParseRetentions("1s:10m:6h:5:true")
346+
347+
_futureToleranceRatio := futureToleranceRatio
348+
_enforceFutureTolerance := enforceFutureTolerance
349+
discardedSampleTooFarAhead.SetUint32(0)
350+
sampleTooFarAhead.SetUint32(0)
351+
defer func() {
352+
futureToleranceRatio = _futureToleranceRatio
353+
enforceFutureTolerance = _enforceFutureTolerance
354+
discardedSampleTooFarAhead.SetUint32(0)
355+
sampleTooFarAhead.SetUint32(0)
356+
}()
357+
358+
// with a raw retention of 600s, this will result in a future tolerance of 60s
359+
futureToleranceRatio = 10
360+
aggMetricTolerate60 := NewAggMetric(mockstore, &cache.MockCache{}, test.GetAMKey(42), ret, 0, 1, nil, false, false, 0)
361+
362+
// will not tolerate future datapoints at all
363+
futureToleranceRatio = 0
364+
aggMetricTolerate0 := NewAggMetric(mockstore, &cache.MockCache{}, test.GetAMKey(42), ret, 0, 1, nil, false, false, 0)
365+
366+
// add datapoint which is 30 seconds in the future to both aggmetrics, they should both accept it
367+
// because enforcement of future tolerance is disabled, but the one with tolerance 0 should increase
368+
// the counter of data points that would have been rejected
369+
enforceFutureTolerance = false
370+
aggMetricTolerate60.Add(uint32(time.Now().Unix()+30), 10)
371+
if len(aggMetricTolerate60.chunks) != 1 {
372+
t.Fatalf("expected to have 1 chunk in aggmetric, but there were %d", len(aggMetricTolerate60.chunks))
373+
}
374+
if sampleTooFarAhead.Peek() != 0 {
375+
t.Fatalf("expected the sampleTooFarAhead count to be 0, but it was %d", sampleTooFarAhead.Peek())
376+
}
377+
if discardedSampleTooFarAhead.Peek() != 0 {
378+
t.Fatalf("expected the discardedSampleTooFarAhead count to be 0, but it was %d", discardedSampleTooFarAhead.Peek())
379+
}
380+
381+
aggMetricTolerate0.Add(uint32(time.Now().Unix()+30), 10)
382+
if len(aggMetricTolerate0.chunks) != 1 {
383+
t.Fatalf("expected to have 1 chunk in aggmetric, but there were %d", len(aggMetricTolerate0.chunks))
384+
}
385+
if sampleTooFarAhead.Peek() != 1 {
386+
t.Fatalf("expected the sampleTooFarAhead count to be 1, but it was %d", sampleTooFarAhead.Peek())
387+
}
388+
if discardedSampleTooFarAhead.Peek() != 0 {
389+
t.Fatalf("expected the discardedSampleTooFarAhead count to be 0, but it was %d", discardedSampleTooFarAhead.Peek())
390+
}
391+
392+
// enable the enforcement of the future tolerance limit and re-initialize the two agg metrics
393+
// then add a data point with time stamp 30 sec in the future to both aggmetrics again.
394+
// this time only the one that tolerates up to 60 secs should accept the datapoint.
395+
discardedSampleTooFarAhead.SetUint32(0)
396+
sampleTooFarAhead.SetUint32(0)
397+
enforceFutureTolerance = true
398+
futureToleranceRatio = 10
399+
aggMetricTolerate60 = NewAggMetric(mockstore, &cache.MockCache{}, test.GetAMKey(42), ret, 0, 1, nil, false, false, 0)
400+
futureToleranceRatio = 0
401+
aggMetricTolerate0 = NewAggMetric(mockstore, &cache.MockCache{}, test.GetAMKey(42), ret, 0, 1, nil, false, false, 0)
402+
403+
aggMetricTolerate60.Add(uint32(time.Now().Unix()+30), 10)
404+
if len(aggMetricTolerate60.chunks) != 1 {
405+
t.Fatalf("expected to have 1 chunk in aggmetric, but there were %d", len(aggMetricTolerate60.chunks))
406+
}
407+
if sampleTooFarAhead.Peek() != 0 {
408+
t.Fatalf("expected the sampleTooFarAhead count to be 0, but it was %d", sampleTooFarAhead.Peek())
409+
}
410+
if discardedSampleTooFarAhead.Peek() != 0 {
411+
t.Fatalf("expected the discardedSampleTooFarAhead count to be 0, but it was %d", discardedSampleTooFarAhead.Peek())
412+
}
413+
414+
aggMetricTolerate0.Add(uint32(time.Now().Unix()+30), 10)
415+
if len(aggMetricTolerate0.chunks) != 0 {
416+
t.Fatalf("expected to have 0 chunks in aggmetric, but there were %d", len(aggMetricTolerate0.chunks))
417+
}
418+
if sampleTooFarAhead.Peek() != 1 {
419+
t.Fatalf("expected the sampleTooFarAhead count to be 1, but it was %d", sampleTooFarAhead.Peek())
420+
}
421+
if discardedSampleTooFarAhead.Peek() != 1 {
422+
t.Fatalf("expected the discardedSampleTooFarAhead count to be 1, but it was %d", discardedSampleTooFarAhead.Peek())
423+
}
424+
425+
// add another datapoint with timestamp of now() to the aggmetric tolerating 0, should be accepted
426+
discardedSampleTooFarAhead.SetUint32(0)
427+
sampleTooFarAhead.SetUint32(0)
428+
aggMetricTolerate0.Add(uint32(time.Now().Unix()), 10)
429+
if len(aggMetricTolerate0.chunks) != 1 {
430+
t.Fatalf("expected to have 1 chunk in aggmetric, but there were %d", len(aggMetricTolerate0.chunks))
431+
}
432+
if sampleTooFarAhead.Peek() != 0 {
433+
t.Fatalf("expected the sampleTooFarAhead count to be 0, but it was %d", sampleTooFarAhead.Peek())
434+
}
435+
if discardedSampleTooFarAhead.Peek() != 0 {
436+
t.Fatalf("expected the discardedSampleTooFarAhead count to be 0, but it was %d", discardedSampleTooFarAhead.Peek())
437+
}
438+
}
439+
337440
func itersToPoints(iters []tsz.Iter) []schema.Point {
338441
var points []schema.Point
339442
for _, it := range iters {
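
To exercise just this test locally, something along the lines of the following should work from the repository root:

go test ./mdata -run TestAggMetricFutureTolerance -v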

mdata/aggmetrics_test.go (new file, +107)

@@ -0,0 +1,107 @@
+package mdata
+
+import (
+	"testing"
+	"time"
+
+	"github.com/grafana/metrictank/conf"
+	"github.com/grafana/metrictank/mdata/cache"
+	"github.com/grafana/metrictank/mdata/chunk"
+	"github.com/grafana/metrictank/schema"
+)
+
+type mockCachePusher struct{}
+
+func (m *mockCachePusher) AddIfHot(_ schema.AMKey, _ uint32, _ chunk.IterGen) {}
+
+func NewMockCachePusher() cache.CachePusher {
+	return &mockCachePusher{}
+}
+
+func TestAggMetricsGetOrCreate(t *testing.T) {
+	mockStore := NewMockStore()
+	mockCachePusher := NewMockCachePusher()
+	ingestFrom := make(map[uint32]int64)
+	chunkMaxStale := uint32(60)
+	metricMaxStale := uint32(120)
+	gcInterval := time.Hour
+
+	_futureToleranceRatio := futureToleranceRatio
+	_aggregations := Aggregations
+	_schemas := Schemas
+	defer func() {
+		futureToleranceRatio = _futureToleranceRatio
+		Aggregations = _aggregations
+		Schemas = _schemas
+	}()
+
+	futureToleranceRatio = 50
+	Aggregations = conf.NewAggregations()
+	Schemas = conf.NewSchemas([]conf.Schema{{
+		Name: "schema1",
+		Retentions: conf.Retentions{
+			Rets: []conf.Retention{
+				{
+					SecondsPerPoint: 10,
+					NumberOfPoints:  360 * 24,
+					ChunkSpan:       600,
+					NumChunks:       2,
+					Ready:           0,
+				}, {
+					SecondsPerPoint: 3600,
+					NumberOfPoints:  24 * 365,
+					ChunkSpan:       24 * 3600,
+					NumChunks:       2,
+					Ready:           0,
+				}},
+		},
+	}})
+
+	aggMetrics := NewAggMetrics(mockStore, mockCachePusher, false, ingestFrom, chunkMaxStale, metricMaxStale, gcInterval)
+
+	testKey1, _ := schema.AMKeyFromString("1.12345678901234567890123456789012")
+	metric := aggMetrics.GetOrCreate(testKey1.MKey, 1, 0, 10).(*AggMetric)
+
+	if metric.store != mockStore {
+		t.Fatalf("Expected metric to have mock store, but it did not")
+	}
+
+	if metric.cachePusher != mockCachePusher {
+		t.Fatalf("Expected metric to have mock cache pusher, but it did not")
+	}
+
+	if metric.key.MKey != testKey1.MKey {
+		t.Fatalf("Expected metric to have test metric key, but it did not")
+	}
+
+	if metric.chunkSpan != 24*3600 {
+		t.Fatalf("Expected metric chunk span to be %d, but it was %d", 24*3600, metric.chunkSpan)
+	}
+
+	if metric.numChunks != 2 {
+		t.Fatalf("Expected metric num chunks to be 2, but it was %d", metric.numChunks)
+	}
+
+	if metric.ttl != 3600*24*365 {
+		t.Fatalf("Expected metric ttl to be %d, but it was %d", 3600*24*365, metric.ttl)
+	}
+
+	// storage schema's maxTTL is 1 year, future tolerance ratio is 50, so our future tolerance should be 1/2 year
+	expectedFutureTolerance := uint32(3600 * 24 * 365 * futureToleranceRatio / 100)
+	if metric.futureTolerance != expectedFutureTolerance {
+		t.Fatalf("Expected future tolerance to be %d, was %d", expectedFutureTolerance, metric.futureTolerance)
+	}
+
+	// verify that two calls to GetOrCreate with the same parameters return the same struct
+	metric2 := aggMetrics.GetOrCreate(testKey1.MKey, 1, 0, 10).(*AggMetric)
+	if metric != metric2 {
+		t.Fatalf("Expected GetOrCreate to return the same metric twice for the same key")
+	}
+
+	futureToleranceRatio = 0
+	testKey2, _ := schema.AMKeyFromString("1.12345678901234567890123456789013")
+	metric3 := aggMetrics.GetOrCreate(testKey2.MKey, 1, 0, 10).(*AggMetric)
+	if metric3.futureTolerance != 0 {
+		t.Fatalf("Future tolerance was expected to be 0, but it was %d", metric3.futureTolerance)
+	}
+}

mdata/init.go (+18 -2)

@@ -20,6 +20,7 @@ const (
 	sampleOutOfOrder     = "sample-out-of-order"
 	receivedTooLate      = "received-too-late"
 	newValueForTimestamp = "new-value-for-timestamp"
+	tooFarAhead          = "too-far-in-future"
 )
 
 var (
@@ -39,6 +40,17 @@ var (
 	// these points will end up being dropped and lost.
 	discardedSampleOutOfOrder = stats.NewCounterRate32("tank.discarded.sample-out-of-order")
 
+	// metric tank.discarded.sample-too-far-ahead is count of points which got discarded because their timestamp
+	// is too far in the future, beyond the limitation of the future tolerance window defined via the
+	// retention.future-tolerance-ratio parameter.
+	discardedSampleTooFarAhead = stats.NewCounterRate32("tank.discarded.sample-too-far-ahead")
+
+	// metric tank.sample-too-far-ahead is count of points with a timestamp which is too far in the future,
+	// beyond the limitation of the future tolerance window defined via the retention.future-tolerance-ratio
+	// parameter. it also gets increased if the enforcement of the future tolerance is disabled; this is
+	// useful for predicting whether data points would get rejected once enforcement gets turned on.
+	sampleTooFarAhead = stats.NewCounterRate32("tank.sample-too-far-ahead")
+
 	// metric tank.discarded.received-too-late is points received for the most recent chunk
 	// when that chunk is already being "closed", ie the end-of-stream marker has been written to the chunk.
 	// this indicates that your GC is actively sealing chunks and saving them before you have the chance to send
@@ -84,8 +96,10 @@ var (
 	Aggregations conf.Aggregations
 	Schemas      conf.Schemas
 
-	schemasFile = "/etc/metrictank/storage-schemas.conf"
-	aggFile     = "/etc/metrictank/storage-aggregation.conf"
+	schemasFile            = "/etc/metrictank/storage-schemas.conf"
+	aggFile                = "/etc/metrictank/storage-aggregation.conf"
+	futureToleranceRatio   = uint(10)
+	enforceFutureTolerance = true
 
 	promActiveMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: "metrictank",
@@ -104,6 +118,8 @@ func ConfigSetup() {
 	retentionConf := flag.NewFlagSet("retention", flag.ExitOnError)
 	retentionConf.StringVar(&schemasFile, "schemas-file", "/etc/metrictank/storage-schemas.conf", "path to storage-schemas.conf file")
 	retentionConf.StringVar(&aggFile, "aggregations-file", "/etc/metrictank/storage-aggregation.conf", "path to storage-aggregation.conf file")
+	retentionConf.UintVar(&futureToleranceRatio, "future-tolerance-ratio", 10, "defines how far into the future we accept datapoints, as a percentage of the raw TTL of the matching retention storage schema")
+	retentionConf.BoolVar(&enforceFutureTolerance, "enforce-future-tolerance", true, "enables/disables the enforcement of the future tolerance limitation")
 	globalconf.Register("retention", retentionConf, flag.ExitOnError)
 }