This repository was archived by the owner on Aug 23, 2023. It is now read-only.

Commit 2d12b85

read from first chunks
This is an old optimization (?) that has been with us for a long time: #74 2029113

Here's how it caused data loss at read time:

- When only 1 chunk of data had been filled: the "update" of the field is a no-op because len(chunks) == 1, so oldPos goes back to 0 (not sure if intentional or a bug), so reading the first chunk worked.
- Once you have more than 1 chunk: the update of oldPos works and we start hitting cassandra. Depending on how long the chunk takes to get saved to cassandra, we will miss data at read time. Also, our chunk cache does not cache absence of data, hitting cassandra harder during this period.
- Once the chunk is saved to cassandra, the problem disappears.
- Once the circular buffer recycles the first time (effectively removing the first chunk), this optimization no longer applies, but at that point we still hit cassandra just as before.

This problem is now solved. However, removing that code enables another avenue for data loss at read time:

- when a read node starts (without data backfill)
- or a read node starts with data backfill, but the backfill doesn't have old data for the particular metric, IOW when the data only covers 1 chunk's worth
- or a read node starts with data backfill, but since backfilling starts at arbitrary positions, the first chunk will miss some data in the beginning.

In all these cases, the first chunk is a partial chunk, whereas a full version of the chunk is most likely already in cassandra. To make sure this is not a problem, if the first chunk we used was partial, we set oldest to the first timestamp, so that the rest can be retrieved from cassandra. Typically, this will cause the "same" chunk (but a full version) to be retrieved from cassandra, which is then cached and seamlessly merged via Fix().
1 parent 34febb0 commit 2d12b85
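
To make the fix concrete, here is a minimal, self-contained Go sketch of the decision Get() now makes when reporting how far back the in-memory data can be trusted. The function oldestServable and its parameters are illustrative stand-ins, not the actual code; the real logic operates on oldestChunk.First, a.firstTs and oldestChunk.T0, as shown in the mdata/aggmetric.go diff below.

package main

import "fmt"

// oldestServable sketches the new rule: if the oldest chunk we can serve from
// memory is the special "first" chunk, it may be missing points between its T0
// and the first timestamp that was actually pushed, so we only vouch for data
// back to firstTs and let anything older be retrieved from cassandra.
func oldestServable(chunkT0, firstTs uint32, isFirstChunk bool) uint32 {
	if isFirstChunk {
		return firstTs
	}
	return chunkT0
}

func main() {
	// a first chunk with t0=600 whose earliest pushed point is ts=630:
	fmt.Println(oldestServable(600, 630, true))  // 630: points older than 630 must come from cassandra
	fmt.Println(oldestServable(600, 630, false)) // 600: a non-first chunk is complete from its t0
}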

File tree

3 files changed: +28 -30 lines changed


mdata/aggmetric.go (+17 -24)

@@ -38,11 +38,11 @@ type AggMetric struct {
 	Chunks         []*chunk.Chunk
 	aggregators    []*Aggregator
 	dropFirstChunk bool
-	firstChunkT0   uint32
 	ttl            uint32
 	lastSaveStart  uint32 // last chunk T0 that was added to the write Queue.
 	lastSaveFinish uint32 // last chunk T0 successfully written to Cassandra.
 	lastWrite      uint32
+	firstTs        uint32
 }

 // NewAggMetric creates a metric with given key, it retains the given number of chunks each chunkSpan seconds long
@@ -268,27 +268,16 @@ func (a *AggMetric) Get(from, to uint32) (Result, error) {
 		return result, ErrNilChunk
 	}

-	// The first chunk is likely only a partial chunk. If we are not the primary node
-	// we should not serve data from this chunk, and should instead get the chunk from cassandra.
-	// if we are the primary node, then there is likely no data in Cassandra anyway.
-	if !cluster.Manager.IsPrimary() && oldestChunk.T0 == a.firstChunkT0 {
-		oldestPos++
-		if oldestPos >= len(a.Chunks) {
-			oldestPos = 0
-		}
-		oldestChunk = a.getChunk(oldestPos)
-		if oldestChunk == nil {
-			log.Error(3, "%s", ErrNilChunk)
-			return result, ErrNilChunk
-		}
-	}
-
 	if to <= oldestChunk.T0 {
 		// the requested time range ends before any data we have.
 		if LogLevel < 2 {
 			log.Debug("AM %s Get(): no data for requested range", a.Key)
 		}
-		result.Oldest = oldestChunk.T0
+		if oldestChunk.First {
+			result.Oldest = a.firstTs
+		} else {
+			result.Oldest = oldestChunk.T0
+		}
 		return result, nil
 	}

@@ -342,8 +331,13 @@ func (a *AggMetric) Get(from, to uint32) (Result, error) {
 		}
 	}

+	if oldestChunk.First {
+		result.Oldest = a.firstTs
+	} else {
+		result.Oldest = oldestChunk.T0
+	}
+
 	memToIterDuration.Value(time.Now().Sub(pre))
-	result.Oldest = oldestChunk.T0
 	return result, nil
 }

@@ -483,12 +477,11 @@ func (a *AggMetric) add(ts uint32, val float64) {

 	if len(a.Chunks) == 0 {
 		chunkCreate.Inc()
-		// no data has been added to this metric at all.
-		a.Chunks = append(a.Chunks, chunk.New(t0))
-
-		// The first chunk is typically going to be a partial chunk
-		// so we keep a record of it.
-		a.firstChunkT0 = t0
+		// no data has been added to this AggMetric yet.
+		// note that we may not be aware of prior data that belongs into this chunk
+		// so we should track this cutoff point
+		a.Chunks = append(a.Chunks, chunk.NewFirst(t0))
+		a.firstTs = ts

 		if err := a.Chunks[0].Push(ts, val); err != nil {
 			panic(fmt.Sprintf("FATAL ERROR: this should never happen. Pushing initial value <%d,%f> to new chunk at pos 0 failed: %q", ts, val, err))

mdata/chunk/chunk.go (+10 -5)

@@ -15,20 +15,25 @@ type Chunk struct {
 	tsz.Series
 	LastTs    uint32 // last TS seen, not computed or anything
 	NumPoints uint32
+	First     bool
 	Closed    bool
 }

 func New(t0 uint32) *Chunk {
 	return &Chunk{
-		Series:    *tsz.New(t0),
-		LastTs:    0,
-		NumPoints: 0,
-		Closed:    false,
+		Series: *tsz.New(t0),
+	}
+}
+
+func NewFirst(t0 uint32) *Chunk {
+	return &Chunk{
+		Series: *tsz.New(t0),
+		First:  true,
 	}
 }

 func (c *Chunk) String() string {
-	return fmt.Sprintf("<chunk T0=%d, LastTs=%d, NumPoints=%d, Closed=%t>", c.T0, c.LastTs, c.NumPoints, c.Closed)
+	return fmt.Sprintf("<chunk T0=%d, LastTs=%d, NumPoints=%d, First=%t, Closed=%t>", c.T0, c.LastTs, c.NumPoints, c.First, c.Closed)

 }
 func (c *Chunk) Push(t uint32, v float64) error {

mdata/result.go (+1 -1)

@@ -8,5 +8,5 @@ import (
 type Result struct {
 	Points []schema.Point
 	Iters  []chunk.Iter
-	Oldest uint32
+	Oldest uint32 // timestamp of oldest point we have, to know when and when not we may need to query slower storage
 }
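
For context on how Oldest is meant to be consumed, here is a hypothetical, self-contained sketch (not part of this commit; Point, readResult, fetchFromCassandra and getSeries are stand-ins for the real types and read path): if the in-memory result cannot vouch for the full requested range, the remainder is read from slower storage and prepended, which is how a full copy of a partial first chunk would end up being fetched and then merged/deduplicated (via Fix() in metrictank).

package main

import "fmt"

// Point and readResult are simplified stand-ins for schema.Point and mdata.Result.
type Point struct {
	Val float64
	Ts  uint32
}

type readResult struct {
	Points []Point
	Oldest uint32 // oldest timestamp the in-memory data can vouch for
}

// fetchFromCassandra is a hypothetical stand-in for the chunk-store read path.
func fetchFromCassandra(key string, from, to uint32) []Point {
	// would read (and cache) chunks covering [from, to) from cassandra
	return nil
}

// getSeries shows the intended use of Oldest: anything older than it is
// fetched from slower storage and prepended; overlapping points from the
// partial first chunk and its full copy get merged/deduplicated afterwards.
func getSeries(key string, mem readResult, from, to uint32) []Point {
	points := mem.Points
	if mem.Oldest > from {
		older := fetchFromCassandra(key, from, mem.Oldest)
		points = append(older, points...)
	}
	return points
}

func main() {
	mem := readResult{Points: []Point{{Val: 1.5, Ts: 630}}, Oldest: 630}
	fmt.Println(getSeries("some.metric.key", mem, 600, 660))
}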
