
Commit f99dfef

revisit copy-on-write series processing
* update docs to include tags and meta section
* fix a bunch of bugs where previously data wasn't being deeply copied
* cleanly copy the meta section where needed
* clean up code with some helper functions
1 parent 1d05ea2 commit f99dfef

20 files changed: +101 -70 lines

api/models/series.go (+32 -8)

@@ -16,9 +16,9 @@ import (
 //go:generate msgp
 
 type Series struct {
-	Target     string // for fetched data, set from models.Req.Target, i.e. the metric graphite key. for function output, whatever should be shown as target string (legend)
-	Datapoints []schema.Point
+	Target     string            // for fetched data, set from models.Req.Target, i.e. the metric graphite key. for function output, whatever should be shown as target string (legend)
 	Tags       map[string]string // Must be set initially via call to `SetTags()`
+	Datapoints []schema.Point
 	Interval   uint32
 	QueryPatt  string // to tie series back to request it came from. e.g. foo.bar.*, or if series outputted by func it would be e.g. scale(foo.bar.*,0.123456)
 	QueryFrom  uint32 // to tie series back to request it came from
@@ -163,26 +163,50 @@ func (s *Series) buildTargetFromTags() {
 	s.Target = buf.String()
 }
 
+// Copy returns a deep copy.
+// The returned value does not link to the same memory space for any of the properties
 func (s Series) Copy(emptyDatapoints []schema.Point) Series {
-	newSeries := Series{
+	return Series{
 		Target:       s.Target,
-		Datapoints:   emptyDatapoints,
-		Tags:         make(map[string]string, len(s.Tags)),
+		Datapoints:   append(emptyDatapoints, s.Datapoints...),
+		Tags:         s.CopyTags(),
 		Interval:     s.Interval,
 		QueryPatt:    s.QueryPatt,
 		QueryFrom:    s.QueryFrom,
 		QueryTo:      s.QueryTo,
 		QueryCons:    s.QueryCons,
 		Consolidator: s.Consolidator,
+		Meta:         s.Meta.Copy(),
 	}
+}
 
-	newSeries.Datapoints = append(newSeries.Datapoints, s.Datapoints...)
+// CopyBare returns a bare copy.
+// The returned value does not link to the same memory space for any of the properties
+// because it resets all reference types
+func (s Series) CopyBare() Series {
+	s.Datapoints = nil
+	s.Tags = nil
+	s.Meta = nil
+	return s
+}
 
+// CopyTags makes a deep copy of the tags
+func (s *Series) CopyTags() map[string]string {
+	out := make(map[string]string, len(s.Tags))
 	for k, v := range s.Tags {
-		newSeries.Tags[k] = v
+		out[k] = v
 	}
+	return out
+}
 
-	return newSeries
+// CopyTagsWith makes a deep copy of the tags and sets the given tag
+func (s *Series) CopyTagsWith(key, val string) map[string]string {
+	out := make(map[string]string, len(s.Tags)+1)
+	for k, v := range s.Tags {
+		out[k] = v
+	}
+	out[key] = val
+	return out
}
 
 type SeriesByTarget []Series
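
To illustrate the intended call pattern of these helpers, here is a minimal sketch of a copy-on-write processing step. This is hypothetical: `doubleSeries` is not part of this commit, and the import paths are assumptions about metrictank's layout at the time.

    package expr

    import (
        "fmt"

        "github.com/grafana/metrictank/api/models" // assumed import path
        "github.com/raintank/schema"               // assumed; wherever schema.Point lives
    )

    // doubleSeries is a made-up processing step: it returns a series whose
    // datapoints are doubled, without mutating the input (copy-on-write).
    func doubleSeries(in models.Series, points []schema.Point) models.Series {
        out := in.CopyBare() // value copy with all reference types reset
        out.Target = fmt.Sprintf("double(%s)", in.Target)
        out.QueryPatt = fmt.Sprintf("double(%s)", in.QueryPatt)
        out.Tags = in.CopyTagsWith("double", "1") // fresh map; input tags untouched
        out.Meta = in.Meta                        // meta is unchanged, so sharing is fine
        out.Datapoints = points                   // typically pointSlicePool.Get().([]schema.Point)
        for _, p := range in.Datapoints {         // p is a value copy of each point
            p.Val *= 2
            out.Datapoints = append(out.Datapoints, p)
        }
        return out
    }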

devdocs/expr.md (+12 -8)

@@ -1,19 +1,23 @@
 ## Management of point slices
 
-The `models.Series` attribute `Datapoints []schema.Point` needs special atention:
+The `models.Series` type, even when passed by value, has a few fields that need special attention:
+* `Datapoints []schema.Point`
+* `Tags map[string]string`
+* `Meta SeriesMeta`
 
-many processing functions will transform some of the points in datapoint slices. logically speaking, some output values are different than their input values,
-while some may remain the same. they need a place to store their output.
+Many processing functions want to return an output series that differs from the input, in that (some of) the datapoints may have changed value, tags or metadata.
+They need a place to store their output, but we cannot simply operate on the input series, or even on a copy of it, as the underlying datastructures are shared.
 
 Goals:
-* processing functions should not modify data in slices if those slices need to remain original (e.g. because they're re-used later)
-* minimize allocations of new slices foremost and data copying (if point in= point out) as a smaller concern
+* processing functions should not modify data if that data needs to remain original (e.g. because of re-use of the same input data elsewhere)
+* minimize allocations of new structures foremost
+* minimize data copying as a smaller concern
 * simple code
 
 there's 2 main choices:
 
 1) copy-on-write:
-  - each function does not modify data in their inputs, they allocate new slices (or better: get from pool) in which they should store their output point values
+  - each function does not modify data in its inputs; it allocates new structures (or possibly gets them from the pool) if there are differences with the input
   - storing output data into new slice can typically be done in same pass as processing the input data
   - if you have lots of processing steps (graphite function calls) in a row, we will be creating more slices and copy data (for unmodified points) than strictly necessary.
   - getting a slice from the pool may cause a stall if it's not large enough and runtime needs to re-allocate and copy
@@ -42,8 +46,8 @@ e.g. an avg of 3 series will create 1 new series (from pool), but won't put the
 another processing step may require the same input data.
 
 function implementations:
-* must not modify existing slices
-* should use the pool to get new slices in which to store their new/modified data.
+* must not modify existing slices or maps or other composite datastructures (at the time of writing, it's only slices/maps)
+* should use the pool to get new slices in which to store their new/modified datapoints.
 * should add said new slices into the cache so it can later be cleaned
 
 ## consolidateBy
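
These rules can be illustrated with a function that only renames a series: it must not mutate the input's tag map (which may be shared with other consumers), while the untouched datapoints and meta can be shared freely. A minimal sketch, assuming the `CopyTagsWith` helper introduced in this commit and the `models` import from the previous sketch; `renameSeries` itself is hypothetical:

    // renameSeries follows the COW rules above: Datapoints and Meta are not
    // modified, so the output may share them with the input; only the tag map,
    // which does change, is deep-copied via CopyTagsWith.
    func renameSeries(in models.Series, name string) models.Series {
        out := in // value copy: Datapoints, Tags, Meta still reference shared memory
        out.Target = name
        out.QueryPatt = name
        out.Tags = in.CopyTagsWith("name", name) // fresh map; input stays untouched
        return out
    }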

expr/func_absolute.go (+4 -8)

@@ -35,18 +35,14 @@ func (s *FuncAbsolute) Exec(cache map[Req][]models.Series) ([]models.Series, err
 	for i, serie := range series {
 		transformed := &out[i]
 		transformed.Target = fmt.Sprintf("absolute(%s)", serie.Target)
-		transformed.QueryPatt = fmt.Sprintf("absolute(%s)", serie.QueryPatt)
-		transformed.Tags = make(map[string]string, len(serie.Tags)+1)
+		transformed.Tags = serie.CopyTagsWith("absolute", "1")
 		transformed.Datapoints = pointSlicePool.Get().([]schema.Point)
 		transformed.Interval = serie.Interval
-		transformed.Consolidator = serie.Consolidator
+		transformed.QueryPatt = fmt.Sprintf("absolute(%s)", serie.QueryPatt)
 		transformed.QueryCons = serie.QueryCons
-		transformed.Meta = serie.Meta.Copy()
+		transformed.Consolidator = serie.Consolidator
+		transformed.Meta = serie.Meta
 
-		for k, v := range serie.Tags {
-			transformed.Tags[k] = v
-		}
-		transformed.Tags["absolute"] = "1"
 		for _, p := range serie.Datapoints {
 			p.Val = math.Abs(p.Val)
 			transformed.Datapoints = append(transformed.Datapoints, p)

expr/func_aggregate.go (+3 -6)

@@ -50,10 +50,7 @@ func (s *FuncAggregate) Exec(cache map[Req][]models.Series) ([]models.Series, er
 
 	// The tags for the aggregated series is only the tags that are
 	// common to all input series
-	commonTags := make(map[string]string, len(series[0].Tags))
-	for k, v := range series[0].Tags {
-		commonTags[k] = v
-	}
+	commonTags := series[0].CopyTags()
 
 	var meta models.SeriesMeta
 
@@ -70,12 +67,12 @@ func (s *FuncAggregate) Exec(cache map[Req][]models.Series) ([]models.Series, er
 	name := s.agg.name + "Series(" + strings.Join(queryPatts, ",") + ")"
 	output := models.Series{
 		Target:       name,
-		QueryPatt:    name,
 		Tags:         commonTags,
 		Datapoints:   out,
 		Interval:     series[0].Interval,
-		Consolidator: cons,
+		QueryPatt:    name,
 		QueryCons:    queryCons,
+		Consolidator: cons,
 		Meta:         meta,
 	}
 	cache[Req{}] = append(cache[Req{}], output)

expr/func_alias.go (+1 -1)

@@ -32,7 +32,7 @@ func (s *FuncAlias) Exec(cache map[Req][]models.Series) ([]models.Series, error)
 	for i := range series {
 		series[i].Target = s.alias
 		series[i].QueryPatt = s.alias
-		series[i].Tags["name"] = s.alias
+		series[i].Tags = series[i].CopyTagsWith("name", s.alias)
 	}
 	return series, nil
 }

expr/func_aliasbynode.go (+1 -1)

@@ -33,7 +33,7 @@ func (s *FuncAliasByNode) Exec(cache map[Req][]models.Series) ([]models.Series,
 		n := aggKey(serie, s.nodes)
 		series[i].Target = n
 		series[i].QueryPatt = n
-		series[i].Tags["name"] = n
+		series[i].Tags = series[i].CopyTagsWith("name", n)
 	}
 	return series, nil
 }

expr/func_aliassub.go (+1 -1)

@@ -48,7 +48,7 @@ func (s *FuncAliasSub) Exec(cache map[Req][]models.Series) ([]models.Series, err
 		name := s.search.ReplaceAllString(metric, replace)
 		series[i].Target = name
 		series[i].QueryPatt = name
-		series[i].Tags["name"] = name
+		series[i].Tags = series[i].CopyTagsWith("name", name)
 	}
 	return series, err
 }

expr/func_aspercent.go (+5)

@@ -119,6 +119,7 @@ func (s *FuncAsPercent) execWithNodes(series, totals []models.Series, cache map[
 			nonesSerie.QueryPatt = fmt.Sprintf("asPercent(%s,MISSING)", serie1.QueryPatt)
 			nonesSerie.Target = fmt.Sprintf("asPercent(%s,MISSING)", serie1.Target)
 			nonesSerie.Tags = map[string]string{"name": nonesSerie.Target}
+			nonesSerie.Meta = serie1.Meta.Copy()
 
 			if nones == nil {
 				nones = pointSlicePool.Get().([]schema.Point)
@@ -195,6 +196,7 @@ func (s *FuncAsPercent) execWithoutNodes(series, totals []models.Series, cache m
 			}
 			serie.Datapoints[i].Val = computeAsPercent(serie.Datapoints[i].Val, totalVal)
 		}
+		serie.Meta = serie.Meta.Merge(totalsSerie.Meta)
 		outSeries = append(outSeries, serie)
 		cache[Req{}] = append(cache[Req{}], serie)
 	}
@@ -247,9 +249,11 @@ func sumSeries(series []models.Series, cache map[Req][]models.Series) models.Ser
 	out := pointSlicePool.Get().([]schema.Point)
 	crossSeriesSum(series, &out)
 	var queryPatts []string
+	var meta models.SeriesMeta
 
 Loop:
 	for _, v := range series {
+		meta = meta.Merge(v.Meta)
 		// avoid duplicates
 		for _, qp := range queryPatts {
 			if qp == v.QueryPatt {
@@ -268,6 +272,7 @@ Loop:
 		Consolidator: cons,
 		QueryCons:    queryCons,
 		Tags:         map[string]string{"name": name},
+		Meta:         meta,
 	}
 	cache[Req{}] = append(cache[Req{}], sum)
 	return sum

expr/func_countseries.go (+6)

@@ -45,6 +45,11 @@ func (s *FuncCountSeries) Exec(cache map[Req][]models.Series) ([]models.Series,
 		out = append(out, p)
 	}
 
+	var meta models.SeriesMeta
+	for _, s := range series {
+		meta = meta.Merge(s.Meta)
+	}
+
 	output := models.Series{
 		Target:    name,
 		QueryPatt: name,
@@ -53,6 +58,7 @@ func (s *FuncCountSeries) Exec(cache map[Req][]models.Series) ([]models.Series,
 		Interval:     series[0].Interval,
 		Consolidator: cons,
 		QueryCons:    queryCons,
+		Meta:         meta,
 	}
 	cache[Req{}] = append(cache[Req{}], output)
 
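
The merge loop added above is the same merge-over-inputs pattern that recurs in sumSeries and the other aggregating functions in this commit. A hypothetical helper capturing it (not part of the commit; assumes only the `SeriesMeta.Merge` signature used throughout these diffs):

    // mergeSeriesMeta folds the meta sections of all input series into one,
    // suitable for the Meta field of an aggregated output series. Starting
    // from the zero value works because Merge returns the combined SeriesMeta.
    func mergeSeriesMeta(series []models.Series) models.SeriesMeta {
        var meta models.SeriesMeta
        for _, s := range series {
            meta = meta.Merge(s.Meta)
        }
        return meta
    }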

expr/func_derivative.go (+1 -7)

@@ -34,16 +34,10 @@ func (s *FuncDerivative) Exec(cache map[Req][]models.Series) ([]models.Series, e
 	outSeries := make([]models.Series, len(series))
 	for i, serie := range series {
 		serie.Target = fmt.Sprintf("derivative(%s)", serie.Target)
+		serie.Tags = serie.CopyTagsWith("derivative", "1")
 		serie.QueryPatt = fmt.Sprintf("derivative(%s)", serie.QueryPatt)
 		out := pointSlicePool.Get().([]schema.Point)
 
-		newTags := make(map[string]string, len(serie.Tags)+1)
-		for k, v := range serie.Tags {
-			newTags[k] = v
-		}
-		newTags["derivative"] = "1"
-		serie.Tags = newTags
-
 		prev := math.NaN()
 		for _, p := range serie.Datapoints {
 			val := p.Val

expr/func_divideseries.go (+1)

@@ -67,6 +67,7 @@ func (s *FuncDivideSeries) Exec(cache map[Req][]models.Series) ([]models.Series,
 			Interval:     divisor.Interval,
 			Consolidator: dividend.Consolidator,
 			QueryCons:    dividend.QueryCons,
+			Meta:         dividend.Meta.Copy().Merge(divisor.Meta),
 		}
 		cache[Req{}] = append(cache[Req{}], output)
 		series = append(series, output)

expr/func_divideserieslists.go (+1)

@@ -67,6 +67,7 @@ func (s *FuncDivideSeriesLists) Exec(cache map[Req][]models.Series) ([]models.Se
 			Interval:     divisor.Interval,
 			Consolidator: dividend.Consolidator,
 			QueryCons:    dividend.QueryCons,
+			Meta:         dividend.Meta.Copy().Merge(divisor.Meta),
 		}
 		cache[Req{}] = append(cache[Req{}], output)
 		series = append(series, output)

expr/func_get.go (+2)

@@ -24,6 +24,8 @@ func (s FuncGet) Context(context Context) Context {
 func (s FuncGet) Exec(cache map[Req][]models.Series) ([]models.Series, error) {
 	series := cache[s.req]
 
+	// this function is the only exception to the COW pattern
+	// it is allowed to modify the series directly to set the needed tags
 	for k := range series {
 		series[k].SetTags()
 	}

expr/func_groupbytags.go (+13 -5)

@@ -45,7 +45,11 @@ func (s *FuncGroupByTags) Exec(cache map[Req][]models.Series) ([]models.Series,
 		return nil, errors.New("No tags specified")
 	}
 
-	groups := make(map[string][]models.Series)
+	type Group struct {
+		s []models.Series
+		m models.SeriesMeta
+	}
+	groups := make(map[string]Group)
 	useName := false
 
 	groupTags := s.tags
@@ -99,28 +103,32 @@ func (s *FuncGroupByTags) Exec(cache map[Req][]models.Series) ([]models.Series,
 
 		key := buffer.String()
 
-		groups[key] = append(groups[key], serie)
+		group := groups[key]
+		group.s = append(group.s, serie)
+		group.m = group.m.Merge(serie.Meta)
+		groups[key] = group
 	}
 
 	output := make([]models.Series, 0, len(groups))
 	aggFunc := getCrossSeriesAggFunc(s.aggregator)
 
 	// Now, for each key perform the requested aggregation
-	for name, groupSeries := range groups {
-		cons, queryCons := summarizeCons(groupSeries)
+	for name, group := range groups {
+		cons, queryCons := summarizeCons(group.s)
 
 		newSeries := models.Series{
 			Target:       name,
 			QueryPatt:    name,
 			Interval:     series[0].Interval,
 			Consolidator: cons,
 			QueryCons:    queryCons,
+			Meta:         group.m,
 		}
 		newSeries.SetTags()
 
 		newSeries.Datapoints = pointSlicePool.Get().([]schema.Point)
 
-		aggFunc(groupSeries, &newSeries.Datapoints)
+		aggFunc(group.s, &newSeries.Datapoints)
 		cache[Req{}] = append(cache[Req{}], newSeries)
 
 		output = append(output, newSeries)

expr/func_integral.go (+3 -7)

@@ -35,17 +35,13 @@ func (s *FuncIntegral) Exec(cache map[Req][]models.Series) ([]models.Series, err
 	for i, serie := range series {
 		transformed := &out[i]
 		transformed.Target = fmt.Sprintf("integral(%s)", serie.Target)
-		transformed.QueryPatt = fmt.Sprintf("integral(%s)", serie.QueryPatt)
-		transformed.Tags = make(map[string]string, len(serie.Tags)+1)
+		transformed.Tags = serie.CopyTagsWith("integral", "1")
 		transformed.Datapoints = pointSlicePool.Get().([]schema.Point)
+		transformed.QueryPatt = fmt.Sprintf("integral(%s)", serie.QueryPatt)
 		transformed.Interval = serie.Interval
 		transformed.Consolidator = serie.Consolidator
 		transformed.QueryCons = serie.QueryCons
-
-		for k, v := range serie.Tags {
-			transformed.Tags[k] = v
-		}
-		transformed.Tags["integral"] = "1"
+		transformed.Meta = serie.Meta
 
 		current := 0.0
 		for _, p := range serie.Datapoints {

expr/func_isnonnull.go (+2 -5)

@@ -36,16 +36,13 @@ func (s *FuncIsNonNull) Exec(cache map[Req][]models.Series) ([]models.Series, er
 		transformed := &out[i]
 		transformed.Target = fmt.Sprintf("isNonNull(%s)", serie.Target)
 		transformed.QueryPatt = fmt.Sprintf("isNonNull(%s)", serie.QueryPatt)
-		transformed.Tags = make(map[string]string, len(serie.Tags)+1)
+		transformed.Tags = serie.CopyTagsWith("isNonNull", "1")
 		transformed.Datapoints = pointSlicePool.Get().([]schema.Point)
 		transformed.Interval = serie.Interval
 		transformed.Consolidator = serie.Consolidator
 		transformed.QueryCons = serie.QueryCons
+		transformed.Meta = serie.Meta.Copy()
 
-		for k, v := range serie.Tags {
-			transformed.Tags[k] = v
-		}
-		transformed.Tags["isNonNull"] = "1"
 		for _, p := range serie.Datapoints {
 			if math.IsNaN(p.Val) {
 				p.Val = 0

expr/func_keeplastvalue.go (+5 -3)

@@ -46,10 +46,12 @@ func (s *FuncKeepLastValue) Exec(cache map[Req][]models.Series) ([]models.Series
 	}
 	limit := int(s.limit)
 	outSeries := make([]models.Series, len(series))
-	for i, serie := range series {
-		serie.Target = fmt.Sprintf("keepLastValue(%s)", serie.Target)
+	for i, in := range series {
+		serie := in.CopyBare()
+		serie.Target = fmt.Sprintf("keepLastValue(%s)", in.Target)
 		serie.QueryPatt = serie.Target
-
+		serie.Tags = in.Tags
+		serie.Meta = in.Meta
 		out := pointSlicePool.Get().([]schema.Point)
 
 		var consecutiveNaNs int
