Commit 5b78a26
Merge pull request #1926 from grafana/maxSeries-non-tagged
apply max-series-per-req to non-tagged queries
2 parents: 8f037dd + 596df84

17 files changed: +250 -115 lines

api/ccache.go

+1 -1

@@ -49,7 +49,7 @@ func (s *Server) ccacheDelete(ctx *middleware.Context, req models.CCacheDelete)
 	var toClear []idx.Node
 	if len(req.Patterns) > 0 {
 		for _, pattern := range req.Patterns {
-			nodes, err := s.MetricIndex.Find(req.OrgId, pattern, 0)
+			nodes, err := s.MetricIndex.Find(req.OrgId, pattern, 0, 0)
 			if err != nil {
 				res.AddError(err)
 				code = http.StatusInternalServerError

api/cluster.go

+38 -18

@@ -128,7 +128,7 @@ func (s *Server) indexFind(ctx *middleware.Context, req models.IndexFind) {
 	}
 
 	for _, pattern := range req.Patterns {
-		nodes, err := s.MetricIndex.Find(req.OrgId, pattern, req.From)
+		nodes, err := s.MetricIndex.Find(req.OrgId, pattern, req.From, req.Limit)
 		if err != nil {
 			response.Write(ctx, response.WrapError(err))
 			return
@@ -471,20 +471,18 @@ func (s *Server) queryAllPeers(ctx context.Context, data cluster.Traceable, name
 	return result, errors
 }
 
-// queryAllShards takes a request and the path to request it on, then fans it out
+// queryAllShards takes a function and calls it for one peer in each shard
 // across the cluster. If any peer fails, we try another replica. If enough
 // peers have been heard from (based on speculation-threshold configuration), and we
 // are missing the others, try to speculatively query other members of the shard group.
+// all responses are collected and returned at once.
 // ctx: request context
-// data: request to be submitted
 // name: name to be used in logging & tracing
-// path: path to request on
-func (s *Server) queryAllShards(ctx context.Context, data cluster.Traceable, name, path string) (map[string]PeerResponse, error) {
+// fetchFunc: function to call to fetch the data from a peer
+func (s *Server) queryAllShards(ctx context.Context, name string, fetchFn fetchFunc) (map[string]PeerResponse, error) {
 	result := make(map[string]PeerResponse)
 
-	responseChan, errorChan := s.queryAllShardsGeneric(ctx, name, func(reqCtx context.Context, peer cluster.Node) (interface{}, error) {
-		return peer.Post(reqCtx, name, path, data)
-	})
+	responseChan, errorChan := s.queryAllShardsGeneric(ctx, name, fetchFn)
 
 	for resp := range responseChan {
 		result[resp.peer.GetName()] = PeerResponse{
@@ -501,9 +499,11 @@ func (s *Server) queryAllShards(ctx context.Context, data cluster.Traceable, nam
 // across the cluster. If any peer fails, we try another replica. If enough
 // peers have been heard from (based on speculation-threshold configuration), and we
 // are missing the others, try to speculatively query other members of the shard group.
+// all responses and errors are streamed through the returned channels
 // ctx: request context
+// name: name to be used in logging & tracing
 // fetchFunc: function to call to fetch the data from a peer
-func (s *Server) queryAllShardsGeneric(ctx context.Context, name string, fetchFunc func(context.Context, cluster.Node) (interface{}, error)) (<-chan GenericPeerResponse, <-chan error) {
+func (s *Server) queryAllShardsGeneric(ctx context.Context, name string, fetchFn fetchFunc) (<-chan GenericPeerResponse, <-chan error) {
 	peerGroups, err := cluster.MembersForSpeculativeQuery()
 	if err != nil {
 		log.Errorf("HTTP peerQuery unable to get peers, %s", err.Error())
@@ -513,10 +513,22 @@ func (s *Server) queryAllShardsGeneric(ctx context.Context, name string, fetchFu
 		return resultChan, errorChan
 	}
 
-	return queryPeers(ctx, peerGroups, name, fetchFunc)
+	return queryPeers(ctx, peerGroups, name, fetchFn)
 }
 
-type fetchFunc func(context.Context, cluster.Node) (interface{}, error)
+// fetchFunc is a function to query the given cluster.Node
+// the list of all nodes in the cluster is passed as well for additional context
+// Example: fetchFunc can use this to determine the ratio of how much data the target peer owns
+// compared to the cluster as a whole. Caveat: this is based on live cluster state. If shardgroups
+// go completely down it'll look like the target peer owns more of the cluster than it actually does.
+// if query limits are set based on this, the limits would loosen up as shards leave the cluster.
+type fetchFunc func(context.Context, cluster.Node, map[int32][]cluster.Node) (interface{}, error)
+
+func fetchFuncPost(data cluster.Traceable, name, path string) fetchFunc {
+	return func(reqCtx context.Context, peer cluster.Node, peerGroups map[int32][]cluster.Node) (interface{}, error) {
+		return peer.Post(reqCtx, name, path, data)
+	}
+}
 
 type shardResponse struct {
 	shardGroup int32
@@ -532,20 +544,20 @@ type shardState struct {
 }
 
 // AskPeer issues the query on the next peer, if available, and returns it
-func (state *shardState) AskPeer(ctx context.Context, fn fetchFunc, responses chan shardResponse) (cluster.Node, bool) {
+func (state *shardState) AskPeer(ctx context.Context, peerGroups map[int32][]cluster.Node, fn fetchFunc, responses chan shardResponse) (cluster.Node, bool) {
 	if len(state.remainingPeers) == 0 {
 		return nil, false
 	}
 	peer := state.remainingPeers[0]
 	state.remainingPeers = state.remainingPeers[1:]
 	state.inflight++
-	go state.askPeer(ctx, peer, fn, responses)
+	go state.askPeer(ctx, peerGroups, peer, fn, responses)
 	return peer, true
 }
 
-func (state *shardState) askPeer(ctx context.Context, peer cluster.Node, fetchFn fetchFunc, responses chan shardResponse) {
+func (state *shardState) askPeer(ctx context.Context, peerGroups map[int32][]cluster.Node, peer cluster.Node, fetchFn fetchFunc, responses chan shardResponse) {
 	//log.Debugf("HTTP Render querying %s%s", peer.GetName(), path)
-	resp, err := fetchFn(ctx, peer)
+	resp, err := fetchFn(ctx, peer, peerGroups)
 	select {
 	case <-ctx.Done():
 		return
@@ -593,7 +605,7 @@ func queryPeers(ctx context.Context, peerGroups map[int32][]cluster.Node, name s
 			shard:          shard,
 			remainingPeers: peers,
 		}
-		peer, _ := state.AskPeer(reqCtx, fetchFn, responses) // thanks to the above check we always know there was a peer available
+		peer, _ := state.AskPeer(reqCtx, peerGroups, fetchFn, responses) // thanks to the above check we always know there was a peer available
 		originalPeers[peer.GetName()] = struct{}{}
 		states[shard] = state
 	}
@@ -621,8 +633,16 @@ func queryPeers(ctx context.Context, peerGroups map[int32][]cluster.Node, name s
 		}
 
 		if resp.err != nil {
+			if resp.err.Error() == "400 Bad Request" {
+				// if we got bad request, then retrying it on a different replica will result in the same
+				// Cancel the reqCtx, which will cancel all in-flight requests.
+				cancel()
+				errorChan <- resp.err
+				return
+			}
+
			// if we can try another peer for this shardGroup, do it
-			_, ok := states[resp.shardGroup].AskPeer(reqCtx, fetchFn, responses)
+			_, ok := states[resp.shardGroup].AskPeer(reqCtx, peerGroups, fetchFn, responses)
 			if ok {
 				speculativeRequests.Inc()
 				continue
@@ -656,7 +676,7 @@ func queryPeers(ctx context.Context, peerGroups map[int32][]cluster.Node, name s
 			continue
 		}
 
-		if _, ok := states[shardGroup].AskPeer(specCtx, fetchFn, responses); ok {
+		if _, ok := states[shardGroup].AskPeer(specCtx, peerGroups, fetchFn, responses); ok {
 			speculativeRequests.Inc()
 		}
 	}
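The comment on the new fetchFunc type warns that partition ownership is computed from live cluster state, so limits derived from it loosen as shards leave. A small standalone sketch (all figures invented) to make that concrete:

```go
package main

import "fmt"

func main() {
	const maxSeries = 1200
	ourParts := 8 // partitions owned by the peer we are about to query

	// healthy cluster: 4 shardgroups x 8 partitions each -> the peer owns 1/4
	totalParts := 32
	fmt.Println(maxSeries * ourParts / totalParts) // 300

	// one shardgroup completely down: it vanishes from peerGroups, so the
	// same peer now appears to own 1/3 and its per-peer limit loosens up
	totalParts = 24
	fmt.Println(maxSeries * ourParts / totalParts) // 400
}
```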

api/dataprocessor.go

+1 -1

@@ -239,7 +239,7 @@ func (s *Server) getTargetsRemote(ctx context.Context, ss *models.StorageStats,
 	rCtx, cancel := context.WithCancel(ctx)
 	defer cancel()
 
-	resultChan, errorChan := queryPeers(rCtx, requiredPeers, "getTargetsRemote", func(ctx context.Context, node cluster.Node) (interface{}, error) {
+	resultChan, errorChan := queryPeers(rCtx, requiredPeers, "getTargetsRemote", func(ctx context.Context, node cluster.Node, peerGroups map[int32][]cluster.Node) (interface{}, error) {
 		var resp models.GetDataRespV1
 		reqs, ok := shardReqs[node.GetPartitions()[0]]
 		if !ok {

api/graphite.go

+45 -16

@@ -100,14 +100,29 @@ type Series struct {
 	Node    cluster.Node
 }
 
-func (s *Server) findSeries(ctx context.Context, orgId uint32, patterns []string, seenAfter int64) ([]Series, error) {
-	data := models.IndexFind{
-		Patterns: patterns,
-		OrgId:    orgId,
-		From:     seenAfter,
+func (s *Server) findSeries(ctx context.Context, orgId uint32, patterns []string, seenAfter int64, maxSeries int) ([]Series, error) {
+
+	fetchFn := func(reqCtx context.Context, peer cluster.Node, peerGroups map[int32][]cluster.Node) (interface{}, error) {
+		ourParts := len(peer.GetPartitions())
+
+		// assign a fractional maxSeries limit (not global, but relative to how much data the peer has)
+		// look at each shardgroup and check how many partitions it has
+		// (we assume each shardgroup is consistent across different peers for that shardgroup)
+		var totalParts int
+		for _, otherPeers := range peerGroups {
+			if len(otherPeers) > 0 {
+				totalParts += len(otherPeers[0].GetPartitions())
+			}
+		}
+		data := models.IndexFind{
+			Patterns: patterns,
+			OrgId:    orgId,
+			From:     seenAfter,
+			Limit:    int64(maxSeries * ourParts / totalParts),
+		}
+		return peer.Post(reqCtx, "findSeriesRemote", "/index/find", data)
 	}
-
-	resps, err := s.queryAllShards(ctx, data, "findSeriesRemote", "/index/find")
+	resps, err := s.queryAllShards(ctx, "findSeriesRemote", fetchFn)
 	if err != nil {
 		return nil, err
 	}
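As a worked example of the fractional limit (figures invented): with maxSeries = 800 in a cluster of four shardgroups of eight partitions each (totalParts = 32), each shardgroup's peer is asked for at most 800 × 8 / 32 = 200 series, i.e. the global budget is split in proportion to the data each shard owns. Integer division truncates, so very small budgets spread over many shards round down.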
@@ -121,12 +136,23 @@ func (s *Server) findSeries(ctx context.Context, orgId uint32, patterns []string
 	series := make([]Series, 0)
 	resp := models.IndexFindResp{}
 	for _, r := range resps {
+		if len(series) == maxSeries {
+			return nil, response.NewError(
+				http.StatusRequestEntityTooLarge,
+				fmt.Sprintf("Request exceeds max-series-per-req limit (%d). Reduce the number of targets or ask your admin to increase the limit.", maxSeriesPerReq))
+		}
 		_, err = resp.UnmarshalMsg(r.buf)
 		if err != nil {
 			return nil, err
 		}
 
 		for pattern, nodes := range resp.Nodes {
+			if len(series) == maxSeries {
+				return nil, response.NewError(
+					http.StatusRequestEntityTooLarge,
+					fmt.Sprintf("Request exceeds max-series-per-req limit (%d). Reduce the number of targets or ask your admin to increase the limit.", maxSeriesPerReq))
+			}
+
 			series = append(series, Series{
 				Pattern: pattern,
 				Node:    r.peer,
@@ -330,7 +356,7 @@ func (s *Server) metricsFind(ctx *middleware.Context, request models.GraphiteFin
 	}
 	nodes := make([]idx.Node, 0)
 	reqCtx := ctx.Req.Context()
-	series, err := s.findSeries(reqCtx, ctx.OrgId, []string{request.Query}, int64(fromUnix))
+	series, err := s.findSeries(reqCtx, ctx.OrgId, []string{request.Query}, int64(fromUnix), maxSeriesPerReq)
 	if err != nil {
 		response.Write(ctx, response.WrapError(err))
 		return
@@ -378,7 +404,7 @@ func (s *Server) metricsExpand(ctx *middleware.Context, request models.GraphiteE
 	for i, query := range request.Query {
 		i, query := i, query
 		g.Go(func() error {
-			series, err := s.findSeries(errGroupCtx, ctx.OrgId, []string{query}, 0)
+			series, err := s.findSeries(errGroupCtx, ctx.OrgId, []string{query}, 0, maxSeriesPerReq)
 			if err != nil {
 				return err
 			}
@@ -795,7 +821,10 @@ func (s *Server) executePlan(ctx context.Context, orgId uint32, plan *expr.Plan)
 		}
 		series, err = s.clusterFindByTag(ctx, orgId, exprs, int64(r.From), maxSeriesPerReq-int(reqs.cnt), false)
 	} else {
-		series, err = s.findSeries(ctx, orgId, []string{r.Query}, int64(r.From))
+		// find limit is the limit minus what we already consumed for other targets, adjusted for how many rawReqs we folded into this resolveSeriesRequest
+		// note that this doesn't account for duplicate requests like target=foo&target=foo because those are both represented by the same rawReq (see NewPlan)
+		findLimit := (maxSeriesPerReq - int(reqs.cnt)) / len(rawReqs)
+		series, err = s.findSeries(ctx, orgId, []string{r.Query}, int64(r.From), findLimit)
 	}
 	if err != nil {
 		return nil, meta, err
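A quick worked example of this budgeting (figures invented): with maxSeriesPerReq = 1000, tag-based targets having already consumed reqs.cnt = 200 series, and len(rawReqs) = 4 raw requests folded into the plan, each findSeries call gets findLimit = (1000 − 200) / 4 = 200, which findSeries then subdivides across shards in proportion to partition ownership as shown earlier.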
@@ -1046,7 +1075,7 @@ func (s *Server) clusterTagDetails(ctx context.Context, orgId uint32, tag, filte
 	result := make(map[string]uint64)
 
 	data := models.IndexTagDetails{OrgId: orgId, Tag: tag, Filter: filter}
-	resps, err := s.queryAllShards(ctx, data, "clusterTagDetails", "/index/tag_details")
+	resps, err := s.queryAllShards(ctx, "clusterTagDetails", fetchFuncPost(data, "clusterTagDetails", "/index/tag_details"))
 	if err != nil {
 		return nil, err
 	}
@@ -1147,7 +1176,7 @@ func (s *Server) clusterFindByTag(ctx context.Context, orgId uint32, expressions
 	newCtx, cancel := context.WithCancel(ctx)
 	defer cancel()
 	responseChan, errorChan := s.queryAllShardsGeneric(newCtx, "clusterFindByTag",
-		func(reqCtx context.Context, peer cluster.Node) (interface{}, error) {
+		func(reqCtx context.Context, peer cluster.Node, peerGroups map[int32][]cluster.Node) (interface{}, error) {
 			resp := models.IndexFindByTagResp{}
 			body, err := peer.PostRaw(reqCtx, "clusterFindByTag", "/index/find_by_tag", data)
 			if body == nil || err != nil {
@@ -1222,7 +1251,7 @@ func (s *Server) graphiteTags(ctx *middleware.Context, request models.GraphiteTa
 
 func (s *Server) clusterTags(ctx context.Context, orgId uint32, filter string) ([]string, error) {
 	data := models.IndexTags{OrgId: orgId, Filter: filter}
-	resps, err := s.queryAllShards(ctx, data, "clusterTags", "/index/tags")
+	resps, err := s.queryAllShards(ctx, "clusterTags", fetchFuncPost(data, "clusterTags", "/index/tags"))
 	if err != nil {
 		return nil, err
 	}
@@ -1274,7 +1303,7 @@ func (s *Server) clusterAutoCompleteTags(ctx context.Context, orgId uint32, pref
 	tagSet := make(map[string]struct{})
 
 	data := models.IndexAutoCompleteTags{OrgId: orgId, Prefix: prefix, Expr: expressions, Limit: limit}
-	responses, err := s.queryAllShards(ctx, data, "clusterAutoCompleteTags", "/index/tags/autoComplete/tags")
+	responses, err := s.queryAllShards(ctx, "clusterAutoCompleteTags", fetchFuncPost(data, "clusterAutoCompleteTags", "/index/tags/autoComplete/tags"))
 	if err != nil {
 		return nil, err
 	}
@@ -1321,7 +1350,7 @@ func (s *Server) clusterAutoCompleteTagValues(ctx context.Context, orgId uint32,
 	valSet := make(map[string]struct{})
 
 	data := models.IndexAutoCompleteTagValues{OrgId: orgId, Tag: tag, Prefix: prefix, Expr: expressions, Limit: limit}
-	responses, err := s.queryAllShards(ctx, data, "clusterAutoCompleteValues", "/index/tags/autoComplete/values")
+	responses, err := s.queryAllShards(ctx, "clusterAutoCompleteValues", fetchFuncPost(data, "clusterAutoCompleteValues", "/index/tags/autoComplete/values"))
 	if err != nil {
 		return nil, err
 	}
@@ -1352,7 +1381,7 @@ func (s *Server) clusterAutoCompleteTagValues(ctx context.Context, orgId uint32,
 
 func (s *Server) graphiteTagTerms(ctx *middleware.Context, request models.GraphiteTagTerms) {
 	data := models.IndexTagTerms{OrgId: ctx.OrgId, Tags: request.Tags, Expr: request.Expr}
-	responses, err := s.queryAllShards(ctx.Req.Context(), data, "graphiteTagTerms", "/index/tags/terms")
+	responses, err := s.queryAllShards(ctx.Req.Context(), "graphiteTagTerms", fetchFuncPost(data, "graphiteTagTerms", "/index/tags/terms"))
 	if err != nil {
 		response.Write(ctx, response.WrapErrorForTagDB(err))
 		return

api/models/node.go

+1

@@ -166,6 +166,7 @@ type IndexFind struct {
 	Patterns []string `json:"patterns" form:"patterns" binding:"Required"`
 	OrgId    uint32   `json:"orgId" form:"orgId" binding:"Required"`
 	From     int64    `json:"from" form:"from"`
+	Limit    int64    `json:"limit"`
 }
 
 func (i IndexFind) Trace(span opentracing.Span) {
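For illustration, a caller-side sketch of the extended struct (values invented; findSeries in api/graphite.go populates it the same way):

```go
// ask a peer's index for at most 250 matching series (values invented)
data := models.IndexFind{
	Patterns: []string{"collectd.*.cpu.*"},
	OrgId:    1,
	From:     0,   // no lastUpdate cutoff
	Limit:    250, // new in this commit
}
```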

cmd-dev/mt-simulate-memory-idx-lock-contention/runner/test_runner.go

+1 -1

@@ -222,7 +222,7 @@ func (t *TestRun) runQuery(pattern string, wg *sync.WaitGroup, active chan struc
 	}()
 	pre := time.Now()
 	active <- struct{}{}
-	_, err := t.index.Find(orgID, pattern, 0)
+	_, err := t.index.Find(orgID, pattern, 0, 0)
 	if err != nil {
 		log.Printf("Warning: Query failed with error: %s", err)
 	}

idx/bigtable/bigtable.go

+2 -2

@@ -260,15 +260,15 @@ func (b *BigtableIdx) updateBigtable(now uint32, inMemory bool, archive idx.Arch
 	return archive
 }
 
-func (b *BigtableIdx) Find(orgId uint32, pattern string, from int64) ([]idx.Node, error) {
+func (b *BigtableIdx) Find(orgId uint32, pattern string, from, limit int64) ([]idx.Node, error) {
 	// The lastUpdate timestamp does not get updated in the bigtable index every time when
 	// a data point is received, there can be a delay of up to b.cfg.updateInterval32. To
 	// avoid falsely excluding a metric based on its lastUpdate timestamp we offset the
 	// from time by updateInterval32, this way we err on the "too inclusive" side
 	if from > int64(b.cfg.updateInterval32) {
 		from -= int64(b.cfg.updateInterval32)
 	}
-	return b.MemoryIndex.Find(orgId, pattern, from)
+	return b.MemoryIndex.Find(orgId, pattern, from, limit)
 }
 
 func (b *BigtableIdx) rebuildIndex() {

idx/cassandra/cassandra.go

+2 -2

@@ -297,15 +297,15 @@ func (c *CasIdx) updateCassandra(now uint32, inMemory bool, archive idx.Archive,
 	return archive
 }
 
-func (c *CasIdx) Find(orgId uint32, pattern string, from int64) ([]idx.Node, error) {
+func (c *CasIdx) Find(orgId uint32, pattern string, from, limit int64) ([]idx.Node, error) {
 	// The lastUpdate timestamp does not get updated in the cassandra index every time when
 	// a data point is received, there can be a delay of up to c.updateInterval32. To avoid
 	// falsely excluding a metric based on its lastUpdate timestamp we offset the from time
 	// by updateInterval32, this way we err on the "too inclusive" side
 	if from > int64(c.updateInterval32) {
 		from -= int64(c.updateInterval32)
 	}
-	return c.MemoryIndex.Find(orgId, pattern, from)
+	return c.MemoryIndex.Find(orgId, pattern, from, limit)
 }
 
 func (c *CasIdx) rebuildIndex() {
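The MemoryIndex.Find implementation that actually enforces the new limit argument is not part of the hunks shown here. Judging from the call sites above (which pass 0 when no cap is intended) and the new 400 Bad Request short-circuit in queryPeers, the semantics are presumably along these lines (a hypothetical sketch, not the actual implementation):

```go
// capNodes illustrates the assumed limit semantics: limit <= 0 means
// unlimited, and exceeding the limit is an error, which the serving peer
// would surface as a 400 Bad Request that queryPeers no longer retries.
func capNodes(nodes []idx.Node, limit int64) ([]idx.Node, error) {
	if limit > 0 && int64(len(nodes)) > limit {
		return nil, fmt.Errorf("find: result count %d exceeds limit %d", len(nodes), limit)
	}
	return nodes, nil
}
```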
