This repository was archived by the owner on Aug 23, 2023. It is now read-only.

Limit series from clusterByFind operation #1021

Merged. 12 commits merged on Oct 11, 2018.
api/cluster.go (113 additions, 81 deletions)
@@ -294,7 +294,7 @@ func (s *Server) peerQuery(ctx context.Context, data cluster.Traceable, name, pa
     result := make(map[string]PeerResponse)
     for resp := range responses {
         if resp.err != nil {
-            return nil, err
+            return nil, resp.err
Contributor:

    oops!
         }
         result[resp.data.peer.GetName()] = resp.data
     }
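In miniature, the bug being fixed here (a hypothetical standalone reduction, not the actual metrictank code): the function-level err is nil (or stale) at this point in the loop, so returning err instead of resp.err silently reports success on a peer failure.

package main

import (
	"errors"
	"fmt"
)

type result struct{ err error }

func collect(results []result) error {
	var err error // nil here: earlier setup succeeded
	for _, resp := range results {
		if resp.err != nil {
			return err // BUG: returns the nil outer err; should be resp.err
		}
	}
	return nil
}

func main() {
	// Prints <nil> even though a peer failed.
	fmt.Println(collect([]result{{err: errors.New("peer failed")}}))
}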
@@ -303,112 +303,144 @@
 }
 
 // peerQuerySpeculative takes a request and the path to request it on, then fans it out
-// across the cluster, except to the local peer. If any peer fails requests to
-// other peers are aborted. If enough peers have been heard from (based on
-// speculation-threshold configuration), and we are missing the others, try to
-// speculatively query each other member of each shard group.
+// across the cluster. If any peer fails requests to other peers are aborted. If enough
+// peers have been heard from (based on speculation-threshold configuration), and we
+// are missing the others, try to speculatively query each other member of each shard group.
 // ctx: request context
 // data: request to be submitted
 // name: name to be used in logging & tracing
 // path: path to request on
 func (s *Server) peerQuerySpeculative(ctx context.Context, data cluster.Traceable, name, path string) (map[string]PeerResponse, error) {
-    peerGroups, err := cluster.MembersForSpeculativeQuery()
-    if err != nil {
-        log.Errorf("HTTP peerQuery unable to get peers, %s", err.Error())
-        return nil, err
-    }
-    log.Debugf("HTTP %s across %d instances", name, len(peerGroups)-1)
+    result := make(map[string]PeerResponse)
 
-    reqCtx, cancel := context.WithCancel(ctx)
-    defer cancel()
+    responseChan, errorChan := s.peerQuerySpeculativeChan(ctx, data, name, path)
 
-    originalPeers := make(map[string]struct{}, len(peerGroups))
-    receivedResponses := make(map[int32]struct{}, len(peerGroups))
+    for resp := range responseChan {
+        result[resp.peer.GetName()] = resp
+    }
 
-    responses := make(chan struct {
-        shardGroup int32
-        data PeerResponse
-        err error
-    }, 1)
+    err := <-errorChan
+    return result, err
+}
 
-    askPeer := func(shardGroup int32, peer cluster.Node) {
-        log.Debugf("HTTP Render querying %s%s", peer.GetName(), path)
-        buf, err := peer.Post(reqCtx, name, path, data)
+// peerQuerySpeculativeChan takes a request and the path to request it on, then fans it out
+// across the cluster. If any peer fails requests to other peers are aborted. If enough
Contributor:

    I notice peerQuerySpeculative's description includes "...except to the local peer...", whereas this one does not. I think the former is wrong.
+// peers have been heard from (based on speculation-threshold configuration), and we
+// are missing the others, try to speculatively query other members of the shard group.
+// ctx: request context
+// data: request to be submitted
+// name: name to be used in logging & tracing
+// path: path to request on
+// resultChan: channel to put responses on as they come in
+func (s *Server) peerQuerySpeculativeChan(ctx context.Context, data cluster.Traceable, name, path string) (<-chan PeerResponse, <-chan error) {
+    resultChan := make(chan PeerResponse)
+    errorChan := make(chan error, 1)
 
-        select {
-        case <-ctx.Done():
-            return
-        default:
-            // Not canceled, continue
-        }
+    go func() {
+        defer close(errorChan)
+        defer close(resultChan)
Contributor:

    Minor thought: would it be cleaner/simpler to just have one return channel? We could return PeerResponse along with its error.

Collaborator Author:

    Possibly. I like the two channels, because you can just loop over responses until the channel is closed, then quickly try to read into an err var for the return value. See this usage.

Contributor:

    Fair enough.

-        if err != nil {
-            cancel()
-            log.Errorf("HTTP Render error querying %s%s: %q", peer.GetName(), path, err.Error())
-        }
-        responses <- struct {
-            shardGroup int32
-            data PeerResponse
-            err error
-        }{shardGroup, PeerResponse{peer, buf}, err}
-    }
-
-    for group, peers := range peerGroups {
-        peer := peers[0]
-        originalPeers[peer.GetName()] = struct{}{}
-        go askPeer(group, peer)
-    }
-
-    result := make(map[string]PeerResponse)
-
-    var ticker *time.Ticker
-    var tickChan <-chan time.Time
-    if speculationThreshold != 1 {
-        ticker = time.NewTicker(5 * time.Millisecond)
-        tickChan = ticker.C
-        defer ticker.Stop()
-    }
-
-    for len(receivedResponses) < len(peerGroups) {
-        select {
-        case resp := <-responses:
-            if _, ok := receivedResponses[resp.shardGroup]; ok {
-                // already received this response (possibly speculatively)
-                continue
-            }
-
-            if resp.err != nil {
-                return nil, err
-            }
-
-            result[resp.data.peer.GetName()] = resp.data
-            receivedResponses[resp.shardGroup] = struct{}{}
-            delete(originalPeers, resp.data.peer.GetName())
-
-        case <-tickChan:
-            // Check if it's time to speculate!
-            percentReceived := float64(len(receivedResponses)) / float64(len(peerGroups))
-            if percentReceived >= speculationThreshold {
-                // kick off speculative queries to other members now
-                ticker.Stop()
-                speculativeAttempts.Inc()
-                for shardGroup, peers := range peerGroups {
-                    if _, ok := receivedResponses[shardGroup]; ok {
-                        continue
-                    }
-                    eligiblePeers := peers[1:]
-                    for _, peer := range eligiblePeers {
-                        speculativeRequests.Inc()
-                        go askPeer(shardGroup, peer)
-                    }
-                }
-            }
-        }
-    }
-
-    if len(originalPeers) > 0 {
-        speculativeWins.Inc()
-    }
-
-    return result, nil
+        peerGroups, err := cluster.MembersForSpeculativeQuery()
+        if err != nil {
+            log.Errorf("HTTP peerQuery unable to get peers, %s", err.Error())
+            errorChan <- err
+            return
+        }
+        log.Debugf("HTTP %s across %d instances", name, len(peerGroups)-1)
+
+        reqCtx, cancel := context.WithCancel(ctx)
+        defer cancel()
+
+        originalPeers := make(map[string]struct{}, len(peerGroups))
+        receivedResponses := make(map[int32]struct{}, len(peerGroups))
+
+        responses := make(chan struct {
+            shardGroup int32
+            data PeerResponse
+            err error
+        }, 1)
+
+        askPeer := func(shardGroup int32, peer cluster.Node) {
+            log.Debugf("HTTP Render querying %s%s", peer.GetName(), path)
+            buf, err := peer.Post(reqCtx, name, path, data)
+
+            select {
+            case <-ctx.Done():
+                return
+            default:
+                // Not canceled, continue
+            }
+            if err != nil {
+                cancel()
+                log.Errorf("HTTP Render error querying %s%s: %q", peer.GetName(), path, err)
+            }
+            responses <- struct {
+                shardGroup int32
+                data PeerResponse
+                err error
+            }{shardGroup, PeerResponse{peer, buf}, err}
+        }
+
+        for group, peers := range peerGroups {
+            peer := peers[0]
+            originalPeers[peer.GetName()] = struct{}{}
+            go askPeer(group, peer)
+        }
+
+        var ticker *time.Ticker
+        var tickChan <-chan time.Time
+        if speculationThreshold != 1 {
+            ticker = time.NewTicker(5 * time.Millisecond)
+            tickChan = ticker.C
+            defer ticker.Stop()
+        }
+
+        for len(receivedResponses) < len(peerGroups) {
+            select {
+            case <-ctx.Done():
+                //request canceled
+                return
+            case resp := <-responses:
+                if _, ok := receivedResponses[resp.shardGroup]; ok {
+                    // already received this response (possibly speculatively)
+                    continue
+                }
+
+                if resp.err != nil {
+                    errorChan <- resp.err
+                    return
+                }
+
+                resultChan <- resp.data
+                receivedResponses[resp.shardGroup] = struct{}{}
+                delete(originalPeers, resp.data.peer.GetName())
+
+            case <-tickChan:
+                // Check if it's time to speculate!
+                percentReceived := float64(len(receivedResponses)) / float64(len(peerGroups))
+                if percentReceived >= speculationThreshold {
+                    // kick off speculative queries to other members now
+                    ticker.Stop()
+                    speculativeAttempts.Inc()
+                    for shardGroup, peers := range peerGroups {
+                        if _, ok := receivedResponses[shardGroup]; ok {
+                            continue
+                        }
+                        eligiblePeers := peers[1:]
+                        for _, peer := range eligiblePeers {
+                            speculativeRequests.Inc()
+                            go askPeer(shardGroup, peer)
+                        }
+                    }
+                }
+            }
+        }
+
+        if len(originalPeers) > 0 {
+            speculativeWins.Inc()
+        }
+    }()
+
+    return resultChan, errorChan
 }
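As a side note on the speculation mechanics above: every 5 ms the loop checks whether the fraction of shard groups that have answered has reached speculation-threshold; if so, redundant queries go out to the other members of the still-missing groups. A standalone sketch of just that trigger condition, with made-up numbers:

package main

import "fmt"

func main() {
	speculationThreshold := 0.75 // e.g. a speculation-threshold setting
	peerGroups := 4              // shard groups being queried
	for received := 0; received <= peerGroups; received++ {
		percentReceived := float64(received) / float64(peerGroups)
		// With these numbers, speculation kicks in once 3 of 4 groups answered.
		fmt.Printf("received %d/%d (%.2f): speculate=%v\n",
			received, peerGroups, percentReceived,
			percentReceived >= speculationThreshold)
	}
}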
api/config.go (2 additions, 0 deletions)
@@ -15,6 +15,7 @@ import (
 var (
     maxPointsPerReqSoft int
     maxPointsPerReqHard int
+    maxSeriesPerReq int
     logMinDurStr string
     logMinDur uint32

@@ -39,6 +40,7 @@ func ConfigSetup() {
     apiCfg := flag.NewFlagSet("http", flag.ExitOnError)
     apiCfg.IntVar(&maxPointsPerReqSoft, "max-points-per-req-soft", 1000000, "lower resolution rollups will be used to try and keep requests below this number of datapoints. (0 disables limit)")
     apiCfg.IntVar(&maxPointsPerReqHard, "max-points-per-req-hard", 20000000, "limit of number of datapoints a request can return. Requests that exceed this limit will be rejected. (0 disables limit)")
+    apiCfg.IntVar(&maxSeriesPerReq, "max-series-per-req", 250000, "limit of number of series a request can operate on. Requests that exceed this limit will be rejected. (0 disables limit)")
     apiCfg.StringVar(&logMinDurStr, "log-min-dur", "5min", "only log incoming requests if their timerange is at least this duration. Use 0 to disable")
 
     apiCfg.StringVar(&Addr, "listen", ":6060", "http listener address.")
api/graphite.go (21 additions, 17 deletions)
@@ -3,6 +3,7 @@ package api
 import (
     "context"
     "errors"
+    "fmt"
     "math"
     "net/http"
     "sort"
@@ -607,7 +608,7 @@ func (s *Server) executePlan(ctx context.Context, orgId uint32, plan expr.Plan)
     for i, e := range exprs {
         exprs[i] = strings.Trim(e, " '\"")
     }
-    series, err = s.clusterFindByTag(ctx, orgId, exprs, int64(r.From))
+    series, err = s.clusterFindByTag(ctx, orgId, exprs, int64(r.From), maxSeriesPerReq-len(reqs))
 } else {
     series, err = s.findSeries(ctx, orgId, []string{r.Query}, int64(r.From))
 }
@@ -798,7 +799,7 @@ func (s *Server) clusterTagDetails(ctx context.Context, orgId uint32, tag, filte
 
 func (s *Server) graphiteTagFindSeries(ctx *middleware.Context, request models.GraphiteTagFindSeries) {
     reqCtx := ctx.Req.Context()
-    series, err := s.clusterFindByTag(reqCtx, ctx.OrgId, request.Expr, request.From)
+    series, err := s.clusterFindByTag(reqCtx, ctx.OrgId, request.Expr, request.From, maxSeriesPerReq)
     if err != nil {
         response.Write(ctx, response.WrapError(err))
         return
@@ -818,28 +819,30 @@ func (s *Server) graphiteTagFindSeries(ctx *middleware.Context, request models.G
     response.Write(ctx, response.NewJson(200, seriesNames, ""))
 }

-func (s *Server) clusterFindByTag(ctx context.Context, orgId uint32, expressions []string, from int64) ([]Series, error) {
+func (s *Server) clusterFindByTag(ctx context.Context, orgId uint32, expressions []string, from int64, maxSeries int) ([]Series, error) {
     data := models.IndexFindByTag{OrgId: orgId, Expr: expressions, From: from}
-    resps, err := s.peerQuerySpeculative(ctx, data, "clusterFindByTag", "/index/find_by_tag")
-    if err != nil {
-        return nil, err
-    }
-
-    select {
-    case <-ctx.Done():
-        //request canceled
-        return nil, nil
-    default:
-    }
+    newCtx, cancel := context.WithCancel(ctx)
+    responseChan, errorChan := s.peerQuerySpeculativeChan(newCtx, data, "clusterFindByTag", "/index/find_by_tag")
 
     var allSeries []Series
 
-    for _, r := range resps {
+    for r := range responseChan {
         resp := models.IndexFindByTagResp{}
-        _, err = resp.UnmarshalMsg(r.buf)
+        _, err := resp.UnmarshalMsg(r.buf)
         if err != nil {
+            cancel()
Collaborator Author:

    Would it make sense to just defer cancel() at the top like the peerQuery functions do?

    Also, how much does this canceling really buy us if the top-level ctx will be Done when the error is returned?

Contributor (@Dieterbe, Oct 10, 2018):

    Not sure what you mean with a ctx "being Done". ctx.Done() just returns a channel that signals cancellation (by getting closed), but you still need to call a corresponding cancel function for the cancellation to happen.

    I see 3 callers of this function:

      • /render -> renderMetrics -> executePlan
      • /tags/findSeries (graphiteTagFindSeries)
      • querier.Select

    We would need all 3 to call a cancel func for my change to be unnecessary, but from what I can tell, neither of them, or at least not the first two, does this (unless Macaron automatically calls cancel on our behalf?).

    Regardless, it shouldn't take much code spelunking for a reader to assert whether we cancel a context when we should, so I'd rather call cancel wherever it seems sensible, even if we call it multiple times (which is harmless). I think @woodsaj was trying to establish this convention when he first added cancellations throughout this code base.

    As for using defer vs just putting the code at the return sites: for short functions it's really debatable and depends more on taste, so unless you have a strong argument, let's just stick with it.

Collaborator Author:

    Re: Done

    I mean that peerQuerySpeculativeChan checks if the context is Done(): https://github.com/grafana/metrictank/pull/1021/files#diff-bc8a656be21edce0cf2a74adf23d7aeaR401

    Done isn't just done on cancel, but also on a final response being returned. So, as soon as the error propagates up and a response is delivered, peerQuerySpeculativeChan will break its loop and call cancel(). This is done specifically so callers don't need to implement cancellation themselves. Granted, doing it directly will cancel a few milliseconds earlier, so might be marginally cheaper.

Contributor:

    The cancels that I added in clusterFindByTag target cases that peerQuerySpeculativeChan cannot detect itself. These are:

      • an unmarshalling error
      • exceeding maxSeriesPerReq

    The cancels triggered in peerQuerySpeculativeChan (upon an erroring peer.Post and upon function return) would be triggered not at all, or much later, respectively.

Collaborator Author:

    > the cancels that I added in clusterFindByTag target cases that peerQuerySpeculativeChan cannot detect itself.

    True, but as long as these failures cause an error response to the HTTP request, the parent context's Done channel will be closed and peerQuerySpeculativeChan will short-circuit and call cancel on the context it created.

    This is the case in the 3 examples you posted, but I guess it's too much for clusterFindByTag to assume?

Contributor:

    > the parent context's Done channel will be closed

    Where does this happen?

Collaborator Author:

    TBH, I didn't come to this conclusion by code inspection, but rather by logging when early responses happened. It definitely was how it behaved, but I couldn't tell you if it's by contract or a symptom of how I was querying.

Contributor:

    I see. Well, this reinforces my point that these matters should be made much more obvious, hence rather one call to cancel too many than too few.

    Good to merge?

Collaborator Author:

    Yeah, let me rebase.
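A standalone sketch of the context semantics debated above: Done() only closes once a CancelFunc is called (or a parent is canceled or times out); merely returning an error from a handler does not by itself cancel anything. Names and values here are illustrative.

package main

import (
	"context"
	"fmt"
)

func main() {
	parent := context.Background()

	// Derive a cancelable child, as clusterFindByTag does with
	// newCtx, cancel := context.WithCancel(ctx).
	child, cancel := context.WithCancel(parent)

	select {
	case <-child.Done():
		fmt.Println("done before cancel") // never happens
	default:
		fmt.Println("not done yet: no one has called cancel()")
	}

	cancel() // e.g. on an unmarshal error, or when the series limit is exceeded

	<-child.Done()                            // now closed
	fmt.Println("after cancel:", child.Err()) // context.Canceled
}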

             return nil, err
         }
 
+        // 0 disables the check, so only check if maxSeriesPerReq > 0
+        if maxSeriesPerReq > 0 && len(resp.Metrics)+len(allSeries) > maxSeries {
+            cancel()
+            return nil,
Member:

    Shouldn't you cancel ctx so outstanding peer queries are aborted?

Collaborator Author (@shanson7, Aug 28, 2018):

    Technically, the query will be Doned when the error propagates up and a response is returned. But canceling it here could do it a little bit earlier.

Collaborator Author:

    I'm also not entirely sure how to cancel this context here. I suppose we would need to make a new cancelable context.

Contributor:

    That makes sense, I think. Since we only get the ctx passed in, we don't know what the cancel func was. However, if you do something like newCtx, cancel := context.WithCancel(ctx) and pass newCtx into peerQuerySpeculativeChan, then you can cancel it.
+                response.NewError(
+                    http.StatusRequestEntityTooLarge,
+                    fmt.Sprintf("Request exceeds max-series-per-req limit (%d). Reduce the number of targets or ask your admin to increase the limit.", maxSeriesPerReq))
+        }
 
         for _, series := range resp.Metrics {
             allSeries = append(allSeries, Series{
                 Pattern: series.Path,
@@ -849,7 +852,8 @@ func (s *Server) clusterFindByTag(ctx context.Context, orgId uint32, expressions
         }
     }
 
-    return allSeries, nil
+    err := <-errorChan
+    return allSeries, err
 }
 
 func (s *Server) graphiteTags(ctx *middleware.Context, request models.GraphiteTags) {
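The consumer-side early exit that clusterFindByTag now implements can be reduced to a minimal, self-contained sketch (illustrative names and numbers): results stream in over a channel, a running total enforces the limit, and cancel() tells the producer to stop early.

package main

import (
	"context"
	"fmt"
)

// producer streams batches of "series" until its context is canceled.
func producer(ctx context.Context) <-chan []int {
	out := make(chan []int)
	go func() {
		defer close(out)
		for i := 0; ; i++ {
			select {
			case <-ctx.Done():
				return // consumer gave up: stop producing
			case out <- []int{i, i, i}: // one peer response worth of series
			}
		}
	}()
	return out
}

func main() {
	maxSeries := 7
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	var all []int
	for batch := range producer(ctx) {
		if len(all)+len(batch) > maxSeries {
			cancel() // mirrors the cancel() before returning the 413 error
			fmt.Println("limit exceeded, aborting")
			break
		}
		all = append(all, batch...)
	}
	fmt.Println("collected", len(all), "series")
}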
api/prometheus_querier.go (1 addition, 1 deletion)
@@ -64,7 +64,7 @@ func (q *querier) Select(matchers ...*labels.Matcher) (storage.SeriesSet, error)
         }
     }
 
-    series, err := q.clusterFindByTag(q.ctx, q.OrgID, expressions, 0)
+    series, err := q.clusterFindByTag(q.ctx, q.OrgID, expressions, 0, maxSeriesPerReq)
     if err != nil {
         return nil, err
     }
cluster/node.go (3 additions, 1 deletion)
@@ -6,6 +6,7 @@ import (
     "encoding/json"
     "errors"
     "fmt"
+    "io"
     "io/ioutil"
     "net/http"
     "time"
@@ -202,7 +203,8 @@ func (n HTTPNode) GetName() string {
 func handleResp(rsp *http.Response) ([]byte, error) {
     defer rsp.Body.Close()
     if rsp.StatusCode != 200 {
-        ioutil.ReadAll(rsp.Body)
+        // Read in body so that the connection can be reused
+        io.Copy(ioutil.Discard, rsp.Body)
         return nil, NewError(rsp.StatusCode, fmt.Errorf(rsp.Status))
     }
     return ioutil.ReadAll(rsp.Body)
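For context on the comment above: Go's http.Transport only returns a keep-alive connection to its pool once the response body has been read to EOF and closed, so draining via io.Copy keeps the connection reusable without buffering the error body in memory the way ioutil.ReadAll did. A minimal sketch (the URL is illustrative):

package main

import (
	"io"
	"io/ioutil"
	"log"
	"net/http"
)

func fetch(url string) error {
	rsp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer rsp.Body.Close()
	if rsp.StatusCode != http.StatusOK {
		// Drain without buffering so the connection goes back into the pool.
		io.Copy(ioutil.Discard, rsp.Body)
		return nil
	}
	_, err = io.Copy(ioutil.Discard, rsp.Body) // consume the payload
	return err
}

func main() {
	// Two requests to the same host can share one connection only because
	// fetch fully drains each response body.
	for i := 0; i < 2; i++ {
		if err := fetch("http://example.com/"); err != nil {
			log.Println(err)
		}
	}
}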
docker/docker-chaos/metrictank.ini (2 additions, 0 deletions)
@@ -153,6 +153,8 @@ key-file = /etc/ssl/private/ssl-cert-snakeoil.key
 max-points-per-req-soft = 1000000
 # limit of number of datapoints a request can return. Requests that exceed this limit will be rejected. (0 disables limit)
 max-points-per-req-hard = 20000000
+# limit of number of series a request can operate on. Requests that exceed this limit will be rejected. (0 disables limit)
+max-series-per-req = 250000
 # require x-org-id authentication to auth as a specific org. otherwise orgId 1 is assumed
 multi-tenant = true
 # in case our /render endpoint does not support the requested processing, proxy the request to this graphite
docker/docker-cluster/metrictank.ini (2 additions, 0 deletions)
@@ -153,6 +153,8 @@ key-file = /etc/ssl/private/ssl-cert-snakeoil.key
 max-points-per-req-soft = 1000000
 # limit of number of datapoints a request can return. Requests that exceed this limit will be rejected. (0 disables limit)
 max-points-per-req-hard = 20000000
+# limit of number of series a request can operate on. Requests that exceed this limit will be rejected. (0 disables limit)
+max-series-per-req = 250000
 # require x-org-id authentication to auth as a specific org. otherwise orgId 1 is assumed
 multi-tenant = true
 # in case our /render endpoint does not support the requested processing, proxy the request to this graphite
docker/docker-dev-custom-cfg-kafka/metrictank.ini (2 additions, 0 deletions)
@@ -153,6 +153,8 @@ key-file = /etc/ssl/private/ssl-cert-snakeoil.key
 max-points-per-req-soft = 1000000
 # limit of number of datapoints a request can return. Requests that exceed this limit will be rejected. (0 disables limit)
 max-points-per-req-hard = 20000000
+# limit of number of series a request can operate on. Requests that exceed this limit will be rejected. (0 disables limit)
+max-series-per-req = 250000
 # require x-org-id authentication to auth as a specific org. otherwise orgId 1 is assumed
 multi-tenant = true
 # in case our /render endpoint does not support the requested processing, proxy the request to this graphite
docs/config.md (2 additions, 0 deletions)
@@ -194,6 +194,8 @@ key-file = /etc/ssl/private/ssl-cert-snakeoil.key
 max-points-per-req-soft = 1000000
 # limit of number of datapoints a request can return. Requests that exceed this limit will be rejected. (0 disables limit)
 max-points-per-req-hard = 20000000
+# limit of number of series a request can operate on. Requests that exceed this limit will be rejected. (0 disables limit)
+max-series-per-req = 250000
 # require x-org-id authentication to auth as a specific org. otherwise orgId 1 is assumed
 multi-tenant = true
 # in case our /render endpoint does not support the requested processing, proxy the request to this graphite
metrictank-sample.ini (2 additions, 0 deletions)
@@ -156,6 +156,8 @@ key-file = /etc/ssl/private/ssl-cert-snakeoil.key
 max-points-per-req-soft = 1000000
 # limit of number of datapoints a request can return. Requests that exceed this limit will be rejected. (0 disables limit)
 max-points-per-req-hard = 20000000
+# limit of number of series a request can operate on. Requests that exceed this limit will be rejected. (0 disables limit)
+max-series-per-req = 250000
 # require x-org-id authentication to auth as a specific org. otherwise orgId 1 is assumed
 multi-tenant = true
 # in case our /render endpoint does not support the requested processing, proxy the request to this graphite
scripts/config/metrictank-docker.ini (2 additions, 0 deletions)
@@ -153,6 +153,8 @@ key-file = /etc/ssl/private/ssl-cert-snakeoil.key
 max-points-per-req-soft = 1000000
 # limit of number of datapoints a request can return. Requests that exceed this limit will be rejected. (0 disables limit)
 max-points-per-req-hard = 20000000
+# limit of number of series a request can operate on. Requests that exceed this limit will be rejected. (0 disables limit)
+max-series-per-req = 250000
 # require x-org-id authentication to auth as a specific org. otherwise orgId 1 is assumed
 multi-tenant = true
 # in case our /render endpoint does not support the requested processing, proxy the request to this graphite
scripts/config/metrictank-package.ini (2 additions, 0 deletions)
@@ -153,6 +153,8 @@ key-file = /etc/ssl/private/ssl-cert-snakeoil.key
 max-points-per-req-soft = 1000000
 # limit of number of datapoints a request can return. Requests that exceed this limit will be rejected. (0 disables limit)
 max-points-per-req-hard = 20000000
+# limit of number of series a request can operate on. Requests that exceed this limit will be rejected. (0 disables limit)
+max-series-per-req = 250000
 # require x-org-id authentication to auth as a specific org. otherwise orgId 1 is assumed
 multi-tenant = true
 # in case our /render endpoint does not support the requested processing, proxy the request to this graphite