Skip to content

Commit

Permalink
stats: limit the number of top-n items (#11906)
Browse files Browse the repository at this point in the history
  • Loading branch information
alivxxx authored and sre-bot committed Aug 28, 2019
1 parent c599333 commit ab8346e
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 37 deletions.
46 changes: 21 additions & 25 deletions statistics/cmsketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,32 +65,27 @@ func NewCMSketch(d, w int32) *CMSketch {
// topNHelper wraps some variables used when building cmsketch with top n.
type topNHelper struct {
sampleSize uint64
counter map[hack.MutableString]uint64
sorted []uint64
sorted []dataCnt
onlyOnceItems uint64
sumTopN uint64
lastVal uint64
actualNumTop uint32
}

func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper {
counter := make(map[hack.MutableString]uint64)
for i := range sample {
counter[hack.String(sample[i])]++
}
sorted, onlyOnceItems := make([]uint64, 0, len(counter)), uint64(0)
for _, cnt := range counter {
sorted = append(sorted, cnt)
sorted, onlyOnceItems := make([]dataCnt, 0, len(counter)), uint64(0)
for key, cnt := range counter {
sorted = append(sorted, dataCnt{hack.Slice(string(key)), cnt})
if cnt == 1 {
onlyOnceItems++
}
}
sort.Slice(sorted, func(i, j int) bool {
return sorted[i] > sorted[j]
})
sort.SliceStable(sorted, func(i, j int) bool { return sorted[i].cnt > sorted[j].cnt })

var (
// last: every element in the top N index should occur at least `last` times.
last uint64
sumTopN uint64
sampleNDV = uint32(len(sorted))
)
Expand All @@ -99,18 +94,18 @@ func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper {
// frequency of the n-th element are added to the TopN statistics. We chose
// 2/3 as an empirical value because the average cardinality estimation
// error is relatively small compared with 1/2.
for i := uint32(0); i < sampleNDV && i < numTop*2; i++ {
if i >= numTop && sorted[i]*3 < sorted[numTop-1]*2 && last != sorted[i] {
var actualNumTop uint32
for ; actualNumTop < sampleNDV && actualNumTop < numTop*2; actualNumTop++ {
if actualNumTop >= numTop && sorted[actualNumTop].cnt*3 < sorted[numTop-1].cnt*2 {
break
}
if sorted[i] == 1 {
if sorted[actualNumTop].cnt == 1 {
break
}
last = sorted[i]
sumTopN += sorted[i]
sumTopN += sorted[actualNumTop].cnt
}

return &topNHelper{uint64(len(sample)), counter, sorted, onlyOnceItems, sumTopN, last}
return &topNHelper{uint64(len(sample)), sorted, onlyOnceItems, sumTopN, actualNumTop}
}

// NewCMSketchWithTopN returns a new CM sketch with TopN elements, the estimate NDV and the scale ratio.
Expand All @@ -130,22 +125,23 @@ func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, default
enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN
if enableTopN {
c.topN = make(map[uint64][]*TopNMeta)
for i := uint32(0); i < helper.actualNumTop; i++ {
data, cnt := helper.sorted[i].data, helper.sorted[i].cnt
h1, h2 := murmur3.Sum128(data)
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, cnt * scaleRatio})
}
helper.sorted = helper.sorted[helper.actualNumTop:]
}
c.defaultValue = defaultVal
for counterKey, cnt := range helper.counter {
data := hack.Slice(string(counterKey))
for i := range helper.sorted {
data, cnt := helper.sorted[i].data, helper.sorted[i].cnt
// If the value only occurred once in the sample, we assume that it is no different from
// a value that did not occur in the sample.
rowCount := defaultVal
if cnt > 1 {
rowCount = cnt * scaleRatio
}
if enableTopN && cnt >= helper.lastVal {
h1, h2 := murmur3.Sum128(data)
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, rowCount})
} else {
c.insertBytesByCount(data, rowCount)
}
c.insertBytesByCount(data, rowCount)
}
return
}
Expand Down
1 change: 1 addition & 0 deletions statistics/cmsketch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ func (s *testStatisticsSuite) TestCMSketchTopN(c *C) {
for _, t := range tests {
lSketch, lMap, err := buildCMSketchTopNAndMap(d, w, 20, 1000, 0, total, imax, t.zipfFactor)
c.Check(err, IsNil)
c.Assert(len(lSketch.TopN()), LessEqual, 40)
avg, err := averageAbsoluteError(lSketch, lMap)
c.Assert(err, IsNil)
c.Check(avg, LessEqual, t.avgError)
Expand Down
17 changes: 5 additions & 12 deletions statistics/sample.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ import (
"github.com/pingcap/tidb/sessionctx/stmtctx"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/hack"
"github.com/pingcap/tidb/util/sqlexec"
"github.com/pingcap/tipb/go-tipb"
"github.com/spaolacci/murmur3"
Expand Down Expand Up @@ -272,19 +271,13 @@ func (c *SampleCollector) ExtractTopN(numTop uint32) {
helper := newTopNHelper(values, numTop)
cms := c.CMSketch
cms.topN = make(map[uint64][]*TopNMeta)
dataCnts := make([]dataCnt, 0, len(helper.counter))
for key, cnt := range helper.counter {
if cnt >= helper.lastVal {
dataCnts = append(dataCnts, dataCnt{hack.Slice(string(key)), cnt})
}
}
// Sort them decreasingly so we can handle most frequent values first and reduce the probability of hash collision
// Process them decreasingly so we can handle most frequent values first and reduce the probability of hash collision
// by small values.
sort.SliceStable(dataCnts, func(i, j int) bool { return dataCnts[i].cnt >= dataCnts[j].cnt })
for _, dc := range dataCnts {
h1, h2 := murmur3.Sum128(dc.data)
for i := uint32(0); i < helper.actualNumTop; i++ {
data := helper.sorted[i].data
h1, h2 := murmur3.Sum128(data)
realCnt := cms.queryHashValue(h1, h2)
cms.subValue(h1, h2, realCnt)
cms.topN[h1] = append(cms.topN[h1], &TopNMeta{h2, dc.data, realCnt})
cms.topN[h1] = append(cms.topN[h1], &TopNMeta{h2, data, realCnt})
}
}

0 comments on commit ab8346e

Please sign in to comment.