From 1f6d0b41dad11d778f3e0084cc2d064ecde0d22f Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Wed, 28 Aug 2019 18:23:08 +0900 Subject: [PATCH] stats: limit the number of top-n items (#11906) --- statistics/cmsketch.go | 49 +++++++++++++++++++------------------ statistics/cmsketch_test.go | 1 + 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index 4842941e5af75..9d14ed5d96f6d 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -61,14 +61,18 @@ func NewCMSketch(d, w int32) *CMSketch { return &CMSketch{depth: d, width: w, table: tbl} } +type dataCnt struct { + data []byte + cnt uint64 +} + // topNHelper wraps some variables used when building cmsketch with top n. type topNHelper struct { sampleSize uint64 - counter map[hack.MutableString]uint64 - sorted []uint64 + sorted []dataCnt onlyOnceItems uint64 sumTopN uint64 - lastVal uint64 + actualNumTop uint32 } func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper { @@ -76,20 +80,16 @@ func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper { for i := range sample { counter[hack.String(sample[i])]++ } - sorted, onlyOnceItems := make([]uint64, 0, len(counter)), uint64(0) - for _, cnt := range counter { - sorted = append(sorted, cnt) + sorted, onlyOnceItems := make([]dataCnt, 0, len(counter)), uint64(0) + for key, cnt := range counter { + sorted = append(sorted, dataCnt{hack.Slice(string(key)), cnt}) if cnt == 1 { onlyOnceItems++ } } - sort.Slice(sorted, func(i, j int) bool { - return sorted[i] > sorted[j] - }) + sort.SliceStable(sorted, func(i, j int) bool { return sorted[i].cnt > sorted[j].cnt }) var ( - // last is the last element in top N index should occurres atleast `last` times. - last uint64 sumTopN uint64 sampleNDV = uint32(len(sorted)) ) @@ -98,15 +98,15 @@ func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper { // frequency of the n-th element are added to the TopN statistics. We chose // 2/3 as an empirical value because the average cardinality estimation // error is relatively small compared with 1/2. - for i := uint32(0); i < sampleNDV && i < numTop*2; i++ { - if i >= numTop && sorted[i]*3 < sorted[numTop-1]*2 && last != sorted[i] { + var actualNumTop uint32 + for ; actualNumTop < sampleNDV && actualNumTop < numTop*2; actualNumTop++ { + if actualNumTop >= numTop && sorted[actualNumTop].cnt*3 < sorted[numTop-1].cnt*2 { break } - last = sorted[i] - sumTopN += sorted[i] + sumTopN += sorted[actualNumTop].cnt } - return &topNHelper{uint64(len(sample)), counter, sorted, onlyOnceItems, sumTopN, last} + return &topNHelper{uint64(len(sample)), sorted, onlyOnceItems, sumTopN, actualNumTop} } // NewCMSketchWithTopN returns a new CM sketch with TopN elements, the estimate NDV and the scale ratio. @@ -126,22 +126,23 @@ func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, default enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN if enableTopN { c.topN = make(map[uint64][]*TopNMeta) + for i := uint32(0); i < helper.actualNumTop; i++ { + data, cnt := helper.sorted[i].data, helper.sorted[i].cnt + h1, h2 := murmur3.Sum128(data) + c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, cnt * scaleRatio}) + } + helper.sorted = helper.sorted[helper.actualNumTop:] } c.defaultValue = defaultVal - for counterKey, cnt := range helper.counter { - data := hack.Slice(string(counterKey)) + for i := range helper.sorted { + data, cnt := helper.sorted[i].data, helper.sorted[i].cnt // If the value only occurred once in the sample, we assumes that there is no difference with // value that does not occurred in the sample. rowCount := defaultVal if cnt > 1 { rowCount = cnt * scaleRatio } - if enableTopN && cnt >= helper.lastVal { - h1, h2 := murmur3.Sum128(data) - c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, rowCount}) - } else { - c.insertBytesByCount(data, rowCount) - } + c.insertBytesByCount(data, rowCount) } return } diff --git a/statistics/cmsketch_test.go b/statistics/cmsketch_test.go index 7e4261af328f1..ab5b1e3b0b858 100644 --- a/statistics/cmsketch_test.go +++ b/statistics/cmsketch_test.go @@ -189,6 +189,7 @@ func (s *testStatisticsSuite) TestCMSketchTopN(c *C) { for _, t := range tests { lSketch, lMap, err := buildCMSketchTopNAndMap(d, w, 20, 1000, 0, total, imax, t.zipfFactor) c.Check(err, IsNil) + c.Assert(len(lSketch.TopN()), LessEqual, 40) avg, err := averageAbsoluteError(lSketch, lMap) c.Assert(err, IsNil) c.Check(avg, LessEqual, t.avgError)