Skip to content

Commit

Permalink
stats: limit the number of top-n items (pingcap#11906)
Browse files Browse the repository at this point in the history
  • Loading branch information
alivxxx committed Aug 28, 2019
1 parent 7e5b9a7 commit 1f6d0b4
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 24 deletions.
49 changes: 25 additions & 24 deletions statistics/cmsketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,35 +61,35 @@ func NewCMSketch(d, w int32) *CMSketch {
return &CMSketch{depth: d, width: w, table: tbl}
}

// dataCnt pairs a distinct sampled value with the number of times it occurs in the sample.
type dataCnt struct {
// data is the raw bytes of the sampled value.
data []byte
// cnt is how many times data appears in the sample.
cnt uint64
}

// topNHelper wraps some variables used when building cmsketch with top n.
type topNHelper struct {
sampleSize uint64
counter map[hack.MutableString]uint64
sorted []uint64
sorted []dataCnt
onlyOnceItems uint64
sumTopN uint64
lastVal uint64
actualNumTop uint32
}

func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper {
counter := make(map[hack.MutableString]uint64)
for i := range sample {
counter[hack.String(sample[i])]++
}
sorted, onlyOnceItems := make([]uint64, 0, len(counter)), uint64(0)
for _, cnt := range counter {
sorted = append(sorted, cnt)
sorted, onlyOnceItems := make([]dataCnt, 0, len(counter)), uint64(0)
for key, cnt := range counter {
sorted = append(sorted, dataCnt{hack.Slice(string(key)), cnt})
if cnt == 1 {
onlyOnceItems++
}
}
sort.Slice(sorted, func(i, j int) bool {
return sorted[i] > sorted[j]
})
sort.SliceStable(sorted, func(i, j int) bool { return sorted[i].cnt > sorted[j].cnt })

var (
// last is the last element in the top N index, which should occur at least `last` times.
last uint64
sumTopN uint64
sampleNDV = uint32(len(sorted))
)
Expand All @@ -98,15 +98,15 @@ func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper {
// frequency of the n-th element are added to the TopN statistics. We chose
// 2/3 as an empirical value because the average cardinality estimation
// error is relatively small compared with 1/2.
for i := uint32(0); i < sampleNDV && i < numTop*2; i++ {
if i >= numTop && sorted[i]*3 < sorted[numTop-1]*2 && last != sorted[i] {
var actualNumTop uint32
for ; actualNumTop < sampleNDV && actualNumTop < numTop*2; actualNumTop++ {
if actualNumTop >= numTop && sorted[actualNumTop].cnt*3 < sorted[numTop-1].cnt*2 {
break
}
last = sorted[i]
sumTopN += sorted[i]
sumTopN += sorted[actualNumTop].cnt
}

return &topNHelper{uint64(len(sample)), counter, sorted, onlyOnceItems, sumTopN, last}
return &topNHelper{uint64(len(sample)), sorted, onlyOnceItems, sumTopN, actualNumTop}
}

// NewCMSketchWithTopN returns a new CM sketch with TopN elements, the estimate NDV and the scale ratio.
Expand All @@ -126,22 +126,23 @@ func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, default
enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN
if enableTopN {
c.topN = make(map[uint64][]*TopNMeta)
for i := uint32(0); i < helper.actualNumTop; i++ {
data, cnt := helper.sorted[i].data, helper.sorted[i].cnt
h1, h2 := murmur3.Sum128(data)
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, cnt * scaleRatio})
}
helper.sorted = helper.sorted[helper.actualNumTop:]
}
c.defaultValue = defaultVal
for counterKey, cnt := range helper.counter {
data := hack.Slice(string(counterKey))
for i := range helper.sorted {
data, cnt := helper.sorted[i].data, helper.sorted[i].cnt
// If the value only occurred once in the sample, we assume that there is no difference from
// a value that did not occur in the sample.
rowCount := defaultVal
if cnt > 1 {
rowCount = cnt * scaleRatio
}
if enableTopN && cnt >= helper.lastVal {
h1, h2 := murmur3.Sum128(data)
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, rowCount})
} else {
c.insertBytesByCount(data, rowCount)
}
c.insertBytesByCount(data, rowCount)
}
return
}
Expand Down
1 change: 1 addition & 0 deletions statistics/cmsketch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ func (s *testStatisticsSuite) TestCMSketchTopN(c *C) {
for _, t := range tests {
lSketch, lMap, err := buildCMSketchTopNAndMap(d, w, 20, 1000, 0, total, imax, t.zipfFactor)
c.Check(err, IsNil)
c.Assert(len(lSketch.TopN()), LessEqual, 40)
avg, err := averageAbsoluteError(lSketch, lMap)
c.Assert(err, IsNil)
c.Check(avg, LessEqual, t.avgError)
Expand Down

0 comments on commit 1f6d0b4

Please sign in to comment.