From 1869db04d9e174a663e2422bc1d20812beb3c25e Mon Sep 17 00:00:00 2001 From: Laurent Demailly Date: Tue, 4 Jul 2017 10:30:15 -0700 Subject: [PATCH] Improve stats accuracy at low percentile And corresponding test --- test/Stats_test.cpp | 6 ++++++ util/Stats.cpp | 19 +++++++++++++------ util/Stats.h | 8 +++++++- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/test/Stats_test.cpp b/test/Stats_test.cpp index 5d7e3334..cde09c31 100644 --- a/test/Stats_test.cpp +++ b/test/Stats_test.cpp @@ -94,7 +94,13 @@ TEST(StatsTest, HistogramPercentile) { std::cout << i << "% at " << h.calcPercentile(i) << std::endl; } EXPECT_EQ(501, h.getAverage()); + EXPECT_EQ(1, h.calcPercentile(-1)); + EXPECT_EQ(1, h.calcPercentile(0)); + EXPECT_EQ(1.045, h.calcPercentile(0.1)); + EXPECT_EQ(1.45, h.calcPercentile(1)); + EXPECT_EQ(2.8, h.calcPercentile(4)); EXPECT_EQ(10, h.calcPercentile(20)); + EXPECT_EQ(250.25, h.calcPercentile(20.1)); EXPECT_EQ(550, h.calcPercentile(50)); EXPECT_EQ(775, h.calcPercentile(75)); EXPECT_EQ(1000.5, h.calcPercentile(90)); diff --git a/util/Stats.cpp b/util/Stats.cpp index 1c88e573..9a8b5e32 100644 --- a/util/Stats.cpp +++ b/util/Stats.cpp @@ -49,18 +49,20 @@ int32_t* initHistLookup() { static const int32_t* kVal2Bucket = initHistLookup(); void Counter::record(int64_t v) { - int64_t newCount = ++count_; // atomic_add(count_, 1); - // atomic_add(sum_, v); + int64_t newCount = ++count_; sum_ += v; if (newCount == 1) { - min_ = max_ = v; + realMin_ = min_ = max_ = v; } - // this can actually miss/revert the min/max (race condition) - // don't count 0 in the min - makes for a slightly more interesting min + // don't count 0s in the min - makes for a slightly more interesting min // in most cases if (v && ((min_ == 0) || (v < min_))) { min_ = v; } + // real min (can be 0): + if (v < realMin_ ) { + realMin_ = v; + } if (v > max_) { max_ = v; } @@ -176,7 +178,7 @@ double Histogram::calcPercentile(double percentile) const { return getMax(); } if (percentile <= 0) { - return 0; + return getRealMin(); } // Initial value of prev should in theory be offset_ // but if the data is wrong (smaller than offset - eg 'negative') that @@ -185,6 +187,7 @@ double Histogram::calcPercentile(double percentile) const { int64_t total = 0; const int64_t ctrTotal = getCount(); const int64_t ctrMax = getMax(); + const int64_t ctrMin = getRealMin(); double prevPerc = 0; double perc = 0; bool found = false; @@ -213,6 +216,10 @@ double Histogram::calcPercentile(double percentile) const { cur = ctrMax; perc = 100.; // can't be removed } + // Fix up the min too to never return < min and increase low p accuracy + if (prev < ctrMin) { + prev = ctrMin; + } return (prev + (percentile - prevPerc) * (cur - prev) / (perc - prevPerc)); } diff --git a/util/Stats.h b/util/Stats.h index df0770f0..b752e801 100644 --- a/util/Stats.h +++ b/util/Stats.h @@ -145,6 +145,7 @@ class Counter : crashifcopied { public: explicit Counter(double printScale = 1.0) : count_(0), + realMin_(0), min_(0), max_(0), sum_(0), @@ -161,6 +162,11 @@ class Counter : crashifcopied { return count_; } + // Includes 0s + inline int64_t getRealMin() const { + return realMin_; + } + // Excludes 0 inline int64_t getMin() const { return min_; } @@ -209,7 +215,7 @@ class Counter : crashifcopied { void merge(const Counter& c); private: - int64_t count_, min_, max_, sum_, sumOfSquares_; + int64_t count_, realMin_, min_, max_, sum_, sumOfSquares_; double printScale_; };