-
Notifications
You must be signed in to change notification settings - Fork 12
/
Stats.cpp
186 lines (157 loc) · 4.75 KB
/
Stats.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#include "Stats.h"
//-----------------------------------------------------------------------------
double chooseK ( int n, int k )
{
if(k > (n - k)) k = n - k;
double c = 1;
for(int i = 0; i < k; i++)
{
double t = double(n-i) / double(i+1);
c *= t;
}
return c;
}
double chooseUpToK ( int n, int k )
{
double c = 0;
for(int i = 1; i <= k; i++)
{
c += chooseK(n,i);
}
return c;
}
//-----------------------------------------------------------------------------
// Calculate a modified version of the RMS ratio for a distribution.
//
// This function is the "old" calcScore, which was not well explained.
//
// We know that the equation:
//
// r = ( sum * sum ) / sum_squares / count
//
// results in r=1 when the distribution is "perfect", and produces
// slightly smaller
//
// And the further away from a perfect distribution the smaller the
// ratio.
//
// This function modifies that to
//
// r = ( sum * sum - 1 ) / (sum_squares - sum) / count
//
// which means a perfect mapping gets a score of above 1.
//
// Honestly I don't understand this really. If I compare the g-test probability
// on something this check would say is "bad", the g-test may be conclusive
// and vice versa. I have a lot more trust in the g-test than in this code.
double calcScore_old ( const int * vals, const int count, const int sum)
{
double sum_squares = 0.0;
for(int i = 0; i < count; i++)
{
sum_squares += pow(double(vals[i]),2.0);
}
double square_sum= pow(double(sum), 2.0);
return 1.0 - ( ( square_sum - 1) / (sum_squares - double(sum)) / count );
}
//-----------------------------------------------------------------------------
// Calculate the likelihood that a given distribution is random, and if it is
// return the RMS ratio for it.
double calcScore ( const int * vals, const int icount, const int isum, double confidence )
{
double sum= double(isum);
double count= double(icount);
double expected = sum / count;
double gtest = 0.0;
double quality_score = 0.0;
double old_score = 0.0;
double stddev = 0.0;
int min = vals[0];
int max = vals[0];
std::vector<int> count_counts(expected * 4, 0);
for(int i = 0; i < count; i++)
{
double iv= vals[i];
double v= double(iv);
if (iv >= count_counts.size())
count_counts.resize(iv * 2,0);
count_counts[iv]++;
if (min > iv) min = iv;
if (max < iv) max = iv;
GTEST_ADD(gtest, v, expected);
quality_score += ( v * ( v + 1.0 ) ) / 2.0;
old_score += pow( v, 2.0 );
stddev += pow( v - expected , 2.0 );
}
gtest = GTEST_PROB(count, gtest);
stddev = sqrt( stddev / count );
old_score = 1.0 - ( ( pow( sum, 2.0 ) - 1.0 ) / ( old_score - sum ) / count );
quality_score = quality_score / ( ( sum / ( 2.0 * count ) ) *
( sum + ( 2.0 * count ) - 1.0 ) );
double score = fabs( 1.0 - quality_score );
if (gtest >= confidence && score > 0.01) {
if (g_verbose) {
printf(
"Failed gtest with %.6f prob\n"
" old_score = %.6f%% score = %.6f (%.6f)\n"
" bins=%.0f sum=%.0f mean=%.0f stddev=%.6f\n",
gtest * 100, old_score * 100, score, quality_score,
count, sum, expected, stddev);
printf(" Bucket Count Frequencies:\n");
for ( int i = min ; i <= max ; i++ )
printf(" %3d = %10d (%6.2f)\n",
i, count_counts[i], count_counts[i] / sum * 100.0 );
}
return score;
} else {
if (g_verbose > 1) {
printf(
"Passed gtest with %.6f prob\n"
" old_score = %.6f%% score = %.6f (%.6f)\n"
" bins=%.0f sum=%.0f mean=%.0f stddev=%.6f\n",
gtest * 100, old_score * 100, score, quality_score,
count, sum, expected, stddev);
}
return 0;
}
}
//-----------------------------------------------------------------------------
// Calculate the likelihood that a given distribution is random.
//
double calcGTestProbability ( const int * vals, const int count, const int sum )
{
double expected = double(sum) / double(count);
double gtest = 0.0;
for(int i = 0; i < count; i++)
{
GTEST_ADD(gtest,vals[i],expected);
}
return GTEST_PROB(count, gtest);
}
double calcGTestProbability ( const int * vals, const int count)
{
int sum = 0;
for (int i = 0; i < count; i++) {
sum += vals[i];
}
return calcGTestProbability(vals, count, sum);
}
//----------------------------------------------------------------------------
void plot ( double n )
{
double n2 = n * 1;
if(n2 < 0) n2 = 0;
n2 *= 100;
if(n2 > 64) n2 = 64;
int n3 = (int)n2;
if(n3 == 0)
printf(".");
else
{
char x = '0' + char(n3);
if(x > '9') x = 'X';
printf("%c",x);
}
}
//-----------------------------------------------------------------------------
/* vim: set sts=2 sw=2 et: */