-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_rng.cpp
141 lines (110 loc) · 3.78 KB
/
test_rng.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#include "fast_rng.hpp"
#include "rng_helpers.hpp"
#include <iostream>
#include <unistd.h>
using namespace std;
// Helper function for debug
inline void print_vec(float *a)
{
for (int i=0; i < 8; i++)
cout << a[i] << " ";
cout << "\n\n";
}
// Randomized unit test comparing scalar and vectorized implementations
inline bool test_xorshift(int niter=10000)
{
random_device rd;
uint64_t rn1 = rng_helpers::rd64(rd);
uint64_t rn2 = rng_helpers::rd64(rd);
uint64_t rn3 = rng_helpers::rd64(rd);
uint64_t rn4 = rng_helpers::rd64(rd);
uint64_t rn5 = rng_helpers::rd64(rd);
uint64_t rn6 = rng_helpers::rd64(rd);
uint64_t rn7 = rng_helpers::rd64(rd);
uint64_t rn8 = rng_helpers::rd64(rd);
fast_rng::vec_xorshift_plus a(_mm256_setr_epi64x(rn1, rn3, rn5, rn7), _mm256_setr_epi64x(rn2, rn4, rn6, rn8));
float vrn_vec[8];
rng_helpers::xorshift_plus b(rn1, rn2, rn3, rn4, rn5, rn6, rn7, rn8);
float srn_vec[8];
for (int iter=0; iter < niter; iter++)
{
__m256 vrn = a.gen_floats();
_mm256_storeu_ps(&vrn_vec[0], vrn);
b.gen_floats(srn_vec);
for (int i=0; i<8; i++)
{
if (srn_vec[i] != vrn_vec[i])
{
cout << "S code outputs: ";
print_vec(srn_vec);
cout << "V code outputs: ";
print_vec(vrn_vec);
cout << "rng test failed: scalar and vectorized prngs are out of sync on iter " << i << "!" << endl;
return false;
}
}
}
// Check that array generation is working
int n = 10000;
float *big_scalar = new float[n];
float *big_vector = new float[n];
a.gen_arr(big_vector, n);
for (int i = 0; i < n; i += 8)
b.gen_floats(big_scalar + i);
for (int i = 0; i < n; i++)
{
if (big_scalar[i] < -1 || big_scalar[i] > 1 || big_vector[i] < -1 || big_vector[i] > 1)
{
cout << "Oh no, something isn't outputting a floating point number between (-1, 1). This is very bad!" << endl;
}
if (big_scalar[i] != big_vector[i])
{
cout << "The output of vec_xorshift_plus: gen_arr diverged from the scalar implementation at iteration " << i << endl;
return false;
}
}
cout << "All rng tests passed." << endl;
return true;
}
// Timing test -- how long does it take to generate 64 random bit using vec_xorshift_plus?
inline void time_64_bits(__m256i *out, int niter)
{
fast_rng::vec_xorshift_plus x;
__m256i tmp;
struct timeval tv0 = rng_helpers::get_time();
for (int iter = 0; iter < niter; iter++)
tmp = _mm256_xor_si256(tmp, x.gen_rand_bits());
struct timeval tv1 = rng_helpers::get_time();
double dt = rng_helpers::time_diff(tv0, tv1);
double noutputs = double(niter) * 256; // generate 256 random bits per iteration
double ns_per_output = 1.0e9 * dt / noutputs * 64; // actually, get the time for one bit, then mutiply by 64
_mm256_store_si256(out, tmp);
cout << "64 bit test: time for 64 bits = " << ns_per_output << endl;
}
// Note: #include <unistd.h> needed for usleep()
// Thanks for solving this mystery, Kendrick!
void warm_up_cpu()
{
// A throwaway computation which uses the CPU for ~10^9
// clock cycles. The details (usleep, xor) are to prevent the
// compiler from optimizing it out!
//
// Empirically, this makes timing results more stable (without it,
// the CPU seems to run slow for the first ~10^9 cycles or so.)
long n = 0;
for (long i = 0; i < 1000L * 1000L * 1000L; i++)
n += (i ^ (i-1));
usleep(n % 2);
}
int main()
{
// Warm up the CPU - needed for timing
warm_up_cpu();
// Timing test
__m256i out;
time_64_bits(&out, 1000000);
cout << "--------------------------------------------------" << endl;
// Test to compare the outputs of vec_xorshift_plus and xorshift_plus
test_xorshift();
return 0;
}