-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathaggregate.cu
203 lines (175 loc) · 6.11 KB
/
aggregate.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
//compile nvcc *.cu -o test
# include <stdio.h>
# include <stdlib.h>
# include <stdint.h>
# include <time.h>
# include "cuda_runtime.h"
# define INVALID -1LL
# define ITER 4
# define L2_ENTRIES 64
# define GB (1024LU*1024LU*1024LU)
# define MB (1024LU*1024LU)
# define KB (1024LU)
__global__ void global_latency (unsigned long *, unsigned long, unsigned long, unsigned long);
void measure_global (unsigned int, unsigned int, unsigned int);
void create_pattern (unsigned long *arr, unsigned long stride, unsigned long size, unsigned long start_idx) {
unsigned long temp = start_idx, i;
// [Style 1]: Jump for a stride
for (i = start_idx; i < size; i++) {
temp = (i + stride);
// Each index points to next index!
arr[i] = (temp >= size) ? start_idx : temp;
}
// [Style 2]: Jump twice as much as stride
/*stride = 2 * stride;
i = 0;
while (i < L2_ENTRIES) {
arr[temp] = temp + stride;
temp = temp + stride;
i++;
}
// New temp will be (start_idx + original stride) (was doubled earlier)
arr[temp] = start_idx + (stride / 2);
i = 0;
temp = start_idx + (stride / 2);
// Jump twice as much as stride
while (i < L2_ENTRIES) {
arr[temp] = temp + stride;
temp = temp + stride;
i++;
}
// Roundabout!
arr[temp] = start_idx;*/
// [Style 3]: To create a stride like (1, 2, 4, 1 ...) x stride
/* int cycle = 3;
srand (time (0));
unsigned long next_idx = start_idx;
temp = 2 ^ (rand () % cycle) * stride;
while (next_idx + temp < size) {
arr[next_idx] = next_idx + temp;
iteration += 1;
next_idx = next_idx + temp;
temp = 2 ^ (rand () % cycle) * stride;
}
// Roundabout!
arr[next_idx] = start_idx;*/
// [Style 4]: To create L2 amount of accesses and then somewhere in between!
/*for (i = 0; i < L2_ENTRIES; i++) {
arr[temp] = (temp + stride) % size;
temp = (temp + stride) % size;
}
i = start_idx + (stride / 2);
arr[temp] = i;
temp = i + (stride);
arr[i] = temp;
// Roundabout!
arr[temp] = start_idx;*/
// [Style 5]: To create random accesses at stride size!
/* long long probe_points[size];
int count, idx;
// Find out all elements that needs to be accessed!
for (i = 0, count = 0; i < size; i++, count++) {
temp = start_idx + (i + 1) * stride;
if (temp >= size)
break;
probe_points[i] = temp;
}
// Now create access pattern out of them!
srand (time (0));
temp = start_idx;
while (i > 0) {
idx = rand () % count;
if (probe_points[idx] != INVALID) {
arr[temp] = (unsigned long) probe_points[idx];
temp = (unsigned long) probe_points[idx];
probe_points[idx] = INVALID;
i--;
}
}
// Roundabout
arr[temp] = start_idx;*/
}
int main (int argc, char **argv) {
if (argc < 4) {
printf ("Usage: %s from_mb to_mb stride_size_kb\n", argv[0]);
return 0;
}
/* Array size in mega bytes 1st argument. */
unsigned int from_mb = (unsigned int) atof (argv[1]);
/* Array size in mega bytes 1st argument. */
unsigned int to_mb = (unsigned int) atof (argv[2]);
/* Stride size in kilo bytes 1st argument. */
unsigned int stride_size_kb = (unsigned int) atof (argv[3]);
measure_global (from_mb, to_mb, stride_size_kb);
return 0;
}
void measure_global (unsigned int from_mb, unsigned int to_mb, unsigned int stride_kb) {
cudaError_t error_id;
int e_size = sizeof (unsigned long);
unsigned long l_limit = (from_mb * MB) / e_size;
unsigned long u_limit = (to_mb * MB) / e_size;
unsigned long stride = (stride_kb * KB) / e_size;
cudaSetDevice (0);
/* allocate arrays on GPU */
unsigned long *d_a;
error_id = cudaMallocManaged ((void **) &d_a, e_size * (u_limit + 2));
if (error_id != cudaSuccess) {
printf ("Error 1.0 is %s\n", cudaGetErrorString (error_id));
return;
}
/* Find a start address which is good, i.e which we need */
unsigned long start_idx = 0, temp, elements, k, j;
printf ("ArraySize (KB), clock\n");
for (k = l_limit; k <= u_limit; k += stride) {
create_pattern (d_a, stride, k, start_idx);
d_a[k + 1] = 0;
d_a[k + 2] = 0;
cudaDeviceSynchronize ();
dim3 Db = dim3 (1);
dim3 Dg = dim3 (1, 1, 1);
/* launch kernel*/
elements = (k - start_idx) / stride;
global_latency <<<Dg, Db>>>(d_a, k, start_idx, elements);
cudaDeviceSynchronize ();
error_id = cudaGetLastError ();
if (error_id != cudaSuccess) {
printf ("Error kernel is %s\n", cudaGetErrorString (error_id));
}
int sum = 0;
temp = start_idx;
for (j = 0; j < elements; j++) {
sum += d_a[temp + 1];
// printf ("%lu\n", d_a[temp + 1]);
/* Pointer chase! */
temp = d_a[temp];
}
// printf ("===");
printf ("%3.2f, %f\n", (float) e_size * (k - start_idx) / KB, (float) sum / (elements));
}
cudaDeviceReset();
/* free memory on GPU */
cudaFree (d_a);
}
__global__ void global_latency (unsigned long *my_array, unsigned long array_length, unsigned long s_idx, unsigned long elements) {
unsigned long j = s_idx, old_j, k, start_time, end_time, res = 0, res_1 = 0, iter = ITER;
// first round, warm the TLB
for (k = 0; k < elements; k++) {
j = my_array[j];
}
// second round, begin timestamp
// j = (j != s_idx) ? s_idx : j;
for (k = 0; k < elements * iter; k++) {
old_j = j;
start_time = clock ();
j = my_array[j];
res += j;
res_1 += res;
end_time = clock ();
/* We just used it for accessing the element that has to be timed. It will be
at a boundary of the stride chosen. Keep it atleast greater than cache line
size. */
my_array[old_j + 1] = (end_time - start_time);
}
my_array[j + 2/*array_length + 1*/] = res + res_1;
my_array[j + 3/*array_length + 2*/] = my_array[j];
}