tlb.cu
// Compile: nvcc *.cu -o test
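/*
 * tlb.cu - GPU TLB/latency microbenchmark. (Summary inferred from the code
 * below, not part of the original source.) create_pattern () builds a
 * pointer-chase chain in a managed array, global_latency () walks the chain
 * on the device and timestamps every dependent load with clock (), and
 * measure_global () sweeps array sizes and start offsets, printing per-hop
 * latencies so that jumps in latency expose TLB and cache boundaries.
 */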
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include "cuda_runtime.h"
#define OFFSETS 16
#define INVALID -1LL
#define L2_ENTRIES 1025
#define GB (1024LU*1024LU*1024LU)
#define MB (1024LU*1024LU)
#define KB (1024LU)
__global__ void global_latency (unsigned long *, unsigned long, unsigned long, unsigned long);
void measure_global (unsigned int, unsigned int, unsigned int);
void create_pattern (unsigned long *arr, unsigned long stride, unsigned long size, unsigned long start_idx) {
    unsigned long temp = start_idx, i;
    // [Style 1]: Jump by one stride per step
    for (i = start_idx; i < size; i++) {
        temp = (i + stride);
        // Each index points to the next index!
        arr[i] = (temp >= size) ? start_idx : temp;
    }
    // [Style 2]: Jump twice as much as stride
    /*stride = 2 * stride;
    i = 1;
    while (i < L2_ENTRIES) {
        arr[temp] = temp + stride;
        temp = temp + stride;
        i++;
    }
    // New temp will be (start_idx + original stride) (stride was doubled above)
    arr[temp] = start_idx + (stride / 2);
    i = 1;
    temp = start_idx + (stride / 2);
    // Jump twice as much as stride
    while (i < L2_ENTRIES) {
        arr[temp] = temp + stride;
        temp = temp + stride;
        i++;
    }
    // Wrap the chain back to the start!
    arr[temp] = start_idx;*/
    // [Style 3]: To create a stride sequence like (1, 2, 4, 1, ...) x stride
    /*int cycle = 3;
    srand (time (0));
    unsigned long next_idx = start_idx;
    temp = (1UL << (rand () % cycle)) * stride;  // 1x, 2x or 4x the stride
    while (next_idx + temp < size) {
        arr[next_idx] = next_idx + temp;
        next_idx = next_idx + temp;
        temp = (1UL << (rand () % cycle)) * stride;
    }
    // Wrap the chain back to the start!
    arr[next_idx] = start_idx;*/
    // [Style 4]: Create L2_ENTRIES accesses, then one somewhere in between!
    /*for (i = 0; i < L2_ENTRIES; i++) {
        arr[temp] = (temp + stride) % size;
        temp = (temp + stride) % size;
    }
    i = start_idx + (stride / 2);
    arr[temp] = i;
    temp = i + stride;
    arr[i] = temp;
    // Wrap the chain back to the start!
    arr[temp] = start_idx;*/
    // [Style 5]: Visit the stride-aligned elements in random order!
    /*long long probe_points[size];
    int count, idx;
    // Collect all elements that need to be accessed!
    for (i = 0, count = 0; i < size; i++, count++) {
        temp = start_idx + (i + 1) * stride;
        if (temp >= size)
            break;
        probe_points[i] = temp;
    }
    // Now create a random access pattern out of them!
    srand (time (0));
    temp = start_idx;
    while (i > 0) {
        idx = rand () % count;
        if (probe_points[idx] != INVALID) {
            arr[temp] = (unsigned long) probe_points[idx];
            temp = (unsigned long) probe_points[idx];
            probe_points[idx] = INVALID;
            i--;
        }
    }
    // Wrap the chain back to the start!
    arr[temp] = start_idx;*/
}
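/* A minimal worked example of the active Style 1 pattern (not in the original
   source): with start_idx = 0, stride = 4, size = 12 the relevant entries are
       arr[0] = 4, arr[4] = 8, arr[8] = 0,
   so the chase visits 0 -> 4 -> 8 -> 0, touching one element per stride-sized
   region; a stride of at least the page size forces every hop onto a fresh
   TLB entry. */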
int main (int argc, char **argv) {
    if (argc < 4) {
        printf ("Usage: %s from_mb to_mb stride_size_kb\n", argv[0]);
        return 0;
    }
    /* Smallest array size in megabytes, 1st argument. */
    unsigned int from_mb = (unsigned int) atof (argv[1]);
    /* Largest array size in megabytes, 2nd argument. */
    unsigned int to_mb = (unsigned int) atof (argv[2]);
    /* Stride size in kilobytes, 3rd argument. */
    unsigned int stride_size_kb = (unsigned int) atof (argv[3]);
    measure_global (from_mb, to_mb, stride_size_kb);
    return 0;
}
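/* Example invocation (illustrative values, not from the original source):
       ./test 1 32 4
   sweeps array sizes from 1 MB to 32 MB in 4 KB-stride steps and prints one
   latency line per pointer-chase hop for each size. */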
void measure_global (unsigned int from_mb, unsigned int to_mb, unsigned int stride_kb) {
    cudaError_t error_id;
    int e_size = sizeof (unsigned long);
    unsigned long ol_limit = (from_mb * MB) / e_size, l_limit;
    unsigned long ou_limit = (to_mb * MB) / e_size, u_limit;
    unsigned long stride = (stride_kb * KB) / e_size;
    /* Always allocate a fixed 2 GB for now. */
    unsigned long a_size = (2 * GB) / e_size;
    unsigned long l_stride = (4 * MB) / e_size;
    cudaSetDevice (0);
    /* Allocate the chase array in unified (managed) memory. */
    unsigned long *d_a;
    error_id = cudaMallocManaged ((void **) &d_a, e_size * a_size);
    if (error_id != cudaSuccess) {
        printf ("Error 1.0 is %s\n", cudaGetErrorString (error_id));
        return;
    }
    /* Pick a start index; each outer iteration shifts it by one l_stride. */
    unsigned long start_idx = 0, temp, elements, k, j;
    // printf ("Start Addr: %p, Idx: %lu\n", (void *) &d_a[start_idx], start_idx);
    int times = 1;
    for (; times <= OFFSETS; start_idx += l_stride, times++) {
        /* The limits are end indices when indexing starts at 0;
           shift them by start_idx! */
        l_limit = ol_limit + start_idx;
        u_limit = ou_limit + start_idx;
        if (u_limit >= a_size)
            break;
        // printf ("lLimit: %lu, uLimit: %lu\n", l_limit, u_limit);
        for (k = l_limit; k <= u_limit; k += stride) {
            // start_idx = start_idx + (256 * KB / e_size);
            /* Initialize array elements on the CPU with indices into d_a. */
            create_pattern (d_a, stride, k, start_idx);
            d_a[k + 1] = 0;
            d_a[k + 2] = 0;
            cudaDeviceSynchronize ();
            dim3 Db = dim3 (1);
            dim3 Dg = dim3 (1, 1, 1);
            /* Launch the kernel with a single thread. */
            elements = (k - start_idx) / stride;
            global_latency <<<Dg, Db>>>(d_a, k, start_idx, elements);
            cudaDeviceSynchronize ();
            error_id = cudaGetLastError ();
            if (error_id != cudaSuccess) {
                printf ("Error kernel is %s\n", cudaGetErrorString (error_id));
            }
            printf ("===============================================\n");
            printf ("%lu.%02d, %lu KB\n", e_size * (k - start_idx) / KB, times, (stride * e_size) / KB);
            printf ("%lu\n", start_idx);
            /* One line per hop: [element address], [element index], [cycles to load it]. */
            temp = start_idx;
            for (j = 0; j < elements; j++) {
                printf ("%p, %lu, %lu\n", (void *) &d_a[temp], temp, d_a[temp + 1]);
                /* Pointer chase! */
                temp = d_a[temp];
            }
            printf ("===============================================\n\n");
        }
    }
    /* Free memory on the GPU. */
    cudaFree (d_a);
    cudaDeviceReset ();
}
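/* Shape of the sweep above, summarized: OFFSETS start offsets (4 MB apart) x
   array sizes from from_mb to to_mb in stride-sized steps; every (offset, size)
   pair gets a fresh chain, one single-thread kernel launch, and a latency dump. */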
__global__ void global_latency (unsigned long *my_array, unsigned long array_length, unsigned long s_idx, unsigned long elements) {
    unsigned long j = s_idx, old_j, k, start_time, end_time, res = 0, res_1 = 0, iter = 3;
    // First round would warm the TLB:
    // for (k = 0; k < elements; k++) {
    //     j = my_array[j];
    // }
    // Second round would begin the timed chase:
    // j = (j != s_idx) ? s_idx : j;
    for (k = 0; k < elements * iter; k++) {
        old_j = j;
        __threadfence ();
        start_time = clock ();
        j = my_array[j];
        res += j;      /* Data dependency keeps the load inside the timed region. */
        res_1 += res;
        end_time = clock ();
        /* Store the latency next to the element that was just loaded. The element
           sits at a stride boundary, so the slot at offset +1 is free as long as
           the stride stays larger than a cache line. */
        my_array[old_j + 1] = (end_time - start_time);
    }
    /* Sink stores so the compiler cannot elide the chase; j has wrapped back to
       s_idx by this point. */
    my_array[j + 2 /* array_length + 1 */] = res + res_1;
    my_array[j + 3 /* array_length + 2 */] = my_array[j];
}
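/*
 * Note (not in the original source): clock () returns a per-SM 32-bit counter,
 * so very long chases can wrap between samples; CUDA also provides clock64 (),
 * e.g. `long long start_time = clock64 ();`, which avoids wrap-around at the
 * cost of wider stores for the recorded latencies.
 */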