oclMatVecMul.cl
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Matrix-vector multiplication: W = M * V.
* Device code.
*
* This sample implements matrix-vector multiplication.
* It has been written for clarity of exposition to illustrate various OpenCL
* programming principles and optimizations, not with the goal of providing
* the most performant generic kernel for matrix-vector multiplication.
*
* CUBLAS provides high-performance matrix-vector multiplication on GPU.
*/
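/* Host-side usage (illustrative sketch only, not part of this device file):
 * the variable names below (cmdQueue, kernel, dM, dV, dW, localSize,
 * globalSize) are assumptions for the example and are not taken from the
 * accompanying host sample. The coalesced kernels additionally take a
 * __local float scratch buffer sized to the work-group, and they assume a
 * power-of-two work-group size (MatVecMulCoalesced3 further assumes a
 * multiple of 2 * WARP_SIZE).
 *
 *   size_t localSize  = 256;
 *   size_t globalSize = ((height + localSize - 1) / localSize) * localSize;
 *   clSetKernelArg(kernel, 0, sizeof(cl_mem),  &dM);
 *   clSetKernelArg(kernel, 1, sizeof(cl_mem),  &dV);
 *   clSetKernelArg(kernel, 2, sizeof(cl_uint), &width);
 *   clSetKernelArg(kernel, 3, sizeof(cl_uint), &height);
 *   clSetKernelArg(kernel, 4, sizeof(cl_mem),  &dW);
 *   // Local-memory argument, coalesced kernels only:
 *   clSetKernelArg(kernel, 5, localSize * sizeof(cl_float), NULL);
 *   clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL,
 *                          &globalSize, &localSize, 0, NULL, NULL);
 */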
// Matrix-vector multiplication kernel
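// Each work-item computes one element of W by walking its own row of M, so in
// any given loop iteration adjacent work-items access locations that are
// 'width' floats apart; the loads from M are therefore not coalesced.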
__kernel void MatVecMulUncoalesced0(const __global float* M,
const __global float* V,
uint width, uint height,
__global float* W)
{
// Row index
uint y = get_global_id(0);
if (y < height) {
// Row pointer
const __global float* row = M + y * width;
// Compute dot product
float dotProduct = 0;
for (uint x = 0; x < width; ++x)
dotProduct += row[x] * V[x];
// Write result to global memory
W[y] = dotProduct;
}
}
// Matrix-vector multiplication kernel
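// Unlike MatVecMulUncoalesced0, each work-item loops over rows with a stride
// of get_global_size(0), so the kernel also works when the matrix has more
// rows than there are work-items. The loads from M are still uncoalesced.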
__kernel void MatVecMulUncoalesced1(const __global float* M,
const __global float* V,
uint width, uint height,
__global float* W)
{
// Each work-item handles as many matrix rows as necessary
for (uint y = get_global_id(0);
y < height;
y += get_global_size(0))
{
// Row pointer
const __global float* row = M + y * width;
// Compute dot product
float dotProduct = 0;
for (uint x = 0; x < width; ++x)
dotProduct += row[x] * V[x];
// Write result to global memory
W[y] = dotProduct;
}
}
// Matrix-vector multiplication kernel
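// One work-group cooperates on each row: work-items stride across the columns,
// so consecutive work-items read consecutive elements of M and the loads are
// coalesced. The per-work-item partial sums are then combined serially by
// work-item 0 through local memory.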
__kernel void MatVecMulCoalesced0(const __global float* M,
const __global float* V,
uint width, uint height,
__global float* W,
__local float* partialDotProduct)
{
// Each work-group handles as many matrix rows as necessary
for (uint y = get_group_id(0); y < height; y += get_num_groups(0)) {
// Row pointer
const __global float* row = M + y * width;
// Each work-item accumulates as many products as necessary
// into local variable "sum"
float sum = 0;
for (uint x = get_local_id(0); x < width; x += get_local_size(0))
sum += row[x] * V[x];
// Each partial dot product is stored in shared memory
partialDotProduct[get_local_id(0)] = sum;
// Synchronize to make sure each work-item is done updating
// shared memory; this is necessary because in the next step,
// the first work-item needs to read from shared memory
// the partial dot products written by the other work-items
barrier(CLK_LOCAL_MEM_FENCE);
// The first work-item in the work-group adds all partial
// dot products together and writes the result to global memory
if (get_local_id(0) == 0) {
float dotProduct = 0;
for (uint t = 0; t < get_local_size(0); ++t)
dotProduct += partialDotProduct[t];
W[y] = dotProduct;
}
// Synchronize to make sure the first work-item is done with
// reading partialDotProduct
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Matrix-vector multiplication kernel
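// Same coalesced row access as MatVecMulCoalesced0, but the serial summation
// of the partial results is replaced by a parallel tree reduction in local
// memory using interleaved addressing (element 'index' accumulates element
// 'index + stride').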
__kernel void MatVecMulCoalesced1(const __global float* M,
const __global float* V,
uint width, uint height,
__global float* W,
__local float* partialDotProduct)
{
// Each work-group handles as many matrix rows as necessary
for (uint y = get_group_id(0); y < height; y += get_num_groups(0)) {
// Row pointer
const __global float* row = M + y * width;
// Each work-item accumulates as many products as necessary
// into local variable "sum"
float sum = 0;
for (uint x = get_local_id(0); x < width; x += get_local_size(0))
sum += row[x] * V[x];
// Each partial dot product is stored in shared memory
partialDotProduct[get_local_id(0)] = sum;
// Perform parallel reduction to add each work-item's
// partial dot product together
for (uint stride = 1; stride < get_local_size(0); stride *= 2) {
// Synchronize to make sure each work-item is done updating
// shared memory; this is necessary because work-items read
// results that have been written by other work-items
barrier(CLK_LOCAL_MEM_FENCE);
// Index into the "partialDotProduct" array where
// the work-item will write during this step
uint index = 2 * stride * get_local_id(0);
// Check for valid indices
if (index < get_local_size(0)) {
// Add two elements from the "partialDotProduct" array
// and store the result in partialDotProduct[index]
partialDotProduct[index] += partialDotProduct[index + stride];
}
}
// Write the result of the reduction to global memory
if (get_local_id(0) == 0)
W[y] = partialDotProduct[0];
// Synchronize to make sure the first work-item is done with
// reading partialDotProduct
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Matrix-vector multiplication kernel
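// Same as MatVecMulCoalesced1, but the tree reduction uses sequential
// addressing: at each step the first 'stride' work-items stay active and each
// adds the element 'stride' positions above its own, keeping the active
// work-items contiguous.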
__kernel void MatVecMulCoalesced2(const __global float* M,
const __global float* V,
uint width, uint height,
__global float* W,
__local float* partialDotProduct)
{
// Each work-group handles as many matrix rows as necessary
for (uint y = get_group_id(0); y < height; y += get_num_groups(0)) {
// Row pointer
const __global float* row = M + y * width;
// Each work-item accumulates as many products as necessary
// into local variable "sum"
float sum = 0;
for (uint x = get_local_id(0); x < width; x += get_local_size(0))
sum += row[x] * V[x];
// Each partial dot product is stored in shared memory
partialDotProduct[get_local_id(0)] = sum;
// Perform parallel reduction to add each work-item's
// partial dot product together
for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2) {
// Synchronize to make sure each work-item is done updating
// shared memory; this is necessary because work-items read
// results that have been written by other work-items
barrier(CLK_LOCAL_MEM_FENCE);
// Only the first 'stride' work-items in the work-group add elements together
if (get_local_id(0) < stride) {
// Add two elements from the "partialDotProduct" array
// and store the result in partialDotProduct[get_local_id(0)]
partialDotProduct[get_local_id(0)] += partialDotProduct[get_local_id(0) + stride];
}
}
// Write the result of the reduction to global memory
if (get_local_id(0) == 0)
W[y] = partialDotProduct[0];
// Synchronize to make sure the first work-item is done with
// reading partialDotProduct
barrier(CLK_LOCAL_MEM_FENCE);
}
}
#define WARP_SIZE 32
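// Same coalesced access and tree reduction as MatVecMulCoalesced2, but the
// last levels of the reduction are unrolled and executed warp-synchronously:
// work-items within one warp are assumed to run in lockstep, so no barrier is
// needed between the unrolled steps. This relies on NVIDIA's 32-wide warps and
// on the 'volatile' local pointer, and is not guaranteed by the OpenCL
// specification on other devices.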
__kernel void MatVecMulCoalesced3(const __global float* M,
const __global float* V,
uint width, uint height,
__global float* W,
__local float* partialDotProduct)
{
// Each work-group computes multiple elements of W
for (uint y = get_group_id(0); y < height; y += get_num_groups(0)) {
const __global float* row = M + y * width;
// Each work-item accumulates as many products as necessary
// into local variable "sum"
float sum = 0;
for (uint x = get_local_id(0); x < width; x += get_local_size(0))
sum += row[x] * V[x];
// Each partial dot product is stored in shared memory
partialDotProduct[get_local_id(0)] = sum;
// Perform parallel reduction to add each work-item's
// partial dot product together
// Synchronize to make sure each work-item is done writing to
// partialDotProduct
barrier(CLK_LOCAL_MEM_FENCE);
// Thread local ID within a warp
uint id = get_local_id(0) & (WARP_SIZE - 1);
// Each warp reduces 64 consecutive elements
float warpResult = 0.0f;
if (get_local_id(0) < get_local_size(0)/2 )
{
volatile __local float* p = partialDotProduct + 2 * get_local_id(0) - id;
p[0] += p[32];
p[0] += p[16];
p[0] += p[8];
p[0] += p[4];
p[0] += p[2];
p[0] += p[1];
warpResult = p[0];
}
// Synchronize to make sure each warp is done reading
// partialDotProduct before it is overwritten in the next step
barrier(CLK_LOCAL_MEM_FENCE);
// The first thread of each warp stores the result of the reduction
// at the beginning of partialDotProduct
if (id == 0)
partialDotProduct[get_local_id(0) / WARP_SIZE] = warpResult;
// Synchronize to make sure each warp is done writing to
// partialDotProduct before it is read in the next step
barrier(CLK_LOCAL_MEM_FENCE);
// Number of remaining elements after the first reduction
uint size = get_local_size(0) / (2 * WARP_SIZE);
// get_local_size(0) is less than or equal to 512 on NVIDIA GPUs, so
// only a single warp is needed for the following last reduction
// step
if (get_local_id(0) < size / 2) {
volatile __local float* p = partialDotProduct + get_local_id(0);
if (size >= 8)
p[0] += p[4];
if (size >= 4)
p[0] += p[2];
if (size >= 2)
p[0] += p[1];
}
// Write the result of the reduction to global memory
if (get_local_id(0) == 0)
W[y] = partialDotProduct[0];
// Synchronize to make sure the first work-item is done with
// reading partialDotProduct
barrier(CLK_LOCAL_MEM_FENCE);
}
}