// myGEMM.cl
// A simpler warm-up kernel: element-wise matrix addition C = A + B
__kernel void myAdd(const int M, const int N, const int K,
                    const __global float* A,
                    const __global float* B,
                    __global float* C)
{
    // Thread identifiers
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
    // Store the result
    C[globalCol*M + globalRow] = A[globalCol*M + globalRow] + B[globalCol*M + globalRow];
}
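/***************************************************************************
 Host-side launch sketch for myAdd -- a minimal example, not part of this
 kernel file's build. It assumes `queue`, a `kernel` object built from this
 file, and buffers `bufA`, `bufB`, `bufC` already exist (all of these names
 are placeholders). One work-item computes one element of C, so the global
 size is {M, N}:

   size_t global[2] = {(size_t)M, (size_t)N}; // dim 0 = rows, dim 1 = cols
   clSetKernelArg(kernel, 0, sizeof(int), &M);
   clSetKernelArg(kernel, 1, sizeof(int), &N);
   clSetKernelArg(kernel, 2, sizeof(int), &K);
   clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufA);
   clSetKernelArg(kernel, 4, sizeof(cl_mem), &bufB);
   clSetKernelArg(kernel, 5, sizeof(cl_mem), &bufC);
   clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL,
                          0, NULL, NULL);
 ********************************************************/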
#define TS 16  // Tile size for the tiled kernel myGEMM2 (work-group is TS x TS)
/***************************************************************************
 Matrices are stored column-major: element (row r, col c) of an M-by-N
 matrix lives at index c*M + r, so a cache line runs down a column.
 cache line:
   |    @: cache hit
   V    ?: cache miss
              N
   [B]--------+--------------+
    |         |@             |
   K|         |@             |
    |         |@             |
  K +---------+--------------+
[A]-----+    [C]-----------------------+
 |      |     |          .             |
 |      |     |          .             |
 |      |     |          .             |
M|      |     |          .             |
 +------+     | .........Xi            |
 |@@@@@@|     |          Xi+1@         |
 |@@@@@@|     |          Xi+2@         |
 |      |     |          .             |
 +------+     +------------------------+
********************************************************/
// First naive implementation
__kernel void myGEMM1(const int M, const int N, const int K,
                      const __global float* A,
                      const __global float* B,
                      __global float* C) {
    // Thread identifiers
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
    // Compute a single element (loop over K)
    float acc = 0.0f;
    for (int k=0; k<K; k++) {
        acc += A[k*M + globalRow] * B[globalCol*K + k];
    }
    // Store the result
    C[globalCol*M + globalRow] = acc;
}
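/***************************************************************************
 For reference, a host-side sketch of the same computation in plain C with
 the same column-major layout -- handy for checking kernel results (the
 array names mirror the kernel arguments; this loop is not part of the
 kernel file's build):

   for (int col = 0; col < N; col++)
       for (int row = 0; row < M; row++) {
           float acc = 0.0f;
           for (int k = 0; k < K; k++)
               acc += A[k*M + row] * B[col*K + k];
           C[col*M + row] = acc;
       }
 ********************************************************/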
/***************************************************************************
 myGEMM1b is much slower than myGEMM1 because of the way OpenCL dispatches
 work-items: within a work-group, items are laid out dimension by dimension,
 first along dim 0, then dim 1 and dim 2. Work-items within a work-group
 run in parallel rather than sequentially (the power of the GPU), while
 work-groups may run sequentially or only partly in parallel.
 cache line:
   |    @: cache hit
   V    ?: cache miss
              N
   [B]--------+--------------+
    |         | @ @          |
   K|         | @ @          |
    |         | @ @          |
  K +---------+--------------+
[A]-----+    [C]-----------------------+
 |      |     |          .             |
 |      |     |          .             |
 |      |     |          .             |
M|      |     |          .             |
 +------+     | .........Xi Xi+1 Xi+2..|
 |??????|     |          ?  ?    ?     |
 |      |     |                        |
 |      |     |                        |
 +------+     +------------------------+
 So in myGEMM1, accesses to both A and C hit in cache across adjacent
 work-items, whereas in myGEMM1b only accesses to B do.
********************************************************/
__kernel void myGEMM1b(const int M, const int N, const int K,
                       const __global float* A,
                       const __global float* B,
                       __global float* C) {
    // Thread identifiers (note: dimensions swapped relative to myGEMM1)
    const int globalRow = get_global_id(1); // Row ID of C (0..M)
    const int globalCol = get_global_id(0); // Col ID of C (0..N)
    // Compute a single element (loop over K)
    float acc = 0.0f;
    for (int k=0; k<K; k++) {
        acc += A[k*M + globalRow] * B[globalCol*K + k];
    }
    // Store the result
    C[globalCol*M + globalRow] = acc;
}
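/***************************************************************************
 Launch sketch contrasting the two versions (host handles as in the myAdd
 sketch above; `gemm1` and `gemm1b` are placeholder kernel objects). The
 kernels swap which dimension indexes rows, so the global sizes must swap
 too -- the point being that dim 0 is the fastest-varying dimension across
 adjacent work-items:

   size_t globalV1[2]  = {(size_t)M, (size_t)N}; // myGEMM1:  dim 0 = row
   size_t globalV1b[2] = {(size_t)N, (size_t)M}; // myGEMM1b: dim 0 = col
   clEnqueueNDRangeKernel(queue, gemm1,  2, NULL, globalV1,  NULL,
                          0, NULL, NULL);
   clEnqueueNDRangeKernel(queue, gemm1b, 2, NULL, globalV1b, NULL,
                          0, NULL, NULL);
 ********************************************************/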
/***************************************************************************
 This is a memory-bound computation. For each element of C:
   memory access: 2K loads + 1 store
   computation:   K multiply-accumulates
 That is about 0.5 arithmetic instructions per memory access, which is very
 low, so memory access is the first thing to optimize.
 Focus on one work-group of TS-by-TS work-items: it needs a TS-by-K
 submatrix of A and a K-by-TS submatrix of B, which together far exceed the
 GPU cache capacity. So as we walk through the TS-by-TS work-items column
 by column, the submatrix of A gets evicted and reloaded many times, and
 caching A may also evict cache lines of B. Instead, let's manage memory
 access manually by synchronizing the access pattern of all work-items in
 the group.
              N
   [B]--------+---+----------+
    |         |   |          |
   K|         |   |          |
    |         |   |          |
  K +---------+---+----------+
[A]-----+    [C]-----------------------+
 |      |     |                        |
 |      |     |                        |
 |      |     |                        |
M|      |     |           TS           |
 +------+     |          +---+         |
 |      |     |        TS|   |         |
 +------+     |          +---+         |
 |      |     |                        |
 +------+     +------------------------+
********************************************************/
__kernel void myGEMM2(const int M, const int N, const int K,
                      const __global float* A,
                      const __global float* B,
                      __global float* C) {
    // Thread identifiers
    const int row = get_local_id(0); // Local row ID (max: TS)
    const int col = get_local_id(1); // Local col ID (max: TS)
    const int globalRow = TS*get_group_id(0) + row; // Row ID of C (0..M)
    const int globalCol = TS*get_group_id(1) + col; // Col ID of C (0..N)
    // Local memory to fit a tile of TS*TS elements of A and B
    __local float Asub[TS][TS];
    __local float Bsub[TS][TS];
    // Initialise the accumulation register
    float acc = 0.0f;
    // Loop over all tiles (assumes K is a multiple of TS)
    const int numTiles = K/TS;
    for (int t=0; t<numTiles; t++) {
        // Load one tile of A and B into local memory
        const int tiledRow = TS*t + row;
        const int tiledCol = TS*t + col;
        Asub[col][row] = A[tiledCol*M + globalRow];
        Bsub[col][row] = B[globalCol*K + tiledRow];
        // Synchronise to make sure the tile is loaded
        barrier(CLK_LOCAL_MEM_FENCE);
        // Perform the computation for a single tile
        for (int k=0; k<TS; k++) {
            acc += Asub[k][row] * Bsub[col][k];
        }
        // Synchronise before loading the next tile
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // Store the final result in C
    C[globalCol*M + globalRow] = acc;
}
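/***************************************************************************
 Why tiling helps, worked out: without tiling, each element of C issues 2K
 global loads. With TS-by-TS tiles, each value loaded into local memory is
 reused by TS work-items, so each work-item performs only 2*(K/TS) global
 loads -- a 16x traffic reduction for TS = 16. Host-side launch sketch
 (placeholder handles as above): the local size must be exactly {TS, TS}
 to match the tile, and this kernel assumes M, N and K are multiples of TS:

   size_t global[2] = {(size_t)M, (size_t)N}; // one work-item per element of C
   size_t local[2]  = {TS, TS};               // one work-group per tile of C
   clEnqueueNDRangeKernel(queue, gemm2, 2, NULL, global, local,
                          0, NULL, NULL);
 ********************************************************/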