GPGPUCourse · sthfaceless · Sep 14, 2024 · Oct 3, 2024
diff --git a/src/cl/matrix_multiplication.cl b/src/cl/matrix_multiplication.cl
@@ -1,27 +1,129 @@
 #ifdef __CLION_IDE__
- #include <libgpu/opencl/cl/clion_defines.cl>
+
+#include <libgpu/opencl/cl/clion_defines.cl>
+
 #endif
 
 
 #line 6
 
 // TILE_SIZE и WORK_PER_THREAD задаются через поле 'defines' в кернел конфиге
 
-__kernel void matrix_multiplication_naive()
-{
- // TODO
+__kernel void matrix_multiplication_naive(
+ __global const float *a,
+ __global const float *b,
+ __global float *out,
+ const unsigned int M,
+ const unsigned int K,
+ const unsigned int N
+) {
+ const unsigned int j = get_global_id(0);
+ const unsigned int i = get_global_id(1);
+
+ if (i >= M || j >= N)
+ return;
+
+ float sum = 0;
+ for (int k = 0; k < K; ++k) {
+ sum += a[i * K + k] * b[k * N + j];
+ }
+
+ out[i * N + j] = sum;
 }
 
 #ifdef TILE_SIZE
-__kernel void matrix_multiplication_local()
+__kernel void matrix_multiplication_local(
+ __global const float *a,
+ __global const float *b,
+ __global float *out,
+ const unsigned int M,
+ const unsigned int K,
+ const unsigned int N
+ )
 {
- // TODO
+ const unsigned int j = get_global_id(0);
+ const unsigned int i = get_global_id(1);
+
+ const unsigned int jj = get_local_id(0);
+ const unsigned int ii = get_local_id(1);
+
+ __local float aa[TILE_SIZE][TILE_SIZE];
+ __local float bb[TILE_SIZE][TILE_SIZE];
+ float sum = 0;
+ for (int tile = 0; tile * TILE_SIZE < K; ++tile) {
+
+ if (i < M && (tile * TILE_SIZE + jj) < K)
+ aa[ii][jj] = a[i * K + (tile * TILE_SIZE + jj)];
+ else
+ aa[ii][jj] = 0;
+
+ if ((tile * TILE_SIZE + ii) < K && j < N)
+ bb[ii][jj] = b[(tile * TILE_SIZE + ii) * N + j];
+ else
+ bb[ii][jj] = 0;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int k = 0; k < TILE_SIZE; ++k) {
+ sum += aa[ii][k] * bb[k][jj];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if (i < M && j < N)
+ out[i * N + j] = sum;
 }
 #endif
 
 #if defined(TILE_SIZE) && defined(WORK_PER_THREAD)
-__kernel void matrix_multiplication_local_wpt()
-{
- // TODO
+__kernel void matrix_multiplication_local_wpt(
+ __global const float *a,
+ __global const float *b,
+ __global float *out,
+ const unsigned int M,
+ const unsigned int K,
+ const unsigned int N
+) {
+ const unsigned int j = get_global_id(0);
+ const unsigned int i = get_global_id(1) * WORK_PER_THREAD;
+
+ const unsigned int jj = get_local_id(0);
+ const unsigned int ii = get_local_id(1) * WORK_PER_THREAD;
+
+ float sum[WORK_PER_THREAD];
+ for (int q = 0; q < WORK_PER_THREAD; ++q) {
+ sum[q] = 0;
+ }
+
+ __local float aa[TILE_SIZE][TILE_SIZE];
+ __local float bb[TILE_SIZE][TILE_SIZE];
+
+ for (int tile = 0; tile * TILE_SIZE < K; ++tile) {
+
+ for (int q = 0; q < WORK_PER_THREAD; ++q) {
+ if (i + q < M && (tile * TILE_SIZE + jj) < K)
+ aa[ii + q][jj] = a[(i + q) * K + (tile * TILE_SIZE + jj)];
+ else
+ aa[ii + q][jj] = 0;
+
+ if ((tile * TILE_SIZE + ii + q) < K && j < N)
+ bb[ii + q][jj] = b[(tile * TILE_SIZE + ii + q) * N + j];
+ else
+ bb[ii + q][jj] = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (int k = 0; k < TILE_SIZE; ++k) {
+ const float b_cache = bb[k][jj];
+ for (int q = 0; q < WORK_PER_THREAD; ++q) {
+ sum[q] += aa[ii + q][k] * b_cache;
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ for (int q = 0; q < WORK_PER_THREAD; ++q) {
+ if (i + q < M && j < N)
+ out[(i + q) * N + j] = sum[q];
+ }
 }
 #endif
diff --git a/src/cl/matrix_transpose.cl b/src/cl/matrix_transpose.cl
@@ -1,21 +1,80 @@
 #ifdef __CLION_IDE__
- #include <libgpu/opencl/cl/clion_defines.cl>
+
+#include <libgpu/opencl/cl/clion_defines.cl>
+
 #endif
 
 
 #line 6
 
-__kernel void matrix_transpose_naive()
-{
- // TODO
+__kernel void matrix_transpose_naive(
+ __global const float *a,
+ __global float *at,
+ const unsigned int M,
+ const unsigned int K
+) {
+ const unsigned int j = get_global_id(0);
+ const unsigned int i = get_global_id(1);
+ if (i < M && j < K)
+ at[j * M + i] = a[i * K + j];
 }
 
-__kernel void matrix_transpose_local_bad_banks()
-{
- // TODO
+#define TILE_SIZE 16
+__kernel void matrix_transpose_local_bad_banks(
+ __global const float *a,
+ __global float *at,
+ const unsigned int M,
+ const unsigned int K
+) {
+ const unsigned int j = get_global_id(0);
+ const unsigned int i = get_global_id(1);
+
+ const unsigned int jj = get_local_id(0);
+ const unsigned int ii = get_local_id(1);
+
+ __local float cache[TILE_SIZE][TILE_SIZE];
+
+ if (i < M && j < K)
+ cache[ii][jj] = a[i * K + j];
+ else
+ cache[ii][jj] = 0;
+// почему то оператор ниже не работает, если выключить верхний if :(
+// cache[ii][jj] = (i < M && j < K) ? a[i * K + j] : 0;
+ float value = cache[ii][jj];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (jj < ii){
+ cache[ii][jj] = cache[jj][ii];
+ cache[jj][ii] = value;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ const unsigned int target_j = get_group_id(0) * TILE_SIZE + ii;
+ const unsigned int target_i = get_group_id(1) * TILE_SIZE + jj;
+ if (target_i < M && target_j < K)
+ at[target_j * M + target_i] = cache[ii][jj];
 }
 
-__kernel void matrix_transpose_local_good_banks()
-{
- // TODO
+__kernel void matrix_transpose_local_good_banks(
+ __global const float *a,
+ __global float *at,
+ const unsigned int M,
+ const unsigned int K
+
+) {
+ const unsigned int j = get_global_id(0);
+ const unsigned int i = get_global_id(1);
+
+ const unsigned int jj = get_local_id(0);
+ const unsigned int ii = get_local_id(1);
+
+ __local float cache[TILE_SIZE][TILE_SIZE];
+
+ float value = (i < M && j < K) ? a[i * K + j] : 0;
+ cache[jj][(ii + jj) % TILE_SIZE] = value;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ const unsigned int target_j = get_group_id(0) * TILE_SIZE + ii;
+ const unsigned int target_i = get_group_id(1) * TILE_SIZE + jj;
+ if (target_i < M && target_j < K)
+ at[target_j * M + target_i] = cache[ii][(ii + jj) % TILE_SIZE];
 }
diff --git a/src/main_matrix_multiplication.cpp b/src/main_matrix_multiplication.cpp
@@ -10,11 +10,11 @@
 #include <iostream>
 #include <stdexcept>
 
-const int benchmarkingIters = 10;
+const int benchmarkingIters = 100;
 const int benchmarkingItersCPU = 1;
-const unsigned int M = 1024;
-const unsigned int K = 1024;
-const unsigned int N = 1024;
+const unsigned int M = 1024 + 1;
+const unsigned int K = 1024 + 13;
+const unsigned int N = 1024 + 3;
 const size_t gflops = ((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // умножить на два, т.к. операция сложения и умножения
 
 std::vector<float> computeCPU(const float *as, const float *bs)
@@ -50,29 +50,28 @@ struct KernelConfig {
 
 KernelConfig makeNaiveConfig(unsigned int tile_size)
 {
- throw std::runtime_error("not implemented");
  std::string kernel_name = "matrix_multiplication_naive";
- gpu::WorkSize work_size(0, 0/*TODO*/);
+ gpu::WorkSize work_size(tile_size, tile_size, (N + tile_size - 1) / tile_size * tile_size, (M + tile_size - 1) / tile_size * tile_size);
  std::string defines;
  std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
  return KernelConfig{kernel_name, work_size, defines, prefix};
 }
 
 KernelConfig makeLocalConfig(unsigned int tile_size)
 {
- throw std::runtime_error("not implemented");
  std::string kernel_name = "matrix_multiplication_local";
- gpu::WorkSize work_size(0, 0/*TODO*/);
+ gpu::WorkSize work_size(tile_size, tile_size, (N + tile_size - 1) / tile_size * tile_size, (M + tile_size - 1) / tile_size * tile_size);
  std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
  std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
  return KernelConfig{kernel_name, work_size, defines, prefix};
 }
 
 KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
 {
- throw std::runtime_error("not implemented");
  std::string kernel_name = "matrix_multiplication_local_wpt";
- gpu::WorkSize work_size(0, 0/*TODO*/);
+ unsigned int workGroupX = tile_size;
+ unsigned int workGroupY = tile_size / wpt;
+ gpu::WorkSize work_size(workGroupX, workGroupY, (N + workGroupX - 1) / workGroupX * workGroupX, (M + tile_size - 1) / tile_size * tile_size / wpt);
  std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
  std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
  return KernelConfig{kernel_name, work_size, defines, prefix};
@@ -143,8 +142,6 @@ int main(int argc, char **argv)
 
  const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());
 
- // TODO uncomment
- return 0;
 
  runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
  runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
@@ -159,5 +156,6 @@ int main(int argc, char **argv)
  if (wpt <= tile_size)
  runTest(makeLocalWPTConfig(tile_size, wpt), as.data(), bs.data(), cs_cpu_reference.data());
 
+
  return 0;
 }
diff --git a/src/main_matrix_transpose.cpp b/src/main_matrix_transpose.cpp
@@ -10,9 +10,9 @@
 #include <iostream>
 #include <stdexcept>
 
-const int benchmarkingIters = 100;
-const unsigned int M = 4096;
-const unsigned int K = 4096;
+const int benchmarkingIters = 1000;
+const unsigned int M = 4096 + 3;
+const unsigned int K = 4096 + 13;
 
 void runTest(const std::string &kernel_name, const float *as)
 {
@@ -33,9 +33,10 @@ void runTest(const std::string &kernel_name, const float *as)
  // поставьте каретку редактирования кода внутри скобок конструктора WorkSize -> Ctrl+P -> заметьте что есть 2, 4 и 6 параметров
  // - для 1D, 2D и 3D рабочего пространства соответственно
 
- // TODO uncomment
-// gpu::WorkSize work_size(0, 0, 0, 0 /*TODO*/);
-// matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);
+
+ unsigned int blockSize = 16;
+ gpu::WorkSize work_size(blockSize, blockSize, (K + blockSize - 1) / blockSize * blockSize, (M + blockSize - 1) / blockSize * blockSize);
+ matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);
 
  t.nextLap();
  }
@@ -74,9 +75,6 @@ int main(int argc, char **argv)
  }
  std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;
 
- // TODO uncomment
- return 0;
-
  runTest("matrix_transpose_naive", as.data());
  runTest("matrix_transpose_local_bad_banks", as.data());
  runTest("matrix_transpose_local_good_banks", as.data());