Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Task04 Кудрявцев Федор HSE #147

Open
wants to merge 1 commit into
base: task04
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 65 additions & 10 deletions src/cl/matrix_multiplication.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,76 @@

// TILE_SIZE and WORK_PER_THREAD are supplied through the 'defines' field of the kernel config

// Naive matrix product c = a * b, one work-item per element of c.
//
// All three buffers are indexed row-major: a has K columns, b and c have N columns.
// get_global_id(0) selects the column of c, get_global_id(1) selects its row.
//
// M, K, N - matrix extents: a is read as M x K, b as K x N, c is written as M x N.
__kernel void matrix_multiplication_naive(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K, unsigned int N) {
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    // The launch grid may be larger than the matrix (e.g. when the global size is
    // rounded up to a multiple of the work-group size), so surplus work-items must
    // neither read nor write. No barrier is used here, so returning early is safe.
    if (col >= N || row >= M)
        return;

    float sum = 0.0f;
    for (unsigned int k = 0; k < K; ++k) {
        sum += a[row * K + k] * b[k * N + col];
    }
    c[row * N + col] = sum;
}

#ifdef TILE_SIZE
// Tiled matrix product c = a * b that stages TILE_SIZE x TILE_SIZE blocks of a and b
// in local memory before multiplying them.
//
// Must be launched with work-groups of exactly TILE_SIZE x TILE_SIZE work-items.
// get_global_id(0) selects the row of c, get_global_id(1) selects its column
// (note: this is the opposite of matrix_multiplication_naive).
//
// M, K, N - matrix extents: a is read as M x K, b as K x N, c is written as M x N,
//           all row-major.
__kernel void matrix_multiplication_local(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K, unsigned int N) {
    const unsigned int row = get_global_id(0);
    const unsigned int col = get_global_id(1);
    const unsigned int local_row = get_local_id(0);
    const unsigned int local_col = get_local_id(1);

    __local float tileA[TILE_SIZE][TILE_SIZE];
    __local float tileB[TILE_SIZE][TILE_SIZE];

    float sum = 0.0f;
    for (unsigned int tile_start = 0; tile_start < K; tile_start += TILE_SIZE) {
        const unsigned int a_col = tile_start + local_col;
        const unsigned int b_row = tile_start + local_row;

        // Out-of-range positions are padded with zeros instead of being skipped:
        // every work-item has to stay alive to reach both barriers below, and the
        // zero products do not change the sums of in-range elements.
        tileA[local_row][local_col] = (row < M && a_col < K) ? a[row * K + a_col] : 0.0f;
        tileB[local_row][local_col] = (b_row < K && col < N) ? b[b_row * N + col] : 0.0f;

        // Both tiles must be completely written before anybody reads them.
        barrier(CLK_LOCAL_MEM_FENCE);

        for (unsigned int k = 0; k < TILE_SIZE; ++k) {
            sum += tileA[local_row][k] * tileB[k][local_col];
        }

        // Nobody may overwrite the tiles while a neighbour is still reading them.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (row < M && col < N)
        c[row * N + col] = sum;
}
#endif

#if defined(TILE_SIZE) && defined(WORK_PER_THREAD)
// Tiled matrix product c = a * b where every work-item produces WORK_PER_THREAD
// vertically adjacent elements of c.
//
// Must be launched with work-groups of (TILE_SIZE / WORK_PER_THREAD) x TILE_SIZE
// work-items. get_global_id(0) selects a strip of WORK_PER_THREAD consecutive rows
// of c, get_global_id(1) selects the column.
//
// M, K, N - matrix extents: a is read as M x K, b as K x N, c is written as M x N,
//           all row-major.
__kernel void matrix_multiplication_local_wpt(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K, unsigned int N) {
    const unsigned int col = get_global_id(1);
    const unsigned int local_col = get_local_id(1);
    // First row handled by this work-item, in the matrix and inside the tile.
    const unsigned int first_row = get_global_id(0) * WORK_PER_THREAD;
    const unsigned int first_local_row = get_local_id(0) * WORK_PER_THREAD;

    __local float tileA[TILE_SIZE][TILE_SIZE];
    __local float tileB[TILE_SIZE][TILE_SIZE];

    float sum[WORK_PER_THREAD];
    for (unsigned int w = 0; w < WORK_PER_THREAD; ++w) {
        sum[w] = 0.0f;
    }

    for (unsigned int tile_start = 0; tile_start < K; tile_start += TILE_SIZE) {
        // Each work-item fills WORK_PER_THREAD rows of both tiles. Out-of-range
        // positions are padded with zeros so that all work-items reach the barriers
        // and the padding does not affect in-range sums.
        const unsigned int a_col = tile_start + local_col;
        for (unsigned int w = 0; w < WORK_PER_THREAD; ++w) {
            const unsigned int tile_row = first_local_row + w;
            const unsigned int a_row = first_row + w;
            const unsigned int b_row = tile_start + tile_row;
            tileA[tile_row][local_col] = (a_row < M && a_col < K) ? a[a_row * K + a_col] : 0.0f;
            tileB[tile_row][local_col] = (b_row < K && col < N) ? b[b_row * N + col] : 0.0f;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        for (unsigned int k = 0; k < TILE_SIZE; ++k) {
            // The element of b is shared by all rows of the strip, so read it once.
            const float b_value = tileB[k][local_col];
            for (unsigned int w = 0; w < WORK_PER_THREAD; ++w) {
                sum[w] += tileA[first_local_row + w][k] * b_value;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    for (unsigned int w = 0; w < WORK_PER_THREAD; ++w) {
        const unsigned int row = first_row + w;
        if (row < M && col < N)
            c[row * N + col] = sum[w];
    }
}
#endif
50 changes: 43 additions & 7 deletions src/cl/matrix_transpose.cl
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,56 @@
#include <libgpu/opencl/cl/clion_defines.cl>
#endif


#define TILE_SIZE 16
#line 6

// Transposes a into at, one work-item per element, without local memory.
//
// a is read row-major with k columns, at is written row-major with m columns.
// get_global_id(0) selects the row of a, get_global_id(1) selects its column.
//
// m, k - number of rows and columns of a (so at is k x m).
__kernel void matrix_transpose_naive(__global float *a, __global float *at, unsigned int m, unsigned int k)
{
    const unsigned int row = get_global_id(0);
    const unsigned int col = get_global_id(1);

    // row indexes the m rows of a and col its k columns; comparing them the other
    // way round is only correct for square matrices.
    if (row >= m || col >= k)
        return;

    at[col * m + row] = a[row * k + col];
}

// Transposes a into at through a TILE_SIZE x TILE_SIZE local-memory tile.
//
// Must be launched with work-groups of exactly TILE_SIZE x TILE_SIZE work-items.
// get_global_id(0) selects the row of a, get_global_id(1) selects its column.
// The tile is written so that work-items adjacent along dimension 0 touch adjacent
// addresses of a, and read back with a stride of TILE_SIZE floats - the local-memory
// access pattern this kernel is named after.
//
// m, k - number of rows and columns of a (so at is k x m), both row-major.
__kernel void matrix_transpose_local_bad_banks(__global float *a, __global float *at, unsigned int m, unsigned int k)
{
    const unsigned int row = get_global_id(0);
    const unsigned int col = get_global_id(1);
    const unsigned int local_row = get_local_id(0);
    const unsigned int local_col = get_local_id(1);

    __local float tile[TILE_SIZE][TILE_SIZE];

    // Top-left corner (in a) of the block handled by this work-group.
    const unsigned int block_row = row - local_row;
    const unsigned int block_col = col - local_col;

    // Load phase: local ids are swapped, i.e. this work-item fetches
    // a[block_row + local_col][block_col + local_row], so that neighbours along
    // dimension 0 read consecutive elements of one row of a.
    const unsigned int src_row = block_row + local_col;
    const unsigned int src_col = block_col + local_row;
    if (src_row < m && src_col < k)
        tile[local_col][local_row] = a[src_row * k + src_col];

    // Out-of-range work-items are guarded rather than returned early because
    // every work-item of the group has to reach the barrier.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Store phase: tile[r][c] holds a[block_row + r][block_col + c], so the
    // element (row, col) owned by this work-item is tile[local_row][local_col].
    if (row < m && col < k)
        at[col * m + row] = tile[local_row][local_col];
}

// Transposes a into at through a local-memory tile whose rows are padded by one
// float, so that walking down a tile column uses a stride of TILE_SIZE + 1 floats
// instead of TILE_SIZE.
//
// Must be launched with work-groups of exactly TILE_SIZE x TILE_SIZE work-items.
// get_global_id(0) selects the row of a, get_global_id(1) selects its column.
//
// m, k - number of rows and columns of a (so at is k x m), both row-major.
__kernel void matrix_transpose_local_good_banks(__global float *a, __global float *at, unsigned int m, unsigned int k)
{
    const unsigned int row = get_global_id(0);
    const unsigned int col = get_global_id(1);
    const unsigned int local_row = get_local_id(0);
    const unsigned int local_col = get_local_id(1);

    // The extra column is never read; it only shifts each tile row by one float.
    __local float tile[TILE_SIZE][TILE_SIZE + 1];

    // Top-left corner (in a) of the block handled by this work-group.
    const unsigned int block_row = row - local_row;
    const unsigned int block_col = col - local_col;

    // Load phase: local ids are swapped, i.e. this work-item fetches
    // a[block_row + local_col][block_col + local_row], so that neighbours along
    // dimension 0 read consecutive elements of one row of a.
    const unsigned int src_row = block_row + local_col;
    const unsigned int src_col = block_col + local_row;
    if (src_row < m && src_col < k)
        tile[local_col][local_row] = a[src_row * k + src_col];

    // Out-of-range work-items are guarded rather than returned early because
    // every work-item of the group has to reach the barrier.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Store phase: tile[r][c] holds a[block_row + r][block_col + c], so the
    // element (row, col) owned by this work-item is tile[local_row][local_col].
    if (row < m && col < k)
        at[col * m + row] = tile[local_row][local_col];
}
11 changes: 3 additions & 8 deletions src/main_matrix_multiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,29 +50,26 @@ struct KernelConfig {

// Builds the launch configuration for the "matrix_multiplication_naive" kernel.
//
// tile_size - edge of the square work-group; also reported in the log prefix.
//
// The kernel needs no compile-time defines, so `defines` stays empty.
KernelConfig makeNaiveConfig(unsigned int tile_size)
{
    std::string kernel_name = "matrix_multiplication_naive";
    // The naive kernel uses dimension 0 as the column of the result and dimension 1
    // as its row, so the global size is (columns, rows) = (N, M).
    // NOTE(review): assumes the file-level M and N are the values passed to the
    // kernel's M and N arguments - confirm against runTest.
    gpu::WorkSize work_size(tile_size, tile_size, N, M);
    std::string defines;
    std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
    return KernelConfig{kernel_name, work_size, defines, prefix};
}

// Builds the launch configuration for the "matrix_multiplication_local" kernel.
//
// tile_size - edge of the square work-group; forwarded to the kernel as the
//             TILE_SIZE define and reported in the log prefix.
KernelConfig makeLocalConfig(unsigned int tile_size)
{
    std::string kernel_name = "matrix_multiplication_local";
    // The local kernel uses dimension 0 as the row of the result and dimension 1 as
    // its column, so the global size is (rows, columns) = (M, N).
    gpu::WorkSize work_size(tile_size, tile_size, M, N);
    std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
    std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
    return KernelConfig{kernel_name, work_size, defines, prefix};
}

KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
{
throw std::runtime_error("not implemented");
std::string kernel_name = "matrix_multiplication_local_wpt";
gpu::WorkSize work_size(0, 0/*TODO*/);
gpu::WorkSize work_size(tile_size / wpt, tile_size, M /wpt, N);
std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
Expand Down Expand Up @@ -143,8 +140,6 @@ int main(int argc, char **argv)

const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());

// TODO uncomment
return 0;

runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
Expand Down
10 changes: 4 additions & 6 deletions src/main_matrix_transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include <iostream>
#include <stdexcept>

#define TILE_SIZE 16

const int benchmarkingIters = 100;
const unsigned int M = 4096;
const unsigned int K = 4096;
Expand All @@ -33,9 +35,8 @@ void runTest(const std::string &kernel_name, const float *as)
// put the editing caret inside the parentheses of the WorkSize constructor -> Ctrl+P -> note that it accepts 2, 4 or 6 parameters
// - for a 1D, 2D and 3D work space respectively

// TODO uncomment
// gpu::WorkSize work_size(0, 0, 0, 0 /*TODO*/);
// matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);
gpu::WorkSize work_size(TILE_SIZE, TILE_SIZE, M, K);
matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);

t.nextLap();
}
Expand Down Expand Up @@ -74,9 +75,6 @@ int main(int argc, char **argv)
}
std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;

// TODO uncomment
return 0;

runTest("matrix_transpose_naive", as.data());
runTest("matrix_transpose_local_bad_banks", as.data());
runTest("matrix_transpose_local_good_banks", as.data());
Expand Down
Loading