Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Task04 Ельцов Данил HSE #137

Open
wants to merge 2 commits into
base: task04
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 111 additions & 9 deletions src/cl/matrix_multiplication.cl
Original file line number Diff line number Diff line change
@@ -1,27 +1,129 @@
#ifdef __CLION_IDE__
#include <libgpu/opencl/cl/clion_defines.cl>

#include <libgpu/opencl/cl/clion_defines.cl>

#endif


#line 6

// TILE_SIZE и WORK_PER_THREAD задаются через поле 'defines' в кернел конфиге

__kernel void matrix_multiplication_naive()
{
// TODO
__kernel void matrix_multiplication_naive(
__global const float *a,
__global const float *b,
__global float *out,
const unsigned int M,
const unsigned int K,
const unsigned int N
) {
const unsigned int j = get_global_id(0);
const unsigned int i = get_global_id(1);

if (i >= M || j >= N)
return;

float sum = 0;
for (int k = 0; k < K; ++k) {
sum += a[i * K + k] * b[k * N + j];
}

out[i * N + j] = sum;
}

#ifdef TILE_SIZE
__kernel void matrix_multiplication_local()
__kernel void matrix_multiplication_local(
__global const float *a,
__global const float *b,
__global float *out,
const unsigned int M,
const unsigned int K,
const unsigned int N
)
{
// TODO
const unsigned int j = get_global_id(0);
const unsigned int i = get_global_id(1);

const unsigned int jj = get_local_id(0);
const unsigned int ii = get_local_id(1);

__local float aa[TILE_SIZE][TILE_SIZE];
__local float bb[TILE_SIZE][TILE_SIZE];
float sum = 0;
for (int tile = 0; tile * TILE_SIZE < K; ++tile) {

if (i < M && (tile * TILE_SIZE + jj) < K)
aa[ii][jj] = a[i * K + (tile * TILE_SIZE + jj)];
else
aa[ii][jj] = 0;

if ((tile * TILE_SIZE + ii) < K && j < N)
bb[ii][jj] = b[(tile * TILE_SIZE + ii) * N + j];
else
bb[ii][jj] = 0;

barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 0; k < TILE_SIZE; ++k) {
sum += aa[ii][k] * bb[k][jj];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (i < M && j < N)
out[i * N + j] = sum;
}
#endif

#if defined(TILE_SIZE) && defined(WORK_PER_THREAD)
__kernel void matrix_multiplication_local_wpt()
{
// TODO
__kernel void matrix_multiplication_local_wpt(
__global const float *a,
__global const float *b,
__global float *out,
const unsigned int M,
const unsigned int K,
const unsigned int N
) {
const unsigned int j = get_global_id(0);
const unsigned int i = get_global_id(1) * WORK_PER_THREAD;

const unsigned int jj = get_local_id(0);
const unsigned int ii = get_local_id(1) * WORK_PER_THREAD;

float sum[WORK_PER_THREAD];
for (int q = 0; q < WORK_PER_THREAD; ++q) {
sum[q] = 0;
}

__local float aa[TILE_SIZE][TILE_SIZE];
__local float bb[TILE_SIZE][TILE_SIZE];

for (int tile = 0; tile * TILE_SIZE < K; ++tile) {

for (int q = 0; q < WORK_PER_THREAD; ++q) {
if (i + q < M && (tile * TILE_SIZE + jj) < K)
aa[ii + q][jj] = a[(i + q) * K + (tile * TILE_SIZE + jj)];
else
aa[ii + q][jj] = 0;

if ((tile * TILE_SIZE + ii + q) < K && j < N)
bb[ii + q][jj] = b[(tile * TILE_SIZE + ii + q) * N + j];
else
bb[ii + q][jj] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);

for (int k = 0; k < TILE_SIZE; ++k) {
const float b_cache = bb[k][jj];
for (int q = 0; q < WORK_PER_THREAD; ++q) {
sum[q] += aa[ii + q][k] * b_cache;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}

for (int q = 0; q < WORK_PER_THREAD; ++q) {
if (i + q < M && j < N)
out[(i + q) * N + j] = sum[q];
}
}
#endif
79 changes: 69 additions & 10 deletions src/cl/matrix_transpose.cl
Original file line number Diff line number Diff line change
@@ -1,21 +1,80 @@
#ifdef __CLION_IDE__
#include <libgpu/opencl/cl/clion_defines.cl>

#include <libgpu/opencl/cl/clion_defines.cl>

#endif


#line 6

__kernel void matrix_transpose_naive()
{
// TODO
__kernel void matrix_transpose_naive(
__global const float *a,
__global float *at,
const unsigned int M,
const unsigned int K
) {
const unsigned int j = get_global_id(0);
const unsigned int i = get_global_id(1);
if (i < M && j < K)
at[j * M + i] = a[i * K + j];
}

__kernel void matrix_transpose_local_bad_banks()
{
// TODO
#define TILE_SIZE 16
__kernel void matrix_transpose_local_bad_banks(
__global const float *a,
__global float *at,
const unsigned int M,
const unsigned int K
) {
const unsigned int j = get_global_id(0);
const unsigned int i = get_global_id(1);

const unsigned int jj = get_local_id(0);
const unsigned int ii = get_local_id(1);

__local float cache[TILE_SIZE][TILE_SIZE];

if (i < M && j < K)
cache[ii][jj] = a[i * K + j];
else
cache[ii][jj] = 0;
// почему то оператор ниже не работает, если выключить верхний if :(
// cache[ii][jj] = (i < M && j < K) ? a[i * K + j] : 0;
float value = cache[ii][jj];
barrier(CLK_LOCAL_MEM_FENCE);
if (jj < ii){
cache[ii][jj] = cache[jj][ii];
cache[jj][ii] = value;
}
barrier(CLK_LOCAL_MEM_FENCE);

const unsigned int target_j = get_group_id(0) * TILE_SIZE + ii;
const unsigned int target_i = get_group_id(1) * TILE_SIZE + jj;
if (target_i < M && target_j < K)
at[target_j * M + target_i] = cache[ii][jj];
}

__kernel void matrix_transpose_local_good_banks()
{
// TODO
__kernel void matrix_transpose_local_good_banks(
__global const float *a,
__global float *at,
const unsigned int M,
const unsigned int K

) {
const unsigned int j = get_global_id(0);
const unsigned int i = get_global_id(1);

const unsigned int jj = get_local_id(0);
const unsigned int ii = get_local_id(1);

__local float cache[TILE_SIZE][TILE_SIZE];

float value = (i < M && j < K) ? a[i * K + j] : 0;
cache[jj][(ii + jj) % TILE_SIZE] = value;
barrier(CLK_LOCAL_MEM_FENCE);

const unsigned int target_j = get_group_id(0) * TILE_SIZE + ii;
const unsigned int target_i = get_group_id(1) * TILE_SIZE + jj;
if (target_i < M && target_j < K)
at[target_j * M + target_i] = cache[ii][(ii + jj) % TILE_SIZE];
}
22 changes: 10 additions & 12 deletions src/main_matrix_multiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
#include <iostream>
#include <stdexcept>

const int benchmarkingIters = 10;
const int benchmarkingIters = 100;
const int benchmarkingItersCPU = 1;
const unsigned int M = 1024;
const unsigned int K = 1024;
const unsigned int N = 1024;
const unsigned int M = 1024 + 1;
const unsigned int K = 1024 + 13;
const unsigned int N = 1024 + 3;
const size_t gflops = ((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // умножить на два, т.к. операция сложения и умножения

std::vector<float> computeCPU(const float *as, const float *bs)
Expand Down Expand Up @@ -50,29 +50,28 @@ struct KernelConfig {

KernelConfig makeNaiveConfig(unsigned int tile_size)
{
throw std::runtime_error("not implemented");
std::string kernel_name = "matrix_multiplication_naive";
gpu::WorkSize work_size(0, 0/*TODO*/);
gpu::WorkSize work_size(tile_size, tile_size, (N + tile_size - 1) / tile_size * tile_size, (M + tile_size - 1) / tile_size * tile_size);
std::string defines;
std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
}

KernelConfig makeLocalConfig(unsigned int tile_size)
{
throw std::runtime_error("not implemented");
std::string kernel_name = "matrix_multiplication_local";
gpu::WorkSize work_size(0, 0/*TODO*/);
gpu::WorkSize work_size(tile_size, tile_size, (N + tile_size - 1) / tile_size * tile_size, (M + tile_size - 1) / tile_size * tile_size);
std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
}

KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
{
throw std::runtime_error("not implemented");
std::string kernel_name = "matrix_multiplication_local_wpt";
gpu::WorkSize work_size(0, 0/*TODO*/);
unsigned int workGroupX = tile_size;
unsigned int workGroupY = tile_size / wpt;
gpu::WorkSize work_size(workGroupX, workGroupY, (N + workGroupX - 1) / workGroupX * workGroupX, (M + tile_size - 1) / tile_size * tile_size / wpt);
std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
Expand Down Expand Up @@ -143,8 +142,6 @@ int main(int argc, char **argv)

const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());

// TODO uncomment
return 0;

runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
Expand All @@ -159,5 +156,6 @@ int main(int argc, char **argv)
if (wpt <= tile_size)
runTest(makeLocalWPTConfig(tile_size, wpt), as.data(), bs.data(), cs_cpu_reference.data());


return 0;
}
16 changes: 7 additions & 9 deletions src/main_matrix_transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
#include <iostream>
#include <stdexcept>

const int benchmarkingIters = 100;
const unsigned int M = 4096;
const unsigned int K = 4096;
const int benchmarkingIters = 1000;
const unsigned int M = 4096 + 3;
const unsigned int K = 4096 + 13;

void runTest(const std::string &kernel_name, const float *as)
{
Expand All @@ -33,9 +33,10 @@ void runTest(const std::string &kernel_name, const float *as)
// поставьте каретку редактирования кода внутри скобок конструктора WorkSize -> Ctrl+P -> заметьте что есть 2, 4 и 6 параметров
// - для 1D, 2D и 3D рабочего пространства соответственно

// TODO uncomment
// gpu::WorkSize work_size(0, 0, 0, 0 /*TODO*/);
// matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);

unsigned int blockSize = 16;
gpu::WorkSize work_size(blockSize, blockSize, (K + blockSize - 1) / blockSize * blockSize, (M + blockSize - 1) / blockSize * blockSize);
matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);

t.nextLap();
}
Expand Down Expand Up @@ -74,9 +75,6 @@ int main(int argc, char **argv)
}
std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;

// TODO uncomment
return 0;

runTest("matrix_transpose_naive", as.data());
runTest("matrix_transpose_local_bad_banks", as.data());
runTest("matrix_transpose_local_good_banks", as.data());
Expand Down
Loading