Task04 Никита Вербин SPbSU #134

Open · wants to merge 11 commits into base: task04
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@
build
cmake-build*
.vs
.vscode
87 changes: 78 additions & 9 deletions src/cl/matrix_multiplication.cl
@@ -7,21 +7,90 @@

// TILE_SIZE and WORK_PER_THREAD are passed in through the 'defines' field of the kernel config
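// (e.g. the host compiles this file with "-DTILE_SIZE=16 -DWORK_PER_THREAD=4";
// see the make*Config functions in src/main_matrix_multiplication.cpp)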

__kernel void matrix_multiplication_naive(__global float *a, __global float *b, __global float *c, unsigned int M,
                                          unsigned int K, unsigned int N) {
    int i = get_global_id(0);
    int j = get_global_id(1);

    if (i >= N || j >= M)
        return;
    float sum = 0.f;
    for (int k = 0; k < K; ++k)
        sum += a[j * K + k] * b[k * N + i];
    c[j * N + i] = sum;
}
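
// In the naive kernel, get_global_id(0) runs along the columns of c (width N) and
// get_global_id(1) along its rows (height M); all matrices are stored row-major.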

#define access(array, i, j) (array)[(i)][((i) + (j)) % TILE_SIZE]
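// The cyclic shift stores logical element (i, j) at physical column ((i) + (j)) % TILE_SIZE,
// so a logical column (fixed j, varying i) is spread over TILE_SIZE different physical columns
// instead of one, which avoids local-memory bank conflicts without padding the tile.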

#ifdef TILE_SIZE
__kernel void matrix_multiplication_local(__global float *a, __global float *b, __global float *c, unsigned int M,
                                          unsigned int K, unsigned int N) {
    int global_i = get_global_id(0);
    int global_j = get_global_id(1);

    int local_i = get_local_id(0);
    int local_j = get_local_id(1);

    __local float tileA[TILE_SIZE][TILE_SIZE];
    __local float tileB[TILE_SIZE][TILE_SIZE];
    float sum = 0.f;
    for (int tileK = 0; tileK * TILE_SIZE < K; ++tileK) {
        if (global_j < M && (tileK * TILE_SIZE + local_i) < K)
            access(tileA, local_j, local_i) = a[global_j * K + tileK * TILE_SIZE + local_i];
        else
            access(tileA, local_j, local_i) = 0.f;
        if (global_i < N && (tileK * TILE_SIZE + local_j) < K)
            tileB[local_j][local_i] = b[global_i + (tileK * TILE_SIZE + local_j) * N];
        else
            tileB[local_j][local_i] = 0.f;
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int k = 0; k < TILE_SIZE; ++k) {
            sum += access(tileA, local_j, k) * tileB[k][local_i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (global_j < M && global_i < N)
        c[global_j * N + global_i] = sum;
}
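
// The tile loop above runs ceil(K / TILE_SIZE) times; out-of-range elements are loaded as
// zeros, so the inner product needs no tail handling at the matrix borders.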
#endif

#if defined(TILE_SIZE) && defined(WORK_PER_THREAD)
__kernel void matrix_multiplication_local_wpt(__global float *a, __global float *b, __global float *c, unsigned int M,
                                              unsigned int K, unsigned int N) {
    int global_i = get_global_id(0);

    int local_i = get_local_id(0);
    int local_j = get_local_id(1);

    int idY = get_group_id(1);

    __local float tileA[TILE_SIZE][TILE_SIZE];
    __local float tileB[TILE_SIZE][TILE_SIZE];
    float sum[WORK_PER_THREAD];
    for (int w = 0; w < WORK_PER_THREAD; ++w)
        sum[w] = 0.f;
    for (int tileK = 0; tileK * TILE_SIZE < K; ++tileK) {
        for (int w = 0; w < WORK_PER_THREAD; ++w) {
            if ((idY * TILE_SIZE + local_j * WORK_PER_THREAD + w) < M && tileK * TILE_SIZE + local_i < K)
                access(tileA, local_j * WORK_PER_THREAD + w, local_i) =
                        a[(idY * TILE_SIZE + local_j * WORK_PER_THREAD + w) * K + tileK * TILE_SIZE + local_i];
            else
                access(tileA, local_j * WORK_PER_THREAD + w, local_i) = 0.f;
            if (global_i < N && (tileK * TILE_SIZE + local_j * WORK_PER_THREAD + w) < K)
                tileB[local_j * WORK_PER_THREAD + w][local_i] =
                        b[global_i + (tileK * TILE_SIZE + local_j * WORK_PER_THREAD + w) * N];
            else
                tileB[local_j * WORK_PER_THREAD + w][local_i] = 0.f;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int k = 0; k < TILE_SIZE; ++k) {
            for (int w = 0; w < WORK_PER_THREAD; ++w) {
                sum[w] += access(tileA, local_j * WORK_PER_THREAD + w, k) * tileB[k][local_i];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    for (int w = 0; w < WORK_PER_THREAD; ++w)
        if (idY * TILE_SIZE + local_j * WORK_PER_THREAD + w < M && global_i < N)
            c[(idY * TILE_SIZE + local_j * WORK_PER_THREAD + w) * N + global_i] = sum[w];
}
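
// Each work-group still produces a TILE_SIZE x TILE_SIZE block of c, but with only
// TILE_SIZE / WORK_PER_THREAD rows of threads: thread (local_i, local_j) owns output rows
// local_j * WORK_PER_THREAD + w for w = 0..WORK_PER_THREAD-1 of the tile.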
#endif
57 changes: 48 additions & 9 deletions src/cl/matrix_transpose.cl
@@ -5,17 +5,56 @@

#line 6

__kernel void matrix_transpose_naive(__global float *a, __global float *at, unsigned int m, unsigned int k) {
    int i = get_global_id(0);
    int j = get_global_id(1);
    if (i >= k || j >= m)
        return;
    float x = a[j * k + i];
    at[i * m + j] = x;
}
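
// In the naive version the reads of a are coalesced (consecutive i means consecutive
// addresses), but the writes to at go with stride m. The tiled versions below stage a tile
// in local memory so that both the global reads and the global writes stay coalesced.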

#define ONE_DIMENSION_SIZE 16
__kernel void matrix_transpose_local_bad_banks(__global float *a, __global float *at, unsigned int m, unsigned int k) {
    int global_i = get_global_id(0);
    int global_j = get_global_id(1);
    __local float buffer[ONE_DIMENSION_SIZE][ONE_DIMENSION_SIZE];
    int local_i = get_local_id(0);
    int local_j = get_local_id(1);
    if (global_i < k && global_j < m)
        buffer[local_j][local_i] = a[global_j * k + global_i];
    else
        buffer[local_j][local_i] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (local_i <= local_j) {
        float tmp = buffer[local_j][local_i];
        buffer[local_j][local_i] = buffer[local_i][local_j];
        buffer[local_i][local_j] = tmp;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    int new_i = global_j / ONE_DIMENSION_SIZE * ONE_DIMENSION_SIZE;
    int new_j = global_i / ONE_DIMENSION_SIZE * ONE_DIMENSION_SIZE;
    if (new_i + local_i < m && new_j + local_j < k)
        at[(new_j + local_j) * m + new_i + local_i] = buffer[local_j][local_i];
}
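
// "bad banks": during the in-place swap a row of 16 threads accesses buffer[local_i][local_j],
// i.e. a column of the tile. With a row stride of 16 floats those 16 addresses fall into only
// one or two memory banks (depending on whether the hardware has 16 or 32 banks), so the
// accesses are serialized.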

__kernel void matrix_transpose_local_good_banks(__global float *a, __global float *at, unsigned int m, unsigned int k) {
    int global_i = get_global_id(0);
    int global_j = get_global_id(1);
    __local float buffer[(ONE_DIMENSION_SIZE + 1) * ONE_DIMENSION_SIZE];
    int local_i = get_local_id(0);
    int local_j = get_local_id(1);

    if (global_i < k && global_j < m)
        buffer[local_j * (ONE_DIMENSION_SIZE + 1) + local_i] = a[global_j * k + global_i];
    else
        buffer[local_j * (ONE_DIMENSION_SIZE + 1) + local_i] = 0;
    float tmp = buffer[local_j * (ONE_DIMENSION_SIZE + 1) + local_i];
    barrier(CLK_LOCAL_MEM_FENCE);
    buffer[local_i * (ONE_DIMENSION_SIZE + 1) + local_j] = tmp;
    barrier(CLK_LOCAL_MEM_FENCE);
    int new_i = global_j / ONE_DIMENSION_SIZE * ONE_DIMENSION_SIZE;
    int new_j = global_i / ONE_DIMENSION_SIZE * ONE_DIMENSION_SIZE;
    if (new_i + local_i < m && new_j + local_j < k)
        at[(new_j + local_j) * m + new_i + local_i] = buffer[local_j * (ONE_DIMENSION_SIZE + 1) + local_i];
}
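
// Padding each row to ONE_DIMENSION_SIZE + 1 = 17 floats makes the column stride odd and
// therefore coprime with the bank count, so the 16 accesses of a tile column hit 16 distinct
// banks and the transposed store proceeds without conflicts.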
85 changes: 45 additions & 40 deletions src/main_matrix_multiplication.cpp
@@ -1,25 +1,25 @@
#include <libgpu/context.h>
#include <libgpu/shared_device_buffer.h>
#include <libutils/fast_random.h>
#include <libutils/misc.h>
#include <libutils/timer.h>

#include "cl/matrix_multiplication_cl.h"

#include <iostream>
#include <stdexcept>
#include <vector>

const int benchmarkingIters = 10;
const int benchmarkingItersCPU = 1;
const unsigned int M = 1024;
const unsigned int K = 1024;
const unsigned int N = 1024;
const size_t gflops =
        ((size_t) M * K * N * 2) / (1000 * 1000 * 1000);// times two: one multiplication plus one addition per term
// (the integer division truncates 2.147... down to 2, so the printed GFlops slightly underestimate)

std::vector<float> computeCPU(const float *as, const float *bs) {
    std::vector<float> cs(M * N, 0);

    timer t;
    for (int iter = 0; iter < benchmarkingItersCPU; ++iter) {
@@ -48,47 +48,54 @@ struct KernelConfig {
    std::string prefix;
};

KernelConfig makeNaiveConfig(unsigned int tile_size) {
    std::string kernel_name = "matrix_multiplication_naive";
    const unsigned int groupSizeX = tile_size;
    const unsigned int groupSizeY = tile_size;
    unsigned int global_work_size_X = (N + groupSizeX - 1) / groupSizeX * groupSizeX;
    unsigned int global_work_size_Y = (M + groupSizeY - 1) / groupSizeY * groupSizeY;
    gpu::WorkSize work_size(groupSizeX, groupSizeY, global_work_size_X, global_work_size_Y);
    std::string defines;
    std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
    return KernelConfig{kernel_name, work_size, defines, prefix};
}
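
// The (x + g - 1) / g * g pattern rounds each global size up to a multiple of the work-group
// size; the surplus threads are discarded by the bounds checks inside the kernels.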

KernelConfig makeLocalConfig(unsigned int tile_size) {
    std::string kernel_name = "matrix_multiplication_local";
    const unsigned int groupSizeX = tile_size;
    const unsigned int groupSizeY = tile_size;
    unsigned int global_work_size_X = (N + groupSizeX - 1) / groupSizeX * groupSizeX;
    unsigned int global_work_size_Y = (M + groupSizeY - 1) / groupSizeY * groupSizeY;
    gpu::WorkSize work_size(groupSizeX, groupSizeY, global_work_size_X, global_work_size_Y);
    std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
    std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
    return KernelConfig{kernel_name, work_size, defines, prefix};
}

KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt) {
    std::string kernel_name = "matrix_multiplication_local_wpt";
    const unsigned int groupSizeX = tile_size;
    const unsigned int groupSizeY = (tile_size + wpt - 1) / wpt;
    unsigned int global_work_size_X = (N + groupSizeX - 1) / groupSizeX * groupSizeX;
    unsigned int global_work_size_Y = ((M + wpt - 1) / wpt + groupSizeY - 1) / groupSizeY * groupSizeY;
    gpu::WorkSize work_size(groupSizeX, groupSizeY, global_work_size_X, global_work_size_Y);
    std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
    std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
    return KernelConfig{kernel_name, work_size, defines, prefix};
}
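
// Example: tile_size = 16, wpt = 4 gives 16x4 work-groups in which each thread accumulates
// 4 of the 16 rows of its output tile, so the global Y size shrinks to about M / 4.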

void runTest(const KernelConfig &config, const float *as, const float *bs, const float *cs_cpu_reference) {
    gpu::gpu_mem_32f as_gpu, bs_gpu, cs_gpu;
    as_gpu.resizeN(M * K);
    bs_gpu.resizeN(K * N);
    cs_gpu.resizeN(M * N);

    as_gpu.writeN(as, M * K);
    bs_gpu.writeN(bs, K * N);

    ocl::Kernel matrix_multiplication_kernel(matrix_multiplication, matrix_multiplication_length, config.kernel_name,
                                             config.defines);
    matrix_multiplication_kernel.compile();

    timer t;
@@ -101,8 +108,8 @@ void runTest(const KernelConfig &config, const float *as, const float *bs, const
    std::cout << " GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
    std::cout << " GPU: " << gflops / t.lapAvg() << " GFlops" << std::endl;

    std::vector<float> cs(M * N, 0);
    cs_gpu.readN(cs.data(), M * N);

    // check the results against the CPU reference
    double diff_sum = 0;
@@ -116,23 +123,22 @@ void runTest(const KernelConfig &config, const float *as, const float *bs, const
    }

    double diff_avg = diff_sum / (M * N);
    std::cout << " Average difference: " << diff_avg * 100.0 << "%" << std::endl;
    if (diff_avg > 0.05) {// was 0.01
        throw std::runtime_error("Too big difference!");
    }
}

int main(int argc, char **argv) {
    gpu::Device device = gpu::chooseGPUDevice(argc, argv);

    gpu::Context context;
    context.init(device.device_id_opencl);
    context.activate();

    std::vector<float> as(M * K, 0);
    std::vector<float> bs(K * N, 0);
    FastRandom r(M + K + N);
    for (unsigned int i = 0; i < as.size(); ++i) {
        as[i] = r.nextf();
    }
@@ -143,8 +149,8 @@ int main(int argc, char **argv)

    const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());

    // // TODO uncomment
    // return 0;

    runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
    runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
@@ -158,6 +164,5 @@ int main(int argc, char **argv)
    for (unsigned int wpt : {2, 4, 8, 16})
        if (wpt <= tile_size)
            runTest(makeLocalWPTConfig(tile_size, wpt), as.data(), bs.data(), cs_cpu_reference.data());
    return 0;
}
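
Note on the collapsed hunk inside runTest: the benchmarking loop is elided by the diff viewer. Judging from the surrounding code it has roughly the following shape (a sketch only; the exact ocl::Kernel::exec argument order and the timer helper are assumptions about the course's libgpu/libutils wrappers):

    for (int iter = 0; iter < benchmarkingIters; ++iter) {
        // assumed wrapper API: exec(work_size, kernel arguments...)
        matrix_multiplication_kernel.exec(config.work_size, as_gpu, bs_gpu, cs_gpu, M, K, N);
        t.nextLap();// assumed timer helper recording one lap
    }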