You must be signed in to change notification settings - Fork 3
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Jun 25, 2023
15 revisions
#include <iostream>
#include <fstream>
// uncomment this if you use opencl v2.0 or v3.0 devices. By default, opencl v1.2 devices are queried.
// must be defined before including "gpgpu.hpp"
#include "gpgpu.hpp"
// Example 1: adds two float arrays with an OpenCL kernel on all available devices and prints one element of the result.
// NOTE(review): braces, the try keyword matching the catch below, and some statements are not visible in this listing.
int main()
const int n = 1024; // number of array elements to test
GPGPU::Computer computer(GPGPU::Computer::DEVICE_ALL); // allocate all devices for computations
// compile a kernel that computes C = A + B for all elements
// NOTE(review): the call that opens the kernel source string (closed on the vectorAdd line below) is not visible in this listing
kernel void vectorAdd(global float * A, global float * B, global float * C)
int id=get_global_id(0);
C[id] = A[id] + B[id];
})", "vectorAdd");
// create host arrays that will be auto-copied-to/from GPUs/CPUs/Accelerators before/after kernel runs
// presumably the three trailing flags are is-input, is-output and input-random-access, making A and B inputs and C the output -- TODO confirm against createHostParameter
auto A = computer.createHostParameter<float>("A", n, 1, true, false, false);
auto B = computer.createHostParameter<float>("B", n, 1, true, false, false);
auto C = computer.createHostParameter<float>("C", n, 1, false, true, false);
// initialize one element for testing
A.access<float>(400) = 3.0f;
B.access<float>(400) = 0.1415f;
C.access<float>(400) = 0.0f; // expected to become 3.0f + 0.1415f (approximately PI) once the kernel has run
// compute, uses all GPUs and other devices with load-balancing to give faster devices more job to minimize overall latency of kernel (including copy latency too)
// NOTE(review): the compute call described above is not visible in this listing
std::cout << "PI = " << C.access<float>(400) << std::endl;
catch (std::exception& ex)
std::cout << ex.what() << std::endl; // any error is handled here
return 0;
PI = 3.1415
#include <iostream>
// uncomment this if you use opencl v2.0 or v3.0 devices. By default, opencl v1.2 devices are queried.
// must be defined before including "gpgpu.hpp"
#include "gpgpu.hpp"
// Example 2: benchmarks an FMA-heavy kernel on all devices and prints elapsed time, throughput and per-device workload ratios.
// NOTE(review): braces, the try keyword matching the catch below, and the call that opens the kernel source string are not visible in this listing.
int main()
constexpr size_t n = 1024 * 1024;
int clonesPerDevice = 1;
GPGPU::Computer computer(GPGPU::Computer::DEVICE_ALL, GPGPU::Computer::DEVICE_SELECTION_ALL, clonesPerDevice);
// list every device that was selected
auto deviceNames = computer.deviceNames();
for (auto d : deviceNames)
std::cout << d << std::endl;
std::cout << "-------------------------------------------" << std::endl;
std::cout << "Starting compilation of kernel..." << std::endl;
#define n )") + std::to_string(n) + std::string(R"(
void kernel fmaTest(global float * a, global float * b)
// global id of this thread
const int id = get_global_id(0);
const int localId = id % 256;
float r1=0.0f;
float r2=0.0f;
float r3=0.0f;
float r4=0.0f;
float a1=a[id];
float a2=a[(id+1)%n];
float a3=a[(id+2)%n];
float a4=a[(id+3)%n];
local float tmp[256];
for(int i=0;i<n;i+=256)
tmp[localId] = a[i];
for(int j=0;j<256;j++)
float r0 = tmp[j];
r1 = fma(a1,r0,r1);
r2 = fma(a2,r0,r2);
r3 = fma(a3,r0,r3);
r4 = fma(a4,r0,r4);
b[id] = r1+r2+r3+r4;
)"), "fmaTest");
std::cout << "-------------------------------------------" << std::endl;
std::cout << "Starting allocation of host buffer..." << std::endl;
// create parameters of kernel (also allocated in each device)
// a is input-only and flagged for random access; b is output-only
bool isAinput = true;
bool isBinput = false;
bool isAoutput = false;
bool isBoutput = true;
bool isInputRandomAccess = true;
int dataElementsPerThread = 1;
GPGPU::HostParameter a = computer.createHostParameter<float>("a", n, dataElementsPerThread, isAinput, isAoutput, isInputRandomAccess);
GPGPU::HostParameter b = computer.createHostParameter<float>("b", n, dataElementsPerThread, isBinput, isBoutput, false);
// init elements of parameters
for (int i = 0; i < n; i++)
a.access<float>(i) = i;
b.access<float>(i) = 0;
// set kernel parameters (0: first parameter of kernel, 1: second parameter of kernel)
computer.setKernelParameter("fmaTest", "a", 0);
computer.setKernelParameter("fmaTest", "b", 1);
int repeat = 100;
std::cout << "-------------------------------------------" << std::endl;
std::cout << "Warming up... (20 iterations)" << std::endl;
// copies input elements (a) to devices, runs kernel on devices, copies output elements to RAM (b), uses n total threads distributed to devices, 256 threads per work-group in devices
// faster devices are given more threads automatically (after every call to run method)
// nano is divided by 1e9 and printed as seconds below, so it holds the measured time in nanoseconds
size_t nano;
// warm-up runs
for(int i=0;i<20;i++)
std::cout << "i=" << i << std::endl;
computer.run("fmaTest", 0, n, 256); // n total threads, 256 local threads per work group
std::cout << "Starting computation for " << repeat << " times..." << std::endl;
std::vector<double> workloadRatios;
// presumably a scoped timer that writes the elapsed nanoseconds into nano when it is destroyed; the scope braces are not visible in this listing -- TODO confirm
GPGPU::Bench bench(&nano);
for (int i = 0; i < repeat; i++)
std::cout << "i=" << i << std::endl;
workloadRatios = computer.run("fmaTest", 0, n, 256); // n total threads, 256 local threads per work group
std::cout << nano / 1000000000.0 << " seconds" << std::endl;
// throughput: each of the n threads performs n inner-loop steps of 4 fma calls, counted as 8 floating-point operations per step
std::cout << (((repeat * (double)n * (double)n * 8) / (nano / 1000000000.0)) / 1000000000.0) << " gflops" << std::endl;
// report the workload ratios returned by the last run, one per device
for (int i = 0; i < deviceNames.size(); i++)
std::cout << deviceNames[i] << " has workload ratio of: " << workloadRatios[i] << std::endl;
catch (std::exception& ex)
std::cout << ex.what() << std::endl;
return 0;
Starting computation for 100 times...
263.083 seconds
3343.46 gflops
Device 0: GeForce GT 1030 (OpenCL 1.2 CUDA ) [direct-RAM-access disabled] has workload ratio of: 0.324463
Device 1: gfx1036 (OpenCL 2.0 AMD-APP (3444.0) )[has direct access to RAM] [direct-RAM-access disabled] has workload ratio of: 0.151855
Device 2: AMD Ryzen 9 7900 12-Core Processor (OpenCL 3.0 (Build 0) )[has direct access to RAM] has workload ratio of: 0.523682
#include <iostream>
#include <fstream>
// uncomment this if you use opencl v2.0 or v3.0 devices. By default, opencl v1.2 devices are queried.
// must be defined before including "gpgpu.hpp"
#include "gpgpu.hpp"
// Example 3: naive all-pairs particle simulation; every iteration prints the share of work each device computed.
// NOTE(review): braces, the try keyword matching the catch below, and the call that opens the kernel source string are not visible in this listing.
int main()
constexpr int numParticles = 1024*64;
constexpr int numParticlesPerThread = 1;
GPGPU::Computer computer(GPGPU::Computer::DEVICE_ALL); // allocate all devices for computations
auto deviceNames = computer.deviceNames(false);
// compile a kernel that sums the pairwise force on each particle, then advances its velocity and position by one time step
// naive version
kernel void calcForceThenMove( const global int * numParticles,
const global float * x, const global float * y, const global float * z,
const global float * vx, const global float * vy, const global float * vz,
global float * xResult, global float * yResult, global float * zResult,
global float * vxResult, global float * vyResult, global float * vzResult)
const int id=get_global_id(0);
const float x0 = x[id];
const float y0 = y[id];
const float z0 = z[id];
float fxTotal = 0.0f;
float fyTotal = 0.0f;
float fzTotal = 0.0f;
const float dt = 0.01f;
const int num = numParticles[0];
for(int i=0;i<num;i++)
const float x1 = x[i];
const float y1 = y[i];
const float z1 = z[i];
const float dx = x0-x1;
const float dy = y0-y1;
const float dz = z0-z1;
const float r = sqrt(dx*dx+dy*dy+dz*dz) + 0.00001f;
const float r3div = 1.0f/(r*r*r);
const float fx0 = dx * r3div;
const float fy0 = dy * r3div;
const float fz0 = dz * r3div;
fxTotal += fx0;
fyTotal += fy0;
fzTotal += fz0;
// f=ma but m=1
float vx0 = vx[id] + fxTotal * dt;
float vy0 = vy[id] + fyTotal * dt;
float vz0 = vz[id] + fzTotal * dt;
vxResult[id] = vx0;
vyResult[id] = vy0;
vzResult[id] = vz0;
xResult[id] = x0 + vx0 * dt;
yResult[id] = y0 + vy0 * dt;
zResult[id] = z0 + vz0 * dt;
})", "calcForceThenMove");
// inputs
// the last "true" parameter tells "copy all input elements to each GPU because all are going to be accessed"
// the particle count is passed to the kernel as a one-element buffer
auto nParticles = computer.createHostParameter<int>("numParticles", 1, numParticlesPerThread, true, false, true);
nParticles.access<int>(0) = numParticles;
// positions are read for every particle by every thread, so they are flagged for full copy
auto x = computer.createHostParameter<float>("x", numParticles, numParticlesPerThread, true, false, true);
auto y = computer.createHostParameter<float>("y", numParticles, numParticlesPerThread, true, false, true);
auto z = computer.createHostParameter<float>("z", numParticles, numParticlesPerThread, true, false, true);
// velocities are only read at the thread's own index in the kernel, so the last flag is false
auto vx = computer.createHostParameter<float>("vx", numParticles, numParticlesPerThread, true, false, false);
auto vy = computer.createHostParameter<float>("vy", numParticles, numParticlesPerThread, true, false, false);
auto vz = computer.createHostParameter<float>("vz", numParticles, numParticlesPerThread, true, false, false);
// outputs
auto xResult = computer.createHostParameter<float>("xResult", numParticles, numParticlesPerThread, false, true, false);
auto yResult = computer.createHostParameter<float>("yResult", numParticles, numParticlesPerThread, false, true, false);
auto zResult = computer.createHostParameter<float>("zResult", numParticles, numParticlesPerThread, false, true, false);
auto vxResult = computer.createHostParameter<float>("vxResult", numParticles, numParticlesPerThread, false, true, false);
auto vyResult = computer.createHostParameter<float>("vyResult", numParticles, numParticlesPerThread, false, true, false);
auto vzResult = computer.createHostParameter<float>("vzResult", numParticles, numParticlesPerThread, false, true, false);
// NOTE(review): the chain below ends with a dangling '.'; the remaining links (presumably vx through vzResult, in kernel-argument order) are not visible in this listing
auto parameters = nParticles.next(x).next(y).next(z).
// compute, uses all GPUs and other devices with load-balancing to give faster devices more job to minimize overall latency of kernel (including copy latency too)
for(int i=0;i<100;i++)
std::cout << "Iteration-" << i << ":" << std::endl;
// one work-item per particle, 256 work-items per work-group; the returned values are printed below as each device's fraction of the total work
auto performances = computer.compute(parameters, "calcForceThenMove", 0, numParticles, 256);
for (int i = 0; i < performances.size(); i++)
std::cout << deviceNames[i] << " computed " << performances[i] * 100.0f << "% of total work" << std::endl;
std::cout << "-----------------------------------------" << std::endl;
catch (std::exception& ex)
std::cout << ex.what() << std::endl; // any error is handled here
return 0;
GeForce GT 1030 computed 25.3906% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 62.5% of total work
GeForce GT 1030 computed 25% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 62.8906% of total work
GeForce GT 1030 computed 24.6094% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 63.2812% of total work
GeForce GT 1030 computed 25% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 62.8906% of total work
GeForce GT 1030 computed 24.6094% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 63.2812% of total work
Mandelbrot-set Generator 16384 x 16384 (n = 1024*16) With Maximum 50 Iterations Per Pixel (about 56 milliseconds per generation, about 2 minutes to create the file)
Warning: this program creates a 2GB ppm file!
#include <iostream>
#include <fstream>
// uncomment this if you use opencl v2.0 or v3.0 devices. By default, opencl v1.2 devices are queried.
// must be defined before including "gpgpu.hpp"
#include "gpgpu.hpp"
// Example 4: renders an n x n Mandelbrot image on all devices, prints timing and per-device workload ratios, then writes the image to a plain-text PPM file.
// NOTE(review): braces, the try keyword matching the catch below, and the call that opens the kernel source string are not visible in this listing.
int main()
constexpr size_t n = 1024*16;
int clonesPerDevice = 2;
GPGPU::Computer computer(GPGPU::Computer::DEVICE_ALL,GPGPU::Computer::DEVICE_SELECTION_ALL,clonesPerDevice);
// list every device that was selected
auto deviceNames = computer.deviceNames();
for (auto d : deviceNames)
std::cout << d << std::endl;
std::cout << "-------------------------------------------" << std::endl;
std::cout << "Starting compilation of kernel..." << std::endl;
// algorithm from: https://github.com/sessamekesh/IndigoCS_Mandelbrot/blob/master/main.cpp
#define n )") + std::to_string(n) + std::string(R"(
int findMandelbrot(float cr, float ci, int max_iterations)
int i = 0;
float zr = 0.0f, zi = 0.0f;
while (i < max_iterations && zr * zr + zi * zi < 4.0f)
float temp = zr * zr - zi * zi + cr;
zi = 2.0f * zr * zi + ci;
zr = temp;
return i;
float mapToReal(int x, int imageWidth, float minR, float maxR)
float range = maxR - minR;
return x * (range / imageWidth) + minR;
float mapToImaginary(int y, int imageHeight, float minI, float maxI)
float range = maxI - minI;
return y * (range / imageHeight) + minI;
void kernel mandelbrot(global unsigned char * b)
// global id of this thread
const int id = get_global_id(0);
const int x = id % n;
const int y = id / n;
float cr = mapToReal(x, n, -1.5f, 0.7f);
float ci = mapToImaginary(y, n, -1.0f, 1.0f);
int mn = findMandelbrot(cr, ci, 50);
b[id] = mn;
)"), "mandelbrot");
std::cout << "-------------------------------------------" << std::endl;
std::cout << "Starting allocation of host buffer..." << std::endl;
// create parameters of kernel (also allocated in each device)
// b is output-only and holds one byte (the iteration count) per pixel
bool isBinput = false;
bool isBoutput = true;
int dataElementsPerThread = 1;
GPGPU::HostParameter b = computer.createHostParameter<unsigned char>("b", n*n, dataElementsPerThread, isBinput, isBoutput, false);
// init elements of parameters
for (int i = 0; i < n*n; i++)
b.access<unsigned char>(i) = 0;
int repeat = 100;
std::cout << "-------------------------------------------" << std::endl;
std::cout << "Starting computation for "<< repeat <<" times..." << std::endl;
// nano is divided by 1e9 and printed as seconds below, so it holds the measured time in nanoseconds
size_t nano;
std::vector<double> workloadRatios;
// presumably a scoped timer that writes the elapsed nanoseconds into nano when it is destroyed; the scope braces are not visible in this listing -- TODO confirm
GPGPU::Bench bench(&nano);
for (int i = 0; i < repeat; i++)
std::cout << i << std::endl;
workloadRatios = computer.compute(b,"mandelbrot", 0, n*n , 256,true,1024*1024); // n*n total workitems, 256 local workitems, 1024*1024 load-balancing grain size
std::cout << nano / 1000000000.0 << " seconds" << std::endl;
// estimate the floating-point work: per pixel, the stored iteration count plus a fixed 8 for the mapping functions
size_t totalIter = 0;
for (int i = 0; i < n * n; i++)
totalIter += 8/* from mapping functions */ + b.access<unsigned char>(i) ;
// NOTE(review): the factor of 10 looks like an assumed operation count per iteration (it also scales the mapping term) -- confirm
totalIter *= 10;
totalIter *= repeat;
std::cout << (((totalIter ) / (nano / 1000000000.0))/1000000000.0) << " gflops" << std::endl;
// report the workload ratios returned by the last compute call, one per device
for (int i = 0; i < deviceNames.size(); i++)
std::cout << deviceNames[i] << " has workload ratio of: " << workloadRatios[i] << std::endl;
std::cout << "Creating 2GB ppm file. This may take a few minutes." << std::endl;
// P3 header: magic number, then width and height, then the maximum channel value
std::ofstream fout("output_image.ppm");
fout << "P3" << std::endl;
fout << n << " " << n << std::endl;
fout << "255" << std::endl;
// each pixel's iteration count is written three times, once per colour channel, giving a grayscale image
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
fout << (int) b.access<unsigned char>(i*n+j) << " " << (int)b.access<unsigned char>(i * n + j) << " " << (int)b.access<unsigned char>(i * n + j) << " ";
fout << std::endl;
catch (std::exception& ex)
std::cout << ex.what() << std::endl;
return 0;
5.67168 seconds
1443.46 gflops
Device 0: GeForce GT 1030 (OpenCL 1.2 CUDA ) [direct-RAM-access disabled] has workload ratio of: 0.18633
Device 1: gfx1036 (OpenCL 2.0 AMD-APP (3444.0) )[has direct access to RAM] [direct-RAM-access disabled] has workload ratio of: 0.0428178
Device 2: AMD Ryzen 9 7900 12-Core Processor (OpenCL 3.0 (Build 0) )[has direct access to RAM] has workload ratio of: 0.548504
Device 3: GeForce GT 1030 (OpenCL 1.2 CUDA ) [direct-RAM-access disabled] has workload ratio of: 0.179573
Device 4: gfx1036 (OpenCL 2.0 AMD-APP (3444.0) )[has direct access to RAM] [direct-RAM-access disabled] has workload ratio of: 0.0427749
Creating 2GB ppm file. This may take a few minutes.