Skip to content

Examples

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Jun 25, 2023 · 15 revisions

Hello World

#include <iostream>
#include <fstream>

// uncomment this if you use opencl v2.0 or v3.0 devices. By default, opencl v1.2 devices are queried. 
// must be defined before including "gpgpu.hpp"
//#define CL_HPP_MINIMUM_OPENCL_VERSION 200

#include "gpgpu.hpp"
int main()
{
    try
    {
        const int numElements = 1024; // size of the test arrays

        // grab every available device (GPUs, CPUs, accelerators) for the computation
        GPGPU::Computer computer(GPGPU::Computer::DEVICE_ALL);

        // build the element-wise addition kernel: C = A + B
        computer.compile(R"(
            kernel void vectorAdd(global float * A, global float * B, global float * C) 
            { 
                int id=get_global_id(0); 
                C[id] = A[id] + B[id];
             })", "vectorAdd");

        // host-side buffers; the input/output flags make the library copy data
        // to/from the devices automatically around each kernel run
        auto bufA = computer.createHostParameter<float>("A", numElements, 1, true, false, false);
        auto bufB = computer.createHostParameter<float>("B", numElements, 1, true, false, false);
        auto bufC = computer.createHostParameter<float>("C", numElements, 1, false, true, false);

        // seed a single element so the result is recognizable
        bufA.access<float>(400) = 3.0f;
        bufB.access<float>(400) = 0.1415f;
        bufC.access<float>(400) = 0.0f; // will hold PI after the kernel runs

        // run on every device; the load-balancer hands faster devices a bigger
        // share of the work to minimize overall latency (copies included)
        computer.compute(bufA.next(bufB).next(bufC), "vectorAdd", 0, numElements, 64);
        std::cout << "PI = " << bufC.access<float>(400) << std::endl;

    }
    catch (std::exception& ex)
    {
        std::cout << ex.what() << std::endl; // report any GPGPU error here
    }
    return 0;
}

output:

PI = 3.1415

Fused-Multiply-Add Peak Performance Test

#include <iostream>

// uncomment this if you use opencl v2.0 or v3.0 devices. By default, opencl v1.2 devices are queried. 
// must be defined before including "gpgpu.hpp"
//#define CL_HPP_MINIMUM_OPENCL_VERSION 200

#include "gpgpu.hpp"
int main()
{
    try
    {
        constexpr size_t n = 1024 * 1024; // number of elements / total work items
        int clonesPerDevice = 1;          // command queues per physical device
        GPGPU::Computer computer(GPGPU::Computer::DEVICE_ALL, GPGPU::Computer::DEVICE_SELECTION_ALL, clonesPerDevice);


        auto deviceNames = computer.deviceNames();
        for (const auto& d : deviceNames)
        {
            std::cout << d << std::endl;
        }

        std::cout << "-------------------------------------------" << std::endl;
        std::cout << "Starting compilation of kernel..." << std::endl;

        // kernel source; the host-side n is pasted in as a #define so the
        // compiler can unroll/optimize against a compile-time constant
        computer.compile(std::string(R"(
                #define n )") + std::to_string(n) + std::string(R"(
                    
                    void kernel fmaTest(global float * a, global float * b) 
                    {
                        // global id of this thread
                        const int id = get_global_id(0);
                        const int localId = id % 256;
                        float r1=0.0f;
                        float r2=0.0f;
                        float r3=0.0f;
                        float r4=0.0f;
                        float a1=a[id];
                        float a2=a[(id+1)%n];
                        float a3=a[(id+2)%n];
                        float a4=a[(id+3)%n];
                        local float tmp[256];
                        for(int i=0;i<n;i+=256)
                        {
                            tmp[localId] = a[i];
                            barrier(CLK_LOCAL_MEM_FENCE);
                            for(int j=0;j<256;j++)
                            {
                                float r0 = tmp[j];
                                r1 = fma(a1,r0,r1);
                                r2 = fma(a2,r0,r2);
                                r3 = fma(a3,r0,r3);
                                r4 = fma(a4,r0,r4);
                            }
                            
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        b[id] = r1+r2+r3+r4;
                        
                    }
           )"), "fmaTest");


        std::cout << "-------------------------------------------" << std::endl;
        std::cout << "Starting allocation of host buffer..." << std::endl;

        // create parameters of the kernel (also allocated in each device)
        bool isAinput = true;
        bool isBinput = false;
        bool isAoutput = false;
        bool isBoutput = true;
        bool isInputRandomAccess = true; // every device reads the full "a" array
        int dataElementsPerThread = 1;
        GPGPU::HostParameter a = computer.createHostParameter<float>("a", n, dataElementsPerThread, isAinput, isAoutput, isInputRandomAccess);
        GPGPU::HostParameter b = computer.createHostParameter<float>("b", n, dataElementsPerThread, isBinput, isBoutput, false);

        // init elements of parameters (size_t index avoids a signed/unsigned
        // comparison against the size_t n)
        for (size_t i = 0; i < n; i++)
        {
            a.access<float>(i) = static_cast<float>(i);
            b.access<float>(i) = 0.0f;
        }

        // set kernel parameters (0: first parameter of kernel, 1: second parameter of kernel)
        computer.setKernelParameter("fmaTest", "a", 0);
        computer.setKernelParameter("fmaTest", "b", 1);

        int repeat = 100;
        std::cout << "-------------------------------------------" << std::endl;
        std::cout << "Warming up... (20 iterations)" << std::endl;

        // copies input elements (a) to devices, runs kernel on devices, copies output elements to RAM (b),
        // uses n total threads distributed to devices, 256 threads per work-group in devices.
        // faster devices are given more threads automatically (after every call to run method)
        size_t nano = 0; // elapsed nanoseconds; zero-init so a Bench failure can't leave it indeterminate

        // warm-up runs so the load-balancer settles before timing starts
        for(int i=0;i<20;i++)
        {
            std::cout << "i=" << i << std::endl;
            computer.run("fmaTest", 0, n, 256); // n total threads, 256 local threads per work group
        }
        std::cout << "Starting computation for " << repeat << " times..." << std::endl;
        std::vector<double> workloadRatios;
        {
            GPGPU::Bench bench(&nano); // RAII timer: writes elapsed ns on scope exit
            for (int i = 0; i < repeat; i++)
            {
                std::cout << "i=" << i << std::endl;
                workloadRatios = computer.run("fmaTest", 0, n, 256); // n total threads, 256 local threads per work group
            }
        }
        std::cout << nano / 1000000000.0 << " seconds" << std::endl;
        // 4 fma per inner iteration = 8 flops, n inner iterations per thread, n threads
        std::cout << (((repeat * (double)n * (double)n * 8) / (nano / 1000000000.0)) / 1000000000.0) << " gflops" << std::endl;
        for (size_t i = 0; i < deviceNames.size(); i++)
        {
            std::cout << deviceNames[i] << " has workload ratio of: " << workloadRatios[i] << std::endl;
        }

    }
    catch (std::exception& ex)
    {
        std::cout << ex.what() << std::endl;
    }
    return 0;
}

Result:

Starting computation for 100 times...
263.083 seconds
3343.46 gflops
Device 0: GeForce GT 1030 (OpenCL 1.2 CUDA ) [direct-RAM-access disabled] has workload ratio of: 0.324463
Device 1: gfx1036 (OpenCL 2.0 AMD-APP (3444.0) )[has direct access to RAM] [direct-RAM-access disabled] has workload ratio of: 0.151855
Device 2: AMD Ryzen 9 7900 12-Core Processor (OpenCL 3.0 (Build 0) )[has direct access to RAM] has workload ratio of: 0.523682

3D - Nbody Algorithm For 64k Particles

#include <iostream>
#include <fstream>

// uncomment this if you use opencl v2.0 or v3.0 devices. By default, opencl v1.2 devices are queried. 
// must be defined before including "gpgpu.hpp"
//#define CL_HPP_MINIMUM_OPENCL_VERSION 200

#include "gpgpu.hpp"
int main()
{
    try
    {
        constexpr int numParticles = 1024*64;
        constexpr int numParticlesPerThread = 1;

        GPGPU::Computer computer(GPGPU::Computer::DEVICE_ALL); // allocate all devices for computations
        auto deviceNames = computer.deviceNames(false);

        // compile the naive O(N^2) gravitational n-body kernel: each thread
        // accumulates the force on one particle from all others, then integrates
        computer.compile(R"(
            // naive version
            kernel void calcForceThenMove(  const global int * numParticles,
                                    const global float * x, const global float * y, const global float * z, 
                                    const global float * vx, const global float * vy, const global float * vz,
                                    global float * xResult, global float * yResult, global float * zResult, 
                                    global float * vxResult, global float * vyResult, global float * vzResult) 
            { 
                const int id=get_global_id(0); 
                const float x0 = x[id];
                const float y0 = y[id];
                const float z0 = z[id];
                float fxTotal = 0.0f;
                float fyTotal = 0.0f;
                float fzTotal = 0.0f;
                const float dt = 0.01f;
                const int num = numParticles[0];
                for(int i=0;i<num;i++)
                {
                    const float x1 = x[i];
                    const float y1 = y[i];
                    const float z1 = z[i];        
                    const float dx = x0-x1;
                    const float dy = y0-y1;
                    const float dz = z0-z1;
                    const float r = sqrt(dx*dx+dy*dy+dz*dz) + 0.00001f;
                    const float r3div = 1.0f/(r*r*r);
                    const float fx0 = dx * r3div;
                    const float fy0 = dy * r3div;
                    const float fz0 = dz * r3div;
                    fxTotal += fx0;
                    fyTotal += fy0;
                    fzTotal += fz0;
                }

                // f=ma but m=1
                float vx0 = vx[id] + fxTotal * dt;
                float vy0 = vy[id] + fyTotal * dt;
                float vz0 = vz[id] + fzTotal * dt;
                vxResult[id] = vx0;
                vyResult[id] = vy0;
                vzResult[id] = vz0;

                xResult[id] = x0 + vx0 * dt;
                yResult[id] = y0 + vy0 * dt;
                zResult[id] = z0 + vz0 * dt;
             })", "calcForceThenMove");

        // inputs
        // the last "true" parameter tells "copy all input elements to each GPU because all are going to be accessed"
        auto nParticles = computer.createHostParameter<int>("numParticles", 1, numParticlesPerThread, true, false, true);
        nParticles.access<int>(0) = numParticles;

        auto x = computer.createHostParameter<float>("x", numParticles, numParticlesPerThread, true, false, true);
        auto y = computer.createHostParameter<float>("y", numParticles, numParticlesPerThread, true, false, true);
        auto z = computer.createHostParameter<float>("z", numParticles, numParticlesPerThread, true, false, true);
        auto vx = computer.createHostParameter<float>("vx", numParticles, numParticlesPerThread, true, false, false);
        auto vy = computer.createHostParameter<float>("vy", numParticles, numParticlesPerThread, true, false, false);
        auto vz = computer.createHostParameter<float>("vz", numParticles, numParticlesPerThread, true, false, false);


        // outputs
        auto xResult = computer.createHostParameter<float>("xResult", numParticles, numParticlesPerThread, false, true, false);
        auto yResult = computer.createHostParameter<float>("yResult", numParticles, numParticlesPerThread, false, true, false);
        auto zResult = computer.createHostParameter<float>("zResult", numParticles, numParticlesPerThread, false, true, false);

        auto vxResult = computer.createHostParameter<float>("vxResult", numParticles, numParticlesPerThread, false, true, false);
        auto vyResult = computer.createHostParameter<float>("vyResult", numParticles, numParticlesPerThread, false, true, false);
        auto vzResult = computer.createHostParameter<float>("vzResult", numParticles, numParticlesPerThread, false, true, false);

        // chain all parameters in the order the kernel declares them
        auto parameters = nParticles.next(x).next(y).next(z).
            next(vx).next(vy).next(vz).
            next(xResult).next(yResult).next(zResult).
            next(vxResult).next(vyResult).next(vzResult);
        // compute, uses all GPUs and other devices with load-balancing to give faster devices more job to minimize overall latency of kernel (including copy latency too)

        for(int i=0;i<100;i++)
        {
            std::cout << "Iteration-" << i << ":" << std::endl;
            auto performances = computer.compute(parameters, "calcForceThenMove", 0, numParticles, 256);
            // report per-device share of the work (index d, not i: the original
            // shadowed the outer loop counter here)
            for (size_t d = 0; d < performances.size(); d++)
            {
                std::cout << deviceNames[d] << " computed " << performances[d] * 100.0f << "% of total work" << std::endl;
            }
            std::cout << "-----------------------------------------" << std::endl;
        }


    }
    catch (std::exception& ex)
    {
        std::cout << ex.what() << std::endl; // any error is handled here
    }
    return 0;
}

output:

Iteration-95:
GeForce GT 1030 computed 25.3906% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 62.5% of total work
-----------------------------------------
Iteration-96:
GeForce GT 1030 computed 25% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 62.8906% of total work
-----------------------------------------
Iteration-97:
GeForce GT 1030 computed 24.6094% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 63.2812% of total work
-----------------------------------------
Iteration-98:
GeForce GT 1030 computed 25% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 62.8906% of total work
-----------------------------------------
Iteration-99:
GeForce GT 1030 computed 24.6094% of total work
gfx1036 computed 12.1094% of total work
AMD Ryzen 9 7900 12-Core Processor computed 63.2812% of total work
-----------------------------------------

Gaussian Blur With 25x25 Sized Filter

Mandelbrot-set Generator 16000 x 16000 With Maximum 50 Iterations Per Pixel (56 milliseconds per generation, 2 minutes to create file)

Warning: this program creates a 2GB ppm file!

#include <iostream>
#include <fstream>

// uncomment this if you use opencl v2.0 or v3.0 devices. By default, opencl v1.2 devices are queried. 
// must be defined before including "gpgpu.hpp"
//#define CL_HPP_MINIMUM_OPENCL_VERSION 200

#include "gpgpu.hpp"
int main()
{
    try
    {
        constexpr size_t n = 1024*16; // image is n x n pixels
        int clonesPerDevice = 2;      // two queues per device to overlap copy/compute
        GPGPU::Computer computer(GPGPU::Computer::DEVICE_ALL,GPGPU::Computer::DEVICE_SELECTION_ALL,clonesPerDevice);
        

        auto deviceNames = computer.deviceNames();
        for (const auto& d : deviceNames)
        {
            std::cout << d << std::endl;
        }

        std::cout << "-------------------------------------------" << std::endl;
        std::cout << "Starting compilation of kernel..." << std::endl;

        // kernel source; the host-side n is pasted in as a #define
        computer.compile(std::string(R"(

                // algorithm from: https://github.com/sessamekesh/IndigoCS_Mandelbrot/blob/master/main.cpp

                #define n )") + std::to_string(n) + std::string(R"(
                    

                    int findMandelbrot(float cr, float ci, int max_iterations)
                    {
	                    int i = 0;
	                    float zr = 0.0f, zi = 0.0f;
	                    while (i < max_iterations && zr * zr + zi * zi < 4.0f)
	                    {
		                    float temp = zr * zr - zi * zi + cr;
		                    zi = 2.0f * zr * zi + ci;
		                    zr = temp;
		                    i++;
	                    }

	                    return i;
                    }

                    float mapToReal(int x, int imageWidth, float minR, float maxR)
                    {
	                    float range = maxR - minR;
	                    return x * (range / imageWidth) + minR;
                    }

                    float mapToImaginary(int y, int imageHeight, float minI, float maxI)
                    {
	                    float range = maxI - minI;
	                    return y * (range / imageHeight) + minI;
                    }

                    void kernel mandelbrot(global unsigned char * b) 
                    {
                        // global id of this thread
                        const int id = get_global_id(0);
                        const int x = id % n;
                        const int y = id / n;
			            float cr = mapToReal(x, n, -1.5f, 0.7f);
			            float ci = mapToImaginary(y, n, -1.0f, 1.0f);


			            int mn = findMandelbrot(cr, ci, 50);

                        b[id] = mn;
                        
                    }
           )"), "mandelbrot");


        std::cout << "-------------------------------------------" << std::endl;
        std::cout << "Starting allocation of host buffer..." << std::endl;

        // create parameters of the kernel (also allocated in each device)

        bool isBinput = false;
        bool isBoutput = true; // iteration counts are copied back to RAM after each run
        int dataElementsPerThread = 1;

        GPGPU::HostParameter b = computer.createHostParameter<unsigned char>("b", n*n, dataElementsPerThread, isBinput, isBoutput, false);

        // init elements of parameters (size_t index: n*n is size_t, so an int
        // counter would cause a signed/unsigned comparison)
        for (size_t i = 0; i < n*n; i++)
        {
            b.access<unsigned char>(i) = 0;
        }


        int repeat = 100;
        std::cout << "-------------------------------------------" << std::endl;
        std::cout << "Starting computation for "<< repeat <<" times..." << std::endl;


        size_t nano = 0; // elapsed nanoseconds; zero-init so a Bench failure can't leave it indeterminate
        
        std::vector<double> workloadRatios;
        {
            GPGPU::Bench bench(&nano); // RAII timer: writes elapsed ns on scope exit
            for (int i = 0; i < repeat; i++)
            {
                std::cout << i << std::endl;
                workloadRatios = computer.compute(b,"mandelbrot", 0, n*n , 256,true,1024*1024); // n*n total workitems, 256 local workitems, 1024*1024 load-balancing grain size
            }
        }
        std::cout << nano / 1000000000.0 << " seconds" << std::endl;

        // estimate flop count from the per-pixel iteration counts in b
        size_t totalIter = 0;
        for (size_t i = 0; i < n * n; i++)
        {
            totalIter += 8/* from mapping functions */ + b.access<unsigned char>(i) ;
        }
        totalIter *= 10;     // ~10 flops per mandelbrot iteration
        totalIter *= repeat;

        std::cout << (((totalIter ) / (nano / 1000000000.0))/1000000000.0) << " gflops" << std::endl;
        for (size_t i = 0; i < deviceNames.size(); i++)
        {
            std::cout << deviceNames[i] << " has workload ratio of: " << workloadRatios[i] << std::endl;
        }


        // write a grayscale ASCII PPM (P3) image of the last iteration counts
        std::cout << "Creating 2GB ppm file. This may take a few minutes." << std::endl;
        std::ofstream fout("output_image.ppm");
        fout << "P3" << std::endl; 
        fout << n << " " << n << std::endl; 
        fout << "255" << std::endl; 

        for (size_t i = 0; i < n; i++)
        {
            for (size_t j = 0; j < n; j++)
            {
                // same value for R, G and B => grayscale pixel
                const int v = (int) b.access<unsigned char>(i*n+j);
                fout << v << " " << v << " " << v << " ";
            }
            fout << std::endl;
        }
        fout.close();

    }
    catch (std::exception& ex)
    {
        std::cout << ex.what() << std::endl;
    }
    return 0;
}

result:

5.67168 seconds
1443.46 gflops
Device 0: GeForce GT 1030 (OpenCL 1.2 CUDA ) [direct-RAM-access disabled] has workload ratio of: 0.18633
Device 1: gfx1036 (OpenCL 2.0 AMD-APP (3444.0) )[has direct access to RAM] [direct-RAM-access disabled] has workload ratio of: 0.0428178
Device 2: AMD Ryzen 9 7900 12-Core Processor              (OpenCL 3.0 (Build 0) )[has direct access to RAM] has workload ratio of: 0.548504
Device 3: GeForce GT 1030 (OpenCL 1.2 CUDA ) [direct-RAM-access disabled] has workload ratio of: 0.179573
Device 4: gfx1036 (OpenCL 2.0 AMD-APP (3444.0) )[has direct access to RAM] [direct-RAM-access disabled] has workload ratio of: 0.0427749
Creating 2GB ppm file. This may take a few minutes.