Skip to content

A tutorial on how to run a simple OpenCL kernel from Java using the Java Native Interface on macOS.

Notifications You must be signed in to change notification settings

kylemarino22/Java-Opencl

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

5 Commits
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Java OpenCL Tutorial MacOS (2018)

A tutorial on how to use a GPU from Java with OpenCL.

Steps

This tutorial requires the completion of my JNI Tutorial found here: https://github.com/kylemarino22/JNITutorial
  1. Editing the C++ Code for OpenCL
  2. Changing the Compilation Configuration

C++ Code

Replace the code in HelloWorld.cpp with:

#include <fcntl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <ctime>
#include <iostream>
#include <vector>
#include <OpenCL/opencl.h>
#include <jni.h>
#include "HelloWorld.h"
using namespace std;

// OpenCL C source of the `square` kernel; it is compiled at run time by the
// host code. Despite its name, each work-item applies atan(cos(sin(x)))
// 1002 times to input[gid] and stores the result in output[gid].
//
// Work-items whose global id is >= count return immediately, so the kernel
// stays within the buffers even when the global size is rounded up past
// `count`. The inner loop uses its own index so it no longer shadows the
// global id.
const char *KernelSource = "\n"
                           "__kernel void square(                                                  \n"
                           "   __global float* input,                                              \n"
                           "   __global float* output,                                             \n"
                           "   const unsigned int count)                                           \n"
                           "{                                                                      \n"
                           "   const size_t gid = get_global_id(0);                                \n"
                           "   if (gid >= count)                                                   \n"
                           "       return;                                                         \n"
                           "                                                                       \n"
                           "   float temp = atan(cos(sin(input[gid])));                            \n"
                           "   for (int iter = 0; iter < 1000; iter++) {                           \n"
                           "       temp = atan(cos(sin(temp)));                                    \n"
                           "   }                                                                   \n"
                           "   output[gid] = atan(cos(sin(temp)));                                 \n"
                           "}                                                                      \n"
                           "\n";





JNIEXPORT void JNICALL
Java_HelloWorld_print(JNIEnv *, jobject){
        const long long DATA_SIZE = 10000;
        int err;                    // error code returned from api calls

        float data[DATA_SIZE];      // original data set given to device
        float results[DATA_SIZE];   // results returned from device
        unsigned int correct;       // number of correct results returned

        size_t global;              // global domain size for our calculation
        size_t local;               // local domain size for our calculation

        cl_device_id device_id;     // compute device id
        cl_context context;         // compute context
        cl_command_queue commands;  // compute command queue
        cl_program program;         // compute program
        cl_kernel kernel;           // compute kernel

        cl_mem input;               // device memory used for the input array
        cl_mem output;              // device memory used for the output array

        // Fill our data set with random float values
        int i = 0;
        unsigned int count = DATA_SIZE;
        for(i = 0; i < count; i++)
                data[i] = rand()%100;



        // Connect to a compute device
        int gpu = 1;
        err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
        if (err != CL_SUCCESS)
        {
                printf("Error: Failed to create a device group!\n");
        }


        // Create a compute context
        context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
        if (!context)
        {
                printf("Error: Failed to create a compute context!\n");
        }


        // Create a command commands
        commands = clCreateCommandQueue(context, device_id, 0, &err);
        if (!commands)
        {
                printf("Error: Failed to create a command commands!\n");
        }

        // Create the compute program from the source buffer
        program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &err);
        if (!program)
        {
                printf("Error: Failed to create compute program!\n");
        }

        // Build the program executable
        err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
        if (err != CL_SUCCESS)
        {
                size_t len;
                char buffer[2048];

                printf("Error: Failed to build program executable!\n");
                clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
                printf("%s\n", buffer);
                exit(1);
        }

        // Create the compute kernel in the program we wish to run
        kernel = clCreateKernel(program, "square", &err);
        if (!kernel || err != CL_SUCCESS)
        {
                printf("Error: Failed to create compute kernel!\n");
                exit(1);
        }

        // Create the input and output arrays in device memory for our calculation
        input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
        output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
        if (!input || !output)
        {
                printf("Error: Failed to allocate device memory!\n");
                exit(1);
        }

        // Write our data set into the input array in device memory
        err = clEnqueueWriteBuffer(commands, input, CL_FALSE, 0, sizeof(float) * count, data, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
                printf("Error: Failed to write to source array!\n");
                exit(1);
        }

        // Set the arguments to our compute kernel
        err = 0;
        err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
        err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
        if (err != CL_SUCCESS)
        {
                printf("Error: Failed to set kernel arguments! %d\n", err);
                exit(1);
        }

        // Get the maximum work group size for executing the kernel on the device
        err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
        if (err != CL_SUCCESS)
        {
                printf("Error: Failed to retrieve kernel work group info! %d\n", err);
                exit(1);
        }

        // Execute the kernel over the entire range of our 1d input data set
        // using the maximum number of work group items for this device

        global = DATA_SIZE;
        local = 1;

        std::clock_t start;
        double duration1;
        double duration;
        start = std::clock();

        err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        if (err)
        {
                printf("Error: Failed to execute kernel!\n %d", err);
        }

        // Wait for the command commands to get serviced before reading back results
        err = clFinish(commands);

        if (err != CL_SUCCESS)
        {
                printf("Error: Failed to execute kernel!\n %d", err);
        }


        duration1 = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;


        // Read back the results from the device to verify the output
        err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
        if (err != CL_SUCCESS)
        {
                printf("Error: Failed to read output array! %d\n", err);
                exit(1);
        }

        // Validate our results
        correct = 0;
        start = std::clock();

        printf("GPU Duration %f\n", duration1);

        for(i = 0; i < count; i++)
        {
                float temp = atanf(cos(sin(data[i])));
                for(int j = 0; j < 1000; j++) {
                        temp = atanf(cos(sin(temp)));
                }
                float result = atanf(cos(sin(temp)));

                if(fabs(results[i] - result) < 0.0001) {
                        correct++;
                }
        }

        duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
        printf("CPU Duration %f\n", duration);

        // Print a brief summary detailing the results
        printf("Computed '%d/%d' correct values!\n", correct, count);

        // Shutdown and cleanup
        clReleaseMemObject(input);
        clReleaseMemObject(output);
        clReleaseProgram(program);
        clReleaseKernel(kernel);
        clReleaseCommandQueue(commands);
        clReleaseContext(context);

        printf("Success\n");
}

Changing the Compilation Configuration

To run the new program, we must change line 6 in the build.sh file to be:

gcc -I"$JAVA_HOME/include" -I"$JAVA_HOME/include/darwin/"  -o HelloWorld.o -shared HelloWorld.cpp -framework OpenCL

To compile and run the programs, run the command:

./build.sh

The output should be similar to:

GPU Duration 0.000686
CPU Duration 0.396429
Computed '10000/10000' correct values!
Success

About

A tutorial on how to run a simple OpenCL kernel from Java using the Java Native Interface on macOS.

Topics

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published