Skip to content

A tutorial on how to run a simple OpenCL kernel from Java using the Java Native Interface (JNI) on macOS.

Notifications You must be signed in to change notification settings


Folders and files

Last commit message
Last commit date

Latest commit



5 Commits

Repository files navigation

Java OpenCL Tutorial MacOS (2018)

A tutorial on how to use a GPU from Java with OpenCL.


This tutorial requires the completion of my JNI Tutorial first. It then consists of two steps:
  1. Editing the C++ Code for OpenCL
  2. Changing the Compilation Configuration

C++ Code

Replace the code in HelloWorld.cpp with:

#include <fcntl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <ctime>
#include <iostream>
#include <OpenCL/opencl.h>
#include <jni.h>
#include "HelloWorld.h"
using namespace std;

// OpenCL C source of the "square" kernel, handed to clCreateProgramWithSource
// and compiled at run time.
//
// Despite its name the kernel does not square anything: for every index
// i < count it applies atan(cos(sin(x))) to input[i] 1002 times in total
// (once up front, 1000 times in the loop, once when storing) and writes the
// result to output[i].
//
// Work-items whose global id is >= count return immediately, so they never
// read an uninitialized value or write past the end of `output`.
const char *KernelSource =
        "\n"
        "__kernel void square(\n"
        "   __global float* input,\n"
        "   __global float* output,\n"
        "   const unsigned int count)\n"
        "{\n"
        "   const unsigned int i = get_global_id(0);\n"
        "   if (i >= count)\n"
        "      return;\n"
        "\n"
        "   float temp = atan(cos(sin(input[i])));\n"
        "   for (int k = 0; k < 1000; k++) {\n"
        "      temp = atan(cos(sin(temp)));\n"
        "   }\n"
        "   output[i] = atan(cos(sin(temp)));\n"
        "}\n"
        "\n";

Java_HelloWorld_print(JNIEnv *, jobject){
        const long long DATA_SIZE = 10000;
        int err;                    // error code returned from api calls

        float data[DATA_SIZE];      // original data set given to device
        float results[DATA_SIZE];   // results returned from device
        unsigned int correct;       // number of correct results returned

        size_t global;              // global domain size for our calculation
        size_t local;               // local domain size for our calculation

        cl_device_id device_id;     // compute device id
        cl_context context;         // compute context
        cl_command_queue commands;  // compute command queue
        cl_program program;         // compute program
        cl_kernel kernel;           // compute kernel

        cl_mem input;               // device memory used for the input array
        cl_mem output;              // device memory used for the output array

        // Fill our data set with random float values
        int i = 0;
        unsigned int count = DATA_SIZE;
        for(i = 0; i < count; i++)
                data[i] = rand()%100;

        // Connect to a compute device
        int gpu = 1;
        err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
        if (err != CL_SUCCESS)
                printf("Error: Failed to create a device group!\n");

        // Create a compute context
        context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
        if (!context)
                printf("Error: Failed to create a compute context!\n");

        // Create a command commands
        commands = clCreateCommandQueue(context, device_id, 0, &err);
        if (!commands)
                printf("Error: Failed to create a command commands!\n");

        // Create the compute program from the source buffer
        program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &err);
        if (!program)
                printf("Error: Failed to create compute program!\n");

        // Build the program executable
        err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
        if (err != CL_SUCCESS)
                size_t len;
                char buffer[2048];

                printf("Error: Failed to build program executable!\n");
                clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
                printf("%s\n", buffer);

        // Create the compute kernel in the program we wish to run
        kernel = clCreateKernel(program, "square", &err);
        if (!kernel || err != CL_SUCCESS)
                printf("Error: Failed to create compute kernel!\n");

        // Create the input and output arrays in device memory for our calculation
        input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
        output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
        if (!input || !output)
                printf("Error: Failed to allocate device memory!\n");

        // Write our data set into the input array in device memory
        err = clEnqueueWriteBuffer(commands, input, CL_FALSE, 0, sizeof(float) * count, data, 0, NULL, NULL);
        if (err != CL_SUCCESS)
                printf("Error: Failed to write to source array!\n");

        // Set the arguments to our compute kernel
        err = 0;
        err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
        err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
        if (err != CL_SUCCESS)
                printf("Error: Failed to set kernel arguments! %d\n", err);

        // Get the maximum work group size for executing the kernel on the device
        err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
        if (err != CL_SUCCESS)
                printf("Error: Failed to retrieve kernel work group info! %d\n", err);

        // Execute the kernel over the entire range of our 1d input data set
        // using the maximum number of work group items for this device

        global = DATA_SIZE;
        local = 1;

        std::clock_t start;
        double duration1;
        double duration;
        start = std::clock();

        err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        if (err)
                printf("Error: Failed to execute kernel!\n %d", err);

        // Wait for the command commands to get serviced before reading back results
        err = clFinish(commands);

        if (err != CL_SUCCESS)
                printf("Error: Failed to execute kernel!\n %d", err);

        duration1 = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;

        // Read back the results from the device to verify the output
        err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
        if (err != CL_SUCCESS)
                printf("Error: Failed to read output array! %d\n", err);

        // Validate our results
        correct = 0;
        start = std::clock();

        printf("GPU Duration %f\n", duration1);

        for(i = 0; i < count; i++)
                float temp = atanf(cos(sin(data[i])));
                for(int j = 0; j < 1000; j++) {
                        temp = atanf(cos(sin(temp)));
                float result = atanf(cos(sin(temp)));

                if(fabs(results[i] - result) < 0.0001) {

        duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
        printf("CPU Duration %f\n", duration);

        // Print a brief summary detailing the results
        printf("Computed '%d/%d' correct values!\n", correct, count);

        // Shutdown and cleanup


Changing the Compilation Configuration

To build the new program, we must change line 6 in the build file from the JNI tutorial to be:

gcc -I"$JAVA_HOME/include" -I"$JAVA_HOME/include/darwin/"  -o HelloWorld.o -shared HelloWorld.cpp -framework OpenCL

To compile and run the programs, run the command:


The output should be similar to:

GPU Duration 0.000686
CPU Duration 0.396429
Computed '10000/10000' correct values!


A tutorial on how to run a simple OpenCL kernel from Java using the Java Native Interface (JNI) on macOS.







No releases published


No packages published