Documentation Index

Fetch the complete documentation index at: https://mintlify.com/LWJGL/lwjgl3/llms.txt

Use this file to discover all available pages before exploring further.

OpenCL is a standard for writing programs that execute in parallel across heterogeneous processors — GPUs, CPUs, FPGAs, and DSPs. You write compute kernels in OpenCL C (a dialect of C99), compile them at runtime, and dispatch thousands of work items concurrently. LWJGL 3 provides a complete Java binding that mirrors the C API, using MemoryStack for struct allocation and long handles for all OpenCL objects. This guide uses patterns from CLDemo.java in the LWJGL sample suite to show the full host-side workflow.

Platform and device model

OpenCL organises hardware into platforms (ICD implementations, such as NVIDIA OpenCL or Intel OpenCL) and devices within each platform (one or more GPUs, CPUs, or accelerators). You enumerate both at runtime:
// Target GPU devices only
checkCLError(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, null, pi));
Not all platforms provide GPU devices. If clGetDeviceIDs with CL_DEVICE_TYPE_GPU returns zero, fall back to CL_DEVICE_TYPE_CPU or CL_DEVICE_TYPE_ALL.
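The fallback order can be sketched independently of the API. DeviceTypeFallback and its deviceCount parameter are hypothetical names used only for illustration; in real code the count would come from a clGetDeviceIDs count query per device type:

```java
import java.util.function.ToIntFunction;

public class DeviceTypeFallback {
    // Device-type bitfield values from cl.h.
    static final long CL_DEVICE_TYPE_CPU = 1L << 1;
    static final long CL_DEVICE_TYPE_GPU = 1L << 2;
    static final long CL_DEVICE_TYPE_ALL = 0xFFFFFFFFL;

    // Returns the first device type (GPU, then CPU, then ALL) for which the
    // platform reports at least one device. deviceCount stands in for a
    // clGetDeviceIDs count query against a specific platform.
    static long pickDeviceType(ToIntFunction<Long> deviceCount) {
        for (long type : new long[] {CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_ALL}) {
            if (deviceCount.applyAsInt(type) > 0) {
                return type;
            }
        }
        throw new IllegalStateException("No OpenCL devices found on this platform");
    }
}
```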

1. Get platforms and devices

LWJGL’s MemoryStack provides temporary IntBuffer and PointerBuffer allocations without heap pressure:
import static org.lwjgl.opencl.CL11.*;
import static org.lwjgl.system.MemoryStack.*;

import java.nio.IntBuffer;

import org.lwjgl.PointerBuffer;
import org.lwjgl.opencl.CL;
import org.lwjgl.opencl.CLCapabilities;
import org.lwjgl.system.MemoryStack;

try (MemoryStack stack = stackPush()) {
    IntBuffer pi = stack.mallocInt(1);

    // Count platforms
    checkCLError(clGetPlatformIDs(null, pi));
    if (pi.get(0) == 0) {
        throw new RuntimeException("No OpenCL platforms found.");
    }

    // Fetch platform handles
    PointerBuffer platforms = stack.mallocPointer(pi.get(0));
    checkCLError(clGetPlatformIDs(platforms, (IntBuffer) null));

    long platform = platforms.get(0);

    // Build platform capabilities (needed before querying device extensions)
    CLCapabilities platformCaps = CL.createPlatformCapabilities(platform);

    // Count and fetch devices
    checkCLError(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, null, pi));
    PointerBuffer devices = stack.mallocPointer(pi.get(0));
    checkCLError(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices, (IntBuffer) null));

    long device = devices.get(0);
}
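Every snippet in this guide routes return codes through checkCLError, which the demo utilities define along these lines. This is a minimal sketch — the wrapper class name CLChecks is made up here; the demo keeps these helpers alongside its other utility methods:

```java
import java.nio.IntBuffer;

public class CLChecks {
    static final int CL_SUCCESS = 0; // from cl.h

    // Throws if an OpenCL call returned anything other than CL_SUCCESS.
    static void checkCLError(int errcode) {
        if (errcode != CL_SUCCESS) {
            throw new RuntimeException(String.format("OpenCL error [%d]", errcode));
        }
    }

    // Overload for calls that report errors through an errcode_ret out-parameter.
    static void checkCLError(IntBuffer errcode) {
        checkCLError(errcode.get(errcode.position()));
    }
}
```

Failing fast like this matters because OpenCL errors are otherwise silent — a forgotten check typically surfaces much later as garbage data.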
Useful device info queries after selecting a device (getDeviceInfoInt and getDeviceInfoPointer are thin wrappers around clGetDeviceInfo, defined in the demo's InfoUtil class):
// Maximum number of parallel compute units
System.out.println("CL_DEVICE_MAX_COMPUTE_UNITS = "
    + getDeviceInfoInt(device, CL_DEVICE_MAX_COMPUTE_UNITS));

// Maximum work-group size
System.out.println("CL_DEVICE_MAX_WORK_GROUP_SIZE = "
    + getDeviceInfoPointer(device, CL_DEVICE_MAX_WORK_GROUP_SIZE));

2. Create a context

The context ties together a device, a memory allocator, and an error callback:
IntBuffer errcode_ret = stack.callocInt(1);

// Context properties: specify which platform to use
PointerBuffer ctxProps = stack.mallocPointer(3);
ctxProps.put(0, CL_CONTEXT_PLATFORM)
        .put(1, platform)
        .put(2, 0);  // null-terminate the list

CLContextCallback contextCB = CLContextCallback.create(
    (errinfo, private_info, cb, user_data) -> {
        System.err.println("[OpenCL context error] " + memUTF8(errinfo));
    }
);

long context = clCreateContext(
    ctxProps, device, contextCB, NULL, errcode_ret
);
checkCLError(errcode_ret);
CLContextCallback is a native function pointer — free it with contextCB.free() after destroying the context.

3. Create a command queue

All operations (kernel dispatch, memory transfers) are enqueued on a command queue:
// OpenCL 2.0+ API (preferred)
PointerBuffer queueProps = stack.mallocPointer(3)
    .put(0, CL_QUEUE_PROPERTIES)
    .put(1, 0L)                  // no special properties (no out-of-order, no profiling)
    .put(2, 0L);                 // null-terminate

long queue = clCreateCommandQueueWithProperties(
    context, device, queueProps, errcode_ret
);
checkCLError(errcode_ret);
If you need to support OpenCL 1.x devices, use the deprecated clCreateCommandQueue(context, device, 0L, errcode_ret) instead.

4. Create and build a kernel program

OpenCL programs are compiled at runtime from OpenCL C source strings. The following kernel adds two float arrays element-wise — the canonical “hello world” of GPGPU computing:
// vector_add.cl — OpenCL C kernel
__kernel void vector_add(
    __global const float* a,
    __global const float* b,
    __global       float* result,
    const int count
) {
    int gid = get_global_id(0);
    if (gid < count) {
        result[gid] = a[gid] + b[gid];
    }
}
Compile from a Java string:
String kernelSource = "..."; // load from file or embed as a string constant

long program = clCreateProgramWithSource(context, kernelSource, errcode_ret);
checkCLError(errcode_ret);

int buildErr = clBuildProgram(program, device, "", null, NULL);
if (buildErr != CL_SUCCESS) {
    // Retrieve the build log on failure
    // Query the log size, then fetch the log itself
    PointerBuffer logSize = stack.mallocPointer(1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
        (ByteBuffer) null, logSize);
    ByteBuffer log = BufferUtils.createByteBuffer((int) logSize.get(0));
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log, null);
    throw new RuntimeException("Build failed:\n" + memASCII(log));
}

5. Create a kernel object

A kernel object wraps a single __kernel function from the compiled program:
long kernel = clCreateKernel(program, "vector_add", errcode_ret);
checkCLError(errcode_ret);

6. Set kernel arguments and dispatch

Arguments map to the __global parameters in the kernel signature. Device buffers are created with clCreateBuffer:
int N = 1024 * 1024; // 1,048,576 elements (~one million)

// Allocate device buffers
long bufA      = clCreateBuffer(context, CL_MEM_READ_ONLY,  N * Float.BYTES, errcode_ret);
checkCLError(errcode_ret);
long bufB      = clCreateBuffer(context, CL_MEM_READ_ONLY,  N * Float.BYTES, errcode_ret);
checkCLError(errcode_ret);
long bufResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * Float.BYTES, errcode_ret);
checkCLError(errcode_ret);

// Upload input data (hostDataA/hostDataB are FloatBuffers holding N floats each)
checkCLError(clEnqueueWriteBuffer(queue, bufA, true, 0, hostDataA, null, null));
checkCLError(clEnqueueWriteBuffer(queue, bufB, true, 0, hostDataB, null, null));

// Set kernel arguments (index matches __kernel parameter order)
clSetKernelArg1p(kernel, 0, bufA);
clSetKernelArg1p(kernel, 1, bufB);
clSetKernelArg1p(kernel, 2, bufResult);
clSetKernelArg1i(kernel, 3, N);

// Dispatch N work items, each handling one element
PointerBuffer globalWorkSize = stack.mallocPointer(1).put(0, N);
int err = clEnqueueNDRangeKernel(
    queue,
    kernel,
    1,               // work dimensions
    null,            // global work offset (null = {0})
    globalWorkSize,  // total work items
    null,            // local work size (null = let driver choose)
    null,            // wait list
    null             // event output
);
checkCLError(err);
Let the driver choose the local work-group size (pass null) for a first implementation. Tune it later with CL_DEVICE_MAX_WORK_GROUP_SIZE and profiling.
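If you do specify a local size on OpenCL 1.x, the global size must be an exact multiple of it. A common trick is to round the global size up and let the kernel's `if (gid < count)` guard skip the padding work items. A small helper (the name roundUp is ours, not part of any API):

```java
public class WorkSize {
    // Rounds globalSize up to the next multiple of localSize, so the
    // dispatch satisfies OpenCL 1.x's divisibility requirement.
    static long roundUp(long globalSize, long localSize) {
        long remainder = globalSize % localSize;
        return remainder == 0 ? globalSize : globalSize + localSize - remainder;
    }
}
```

For example, roundUp(1000, 64) yields 1024: the kernel runs 1024 work items, and the 24 with gid >= count simply return.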

7. Read results back to the host

FloatBuffer result = BufferUtils.createFloatBuffer(N);

// true = blocking read — waits for the transfer to finish before returning
checkCLError(clEnqueueReadBuffer(queue, bufResult, true, 0, result, null, null));

// Ensure all previously enqueued commands have completed
clFinish(queue);

System.out.println("result[0] = " + result.get(0));
Use clEnqueueReadBuffer with blocking_read = false and an event output for asynchronous transfers when you want to overlap GPU computation with CPU work.
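A cheap way to validate the buffer you just read back is to compare it against a CPU reference implementation of the same kernel. This is a sketch: real host data lives in direct buffers from BufferUtils, but heap buffers behave identically for the comparison:

```java
import java.nio.FloatBuffer;

public class VectorAddReference {
    // CPU equivalent of the vector_add kernel: result[i] = a[i] + b[i].
    static FloatBuffer vectorAdd(FloatBuffer a, FloatBuffer b, int count) {
        FloatBuffer out = FloatBuffer.allocate(count);
        for (int i = 0; i < count; i++) {
            out.put(i, a.get(i) + b.get(i));
        }
        return out;
    }
}
```

Comparing a few sampled indices (or all of them, for this small workload) against the device result catches argument-ordering and buffer-size mistakes early.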

8. Clean up

Release objects in reverse order. OpenCL uses reference counting — clRelease* decrements the reference count and frees the object when it reaches zero.
clReleaseMemObject(bufA);
clReleaseMemObject(bufB);
clReleaseMemObject(bufResult);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
contextCB.free();
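A pattern that makes the reverse order automatic is to push a release action whenever an object is created and pop them in LIFO order at shutdown. ReleaseStack is a hypothetical helper sketched here; in real code each Runnable would wrap the matching clRelease* call (or contextCB.free()):

```java
import java.util.ArrayDeque;
import java.util.Deque;

public class ReleaseStack implements AutoCloseable {
    private final Deque<Runnable> actions = new ArrayDeque<>();

    // Register a cleanup action right after the corresponding create call.
    void onClose(Runnable action) {
        actions.push(action);
    }

    // Runs the actions in reverse registration order:
    // the last object created is the first one released.
    @Override
    public void close() {
        while (!actions.isEmpty()) {
            actions.pop().run();
        }
    }
}
```

Usage: onClose(() -> clReleaseContext(context)) immediately after clCreateContext succeeds, then onClose(() -> clReleaseCommandQueue(queue)), and so on; a try-with-resources block then guarantees reverse-order release even on exceptions.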

CLCapabilities

CL.createPlatformCapabilities and CL.createDeviceCapabilities return CLCapabilities objects that expose feature flags for every extension:
CLCapabilities caps = CL.createDeviceCapabilities(device, platformCaps);

if (caps.OpenCL20) {
    // Use clCreateCommandQueueWithProperties, SVM, pipes, etc.
}
if (caps.cl_khr_fp64) {
    // Use double-precision arithmetic in kernels
}
Always check the relevant capability flag before calling extension functions. Calling an unsupported function will cause a native crash or undefined behaviour.

Complete vector addition example

__kernel void vector_add(
    __global const float* a,
    __global const float* b,
    __global       float* result,
    const int count
) {
    int gid = get_global_id(0);
    if (gid < count) {
        result[gid] = a[gid] + b[gid];
    }
}
