diff --git a/kernels/cuda/matrixVectorMultiplicationKernel.cu b/kernels/cuda/matrixVectorMultiplicationKernel.cu new file mode 100644 index 0000000..10c90f2 --- /dev/null +++ b/kernels/cuda/matrixVectorMultiplicationKernel.cu @@ -0,0 +1,11 @@ +extern "C" +__global__ void matrixVectorMultiplicationKernel(double *matrix, double *vector, double *result, int numRows, int numCols) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < numRows) { + double sum = 0.0; + for (int col = 0; col < numCols; ++col) { + sum += matrix[row * numCols + col] * vector[col]; + } + result[row] = sum; + } +} diff --git a/lib/cuda_kernels/matrixVectorMultiplicationKernel.ptx b/lib/cuda_kernels/matrixVectorMultiplicationKernel.ptx new file mode 100644 index 0000000..4457dcc --- /dev/null +++ b/lib/cuda_kernels/matrixVectorMultiplicationKernel.ptx @@ -0,0 +1,112 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-31833905 +// Cuda compilation tools, release 11.8, V11.8.89 +// Based on NVVM 7.0.1 +// + +.version 7.8 +.target sm_52 +.address_size 64 + + // .globl matrixVectorMultiplicationKernel + +.visible .entry matrixVectorMultiplicationKernel( + .param .u64 matrixVectorMultiplicationKernel_param_0, + .param .u64 matrixVectorMultiplicationKernel_param_1, + .param .u64 matrixVectorMultiplicationKernel_param_2, + .param .u32 matrixVectorMultiplicationKernel_param_3, + .param .u32 matrixVectorMultiplicationKernel_param_4 +) +{ + .reg .pred %p<7>; + .reg .b32 %r<25>; + .reg .f64 %fd<30>; + .reg .b64 %rd<28>; + + + ld.param.u64 %rd15, [matrixVectorMultiplicationKernel_param_0]; + ld.param.u64 %rd16, [matrixVectorMultiplicationKernel_param_1]; + ld.param.u64 %rd14, [matrixVectorMultiplicationKernel_param_2]; + ld.param.u32 %r12, [matrixVectorMultiplicationKernel_param_3]; + ld.param.u32 %r11, [matrixVectorMultiplicationKernel_param_4]; + cvta.to.global.u64 %rd1, %rd16; + cvta.to.global.u64 %rd2, %rd15; + mov.u32 %r13, %ntid.x; + mov.u32 %r14, %ctaid.x; + mov.u32 %r15, %tid.x; + mad.lo.s32 %r1, %r14, %r13, %r15; + setp.ge.s32 %p1, %r1, %r12; + @%p1 bra $L__BB0_9; + + setp.lt.s32 %p2, %r11, 1; + mov.f64 %fd29, 0d0000000000000000; + @%p2 bra $L__BB0_8; + + add.s32 %r17, %r11, -1; + and.b32 %r24, %r11, 3; + setp.lt.u32 %p3, %r17, 3; + mov.f64 %fd29, 0d0000000000000000; + mov.u32 %r23, 0; + @%p3 bra $L__BB0_5; + + sub.s32 %r22, %r11, %r24; + mul.lo.s32 %r19, %r11, %r1; + mul.wide.s32 %rd17, %r19, 8; + add.s64 %rd18, %rd2, %rd17; + add.s64 %rd25, %rd18, 16; + mov.u64 %rd24, %rd1; + +$L__BB0_4: + ld.global.f64 %fd12, [%rd24]; + ld.global.f64 %fd13, [%rd25+-16]; + fma.rn.f64 %fd14, %fd13, %fd12, %fd29; + ld.global.f64 %fd15, [%rd24+8]; + ld.global.f64 %fd16, [%rd25+-8]; + fma.rn.f64 %fd17, %fd16, %fd15, %fd14; + ld.global.f64 %fd18, [%rd24+16]; + ld.global.f64 %fd19, [%rd25]; + fma.rn.f64 %fd20, %fd19, %fd18, %fd17; + ld.global.f64 %fd21, [%rd24+24]; + ld.global.f64 %fd22, [%rd25+8]; + fma.rn.f64 %fd29, %fd22, %fd21, %fd20; + add.s32 %r23, %r23, 4; + add.s64 %rd25, %rd25, 32; + add.s64 %rd24, %rd24, 32; + add.s32 %r22, %r22, -4; + setp.ne.s32 %p4, %r22, 0; + @%p4 bra $L__BB0_4; + +$L__BB0_5: + setp.eq.s32 %p5, %r24, 0; + @%p5 bra $L__BB0_8; + + mul.wide.s32 %rd19, %r23, 8; + add.s64 %rd27, %rd1, %rd19; + mad.lo.s32 %r20, %r11, %r1, %r23; + mul.wide.s32 %rd20, %r20, 8; + add.s64 %rd26, %rd2, %rd20; + +$L__BB0_7: + .pragma "nounroll"; + ld.global.f64 %fd23, [%rd27]; + ld.global.f64 %fd24, [%rd26]; + fma.rn.f64 %fd29, %fd24, %fd23, %fd29; + add.s64 %rd27, %rd27, 8; + add.s64 %rd26, %rd26, 8; + add.s32 %r24, %r24, -1; + setp.ne.s32 %p6, %r24, 0; + @%p6 bra $L__BB0_7; + +$L__BB0_8: + cvta.to.global.u64 %rd21, %rd14; + mul.wide.s32 %rd22, %r1, 8; + add.s64 %rd23, %rd21, %rd22; + st.global.f64 [%rd23], %fd29; + +$L__BB0_9: + ret; + +} + diff --git a/lib/src/main/java/de/edux/core/math/matrix/cuda/CUDAKernelUser.java b/lib/src/main/java/de/edux/core/math/matrix/cuda/CUDAKernelUser.java new file mode 100644 index 0000000..97435fc --- /dev/null +++ b/lib/src/main/java/de/edux/core/math/matrix/cuda/CUDAKernelUser.java @@ -0,0 +1,7 @@ +package de.edux.core.math.matrix.cuda; + +import jcuda.driver.CUfunction; + +public interface CUDAKernelUser { + CUfunction loadKernel(); +} diff --git a/lib/src/main/java/de/edux/core/math/matrix/cuda/CudaMatrixArithmetic.java b/lib/src/main/java/de/edux/core/math/matrix/cuda/CudaMatrixArithmetic.java index fa563fa..1fbdfab 100644 --- a/lib/src/main/java/de/edux/core/math/matrix/cuda/CudaMatrixArithmetic.java +++ b/lib/src/main/java/de/edux/core/math/matrix/cuda/CudaMatrixArithmetic.java @@ -64,11 +64,27 @@ private void printCudaDeviceInformation() { @Override public double[][] multiply(double[][] matrixA, double[][] matrixB) { + if (matrixA == null || matrixB == null) { + throw new IllegalArgumentException("Matrices must not be null."); + } + if (matrixA.length == 0 || matrixB.length == 0) { + throw new IllegalArgumentException("Matrices must not be empty."); + } + if (matrixA[0].length != matrixB.length) { + throw new IllegalArgumentException("Matrix A columns must match Matrix B rows."); + } + return matrixProduct.multiply(matrixA, matrixB); } @Override public double[] multiply(double[][] matrix, double[] vector) { + if (matrix.length == 0 || matrix[0].length == 0 || vector.length == 0) { + throw new IllegalArgumentException("Matrix and vector must not be empty."); + } + if (matrix[0].length != vector.length) { + throw new IllegalArgumentException("Matrix columns and vector size do not match."); + } return matrixVectorProduct.multiply(matrix, vector); } } diff --git a/lib/src/main/java/de/edux/core/math/matrix/cuda/operations/CudaMatrixProduct.java b/lib/src/main/java/de/edux/core/math/matrix/cuda/operations/CudaMatrixProduct.java index 6f1dedb..3844d65 100644 --- a/lib/src/main/java/de/edux/core/math/matrix/cuda/operations/CudaMatrixProduct.java +++ b/lib/src/main/java/de/edux/core/math/matrix/cuda/operations/CudaMatrixProduct.java @@ -4,13 +4,14 @@ import static jcuda.driver.JCudaDriver.cuMemFree; import de.edux.core.math.IMatrixProduct; +import de.edux.core.math.matrix.cuda.CUDAKernelUser; import java.io.File; import java.util.List; import jcuda.Pointer; import jcuda.Sizeof; import jcuda.driver.*; -public class CudaMatrixProduct implements IMatrixProduct { +public class CudaMatrixProduct implements IMatrixProduct, CUDAKernelUser { static { JCudaDriver.setExceptionsEnabled(true); @@ -31,12 +32,7 @@ public double[][] multiply(double[][] matrixA, double[][] matrixB) { throw new IllegalArgumentException("Inner dimensions do not match."); } - String ptxFileName = preparePtxFile(); - - CUmodule module = new CUmodule(); - cuModuleLoad(module, ptxFileName); - CUfunction function = new CUfunction(); - cuModuleGetFunction(function, module, "matrixMultiply"); + CUfunction function = loadKernel(); double[] hostInputA = flatten(matrixA); double[] hostInputB = flatten(matrixB); @@ -113,7 +109,14 @@ private double[] flatten(double[][] matrix) { return flat; } - private String preparePtxFile() { - return "cuda_kernels" + File.separator + "matrixMultiplicationKernel.ptx"; + // CUDAKernelUser + @Override + public CUfunction loadKernel() { + String ptxFileName = "cuda_kernels" + File.separator + "matrixMultiplicationKernel.ptx"; + CUmodule module = new CUmodule(); + cuModuleLoad(module, ptxFileName); + CUfunction function = new CUfunction(); + cuModuleGetFunction(function, module, "matrixMultiply"); + return function; } } diff --git a/lib/src/main/java/de/edux/core/math/matrix/cuda/operations/CudaMatrixVectorProduct.java b/lib/src/main/java/de/edux/core/math/matrix/cuda/operations/CudaMatrixVectorProduct.java index 73cd151..76e111e 100644 --- a/lib/src/main/java/de/edux/core/math/matrix/cuda/operations/CudaMatrixVectorProduct.java +++ b/lib/src/main/java/de/edux/core/math/matrix/cuda/operations/CudaMatrixVectorProduct.java @@ -1,10 +1,110 @@ package de.edux.core.math.matrix.cuda.operations; +import static jcuda.driver.JCudaDriver.*; +import static jcuda.driver.JCudaDriver.cuMemFree; + import de.edux.core.math.IMatrixVectorProduct; +import de.edux.core.math.matrix.cuda.CUDAKernelUser; +import java.io.File; +import java.util.List; +import jcuda.Pointer; +import jcuda.Sizeof; +import jcuda.driver.*; +import jcuda.driver.CUfunction; + +public class CudaMatrixVectorProduct implements IMatrixVectorProduct, CUDAKernelUser { + + static { + JCudaDriver.setExceptionsEnabled(true); + cuInit(0); + CUdevice device = new CUdevice(); + cuDeviceGet(device, 0); + CUcontext context = new CUcontext(); + cuCtxCreate(context, 0, device); + } -public class CudaMatrixVectorProduct implements IMatrixVectorProduct { @Override public double[] multiply(double[][] matrix, double[] vector) { - return new double[0]; + int numRows = matrix.length; + int numCols = matrix[0].length; + + if (numCols != vector.length) { + throw new IllegalArgumentException("Matrix columns and vector size do not match."); + } + + CUfunction function = loadKernel(); + + double[] hostMatrix = flatten(matrix); + double[] hostVector = vector.clone(); + double[] hostOutput = new double[numRows]; + + CUdeviceptr deviceMatrix = new CUdeviceptr(); + cuMemAlloc(deviceMatrix, hostMatrix.length * Sizeof.DOUBLE); + cuMemcpyHtoD(deviceMatrix, Pointer.to(hostMatrix), hostMatrix.length * Sizeof.DOUBLE); + + CUdeviceptr deviceVector = new CUdeviceptr(); + cuMemAlloc(deviceVector, hostVector.length * Sizeof.DOUBLE); + cuMemcpyHtoD(deviceVector, Pointer.to(hostVector), hostVector.length * Sizeof.DOUBLE); + + CUdeviceptr deviceOutput = new CUdeviceptr(); + cuMemAlloc(deviceOutput, hostOutput.length * Sizeof.DOUBLE); + + Pointer kernelParameters = + Pointer.to( + Pointer.to(deviceMatrix), + Pointer.to(deviceVector), + Pointer.to(deviceOutput), + Pointer.to(new int[] {numRows}), + Pointer.to(new int[] {numCols})); + + int blockSize = 256; // This should be tuned according to your hardware capability + int gridSize = (int) Math.ceil((double) numRows / blockSize); + cuLaunchKernel( + function, + gridSize, + 1, + 1, // Grid dimension + blockSize, + 1, + 1, // Block dimension + 0, + null, // Shared memory size and stream + kernelParameters, + null // Kernel- and extra parameters + ); + cuCtxSynchronize(); + + cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, hostOutput.length * Sizeof.DOUBLE); + + List list = List.of(deviceMatrix, deviceVector, deviceOutput); + cleanUp(list); + + return hostOutput; + } + + private void cleanUp(List devicePtrs) { + for (CUdeviceptr devicePtr : devicePtrs) { + cuMemFree(devicePtr); + } + } + + private double[] flatten(double[][] matrix) { + int rows = matrix.length; + int cols = matrix[0].length; + double[] flat = new double[rows * cols]; + for (int i = 0; i < rows; i++) { + System.arraycopy(matrix[i], 0, flat, i * cols, cols); + } + return flat; + } + + @Override + public CUfunction loadKernel() { + String ptxFileName = "cuda_kernels" + File.separator + "matrixVectorMultiplicationKernel.ptx"; + CUmodule module = new CUmodule(); + cuModuleLoad(module, ptxFileName); + CUfunction function = new CUfunction(); + cuModuleGetFunction(function, module, "matrixVectorMultiplicationKernel"); + return function; } } diff --git a/lib/src/test/java/de/edux/core/math/matrix/cuda/CudaMatrixMatrixArithmeticTest.java b/lib/src/test/java/de/edux/core/math/matrix/cuda/CudaMatrixMatrixArithmeticTest.java index 01c658a..6d9d5ad 100644 --- a/lib/src/test/java/de/edux/core/math/matrix/cuda/CudaMatrixMatrixArithmeticTest.java +++ b/lib/src/test/java/de/edux/core/math/matrix/cuda/CudaMatrixMatrixArithmeticTest.java @@ -1,12 +1,8 @@ package de.edux.core.math.matrix.cuda; -import static jcuda.driver.JCudaDriver.cuModuleLoad; import static org.junit.jupiter.api.Assertions.*; import de.edux.core.math.IMatrixArithmetic; -import java.io.File; -import jcuda.driver.CUmodule; -import jcuda.driver.CUresult; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -14,20 +10,15 @@ @Disabled class CudaMatrixMatrixArithmeticTest { + private static CudaMatrixArithmetic cudaArithmetic; + @BeforeAll static void setUp() { - String ptxFileName = "cuda_kernels" + File.separator + "matrixMultiplicationKernel.ptx"; - CUmodule module = new CUmodule(); - int result = cuModuleLoad(module, ptxFileName); - if (result != CUresult.CUDA_SUCCESS) { - System.out.println("Could not load module: " + result); - } else { - System.out.println("CUDA Module loaded successfully"); - } + cudaArithmetic = new CudaMatrixArithmetic(); } @Test - void multiply() { + void shouldMultiplyMatrices() { int matrixSize = 2048; double[][] matrixA = new double[matrixSize][matrixSize]; double[][] matrixB = new double[matrixSize][matrixSize]; @@ -39,9 +30,7 @@ void multiply() { } } - IMatrixArithmetic cudaMatrix = new CudaMatrixArithmetic(); - - double[][] resultMatrix = cudaMatrix.multiply(matrixA, matrixB); + double[][] resultMatrix = cudaArithmetic.multiply(matrixA, matrixB); for (int i = 0; i < matrixSize; i++) { for (int j = 0; j < matrixSize; j++) { @@ -84,8 +73,7 @@ public void shouldMultiply8x8MatricesCorrectly() { {22, 18, 14, 10, 22, 18, 14, 10} }; - IMatrixArithmetic cudaMatrixMultiplication = new CudaMatrixArithmetic(); - double[][] result = cudaMatrixMultiplication.multiply(matrixA, matrixB); + double[][] result = cudaArithmetic.multiply(matrixA, matrixB); assertArrayEquals( expected, result, "The 8x8 matrix multiplication did not yield the correct result."); @@ -113,7 +101,7 @@ public void shouldMultiplyWithZeroMatrix() { } @Test - public void shouldMultiplyWithIdentityMatrix() { + public void shouldMultiplyMatrixWithIdentityMatrix() { double[][] matrixA = { {1, 0}, {0, 1} @@ -123,8 +111,7 @@ public void shouldMultiplyWithIdentityMatrix() { {7, 8} }; - IMatrixArithmetic cudaMatrixMultiplication = new CudaMatrixArithmetic(); - double[][] result = cudaMatrixMultiplication.multiply(matrixA, matrixB); + double[][] result = cudaArithmetic.multiply(matrixA, matrixB); assertArrayEquals(matrixB, result); } @@ -144,9 +131,58 @@ public void shouldMultiplySmallMatricesCorrectly() { {10, 8} }; - IMatrixArithmetic cudaMatrixMultiplication = new CudaMatrixArithmetic(); - double[][] result = cudaMatrixMultiplication.multiply(matrixA, matrixB); + double[][] result = cudaArithmetic.multiply(matrixA, matrixB); assertArrayEquals(expected, result); } + + @Test + void shouldSolveMatrixVectorProduct() { + int matrixSize = 2048; + double[][] matrixA = new double[matrixSize][matrixSize]; + double[] vector = new double[matrixSize]; + + for (int i = 0; i < matrixSize; i++) { + for (int j = 0; j < matrixSize; j++) { + matrixA[i][j] = 1; + vector[i] = 1; + } + } + + double[] resultVector = cudaArithmetic.multiply(matrixA, vector); + + for (int i = 0; i < matrixSize; i++) { + assertEquals(matrixSize, resultVector[i], "Result on [" + i + "][" + i + "] not correct."); + } + } + + @Test + void shouldHandleEmptyMatrix() { + double[][] matrix = new double[0][0]; + double[] vector = new double[0]; + assertThrows(IllegalArgumentException.class, () -> cudaArithmetic.multiply(matrix, vector)); + } + + @Test + void shouldHandleMismatchedSizes() { + double[][] matrix = {{1, 2, 3}, {4, 5, 6}}; + double[] vector = {1, 2}; + assertThrows(IllegalArgumentException.class, () -> cudaArithmetic.multiply(matrix, vector)); + } + + @Test + void shouldHandleNullMatrix() { + double[] vector = {1, 2, 3}; + assertThrows(NullPointerException.class, () -> cudaArithmetic.multiply(null, vector)); + } + + @Test + void shouldMultiplyVectorWithIdentityMatrix() { + double[][] matrix = {{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}; + double[] vector = {1, 2, 3}; + double[] expected = {1, 2, 3}; + double[] result = cudaArithmetic.multiply(matrix, vector); + assertArrayEquals( + expected, result, "Multiplying with identity matrix should return the original vector."); + } }