Merge pull request #3 from nakul02/gpu_kernel_nj

some bug fixes
nakul02 · Sep 28, 2016 · 1509db7 · 1509db7
2 parents 09baf45 + c182b8c
commit 1509db7
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 15 deletions.
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -56,6 +56,7 @@
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
 import jcuda.jcusparse.cusparsePointerMode;
+import jcuda.runtime.JCuda;
 
 public class JCudaObject extends GPUObject {
 
@@ -150,17 +151,21 @@ public static long estimateSize(long nnz2, long rows) {
 		 * @throws DMLRuntimeException 
 		 */
 		public static CSRPointer allocateEmpty(long nnz2, long rows) throws DMLRuntimeException {
+			assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
 			CSRPointer r = new CSRPointer();
 			r.nnz = nnz2;
-			if(nnz2 != 0) {
-				ensureFreeSpace(Sizeof.DOUBLE * nnz2 + Sizeof.INT * (rows + 1) + Sizeof.INT * nnz2);
-				long t0 = System.nanoTime();
-				cudaMalloc(r.val, Sizeof.DOUBLE * nnz2);
-				cudaMalloc(r.rowPtr, Sizeof.INT * (rows + 1));
-				cudaMalloc(r.colInd, Sizeof.INT * nnz2);
-				Statistics.cudaAllocTime.addAndGet(System.nanoTime()-t0);
-				Statistics.cudaAllocCount.addAndGet(3);
+			if(nnz2 == 0) {
+				// The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
+				// with no memory allocated on the GPU.
+				return r;
 			}
+			ensureFreeSpace(Sizeof.DOUBLE * nnz2 + Sizeof.INT * (rows + 1) + Sizeof.INT * nnz2);
+			long t0 = System.nanoTime();
+			cudaMalloc(r.val, Sizeof.DOUBLE * nnz2);
+			cudaMalloc(r.rowPtr, Sizeof.INT * (rows + 1));
+			cudaMalloc(r.colInd, Sizeof.INT * nnz2);
+			Statistics.cudaAllocTime.addAndGet(System.nanoTime()-t0);
+			Statistics.cudaAllocCount.addAndGet(3);
 			return r;
 		}
 
@@ -363,6 +368,7 @@ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHan
 			Pointer A = JCudaObject.allocate(size);
 			// Note: cusparseDcsr2dense method cannot handle empty blocks
 			cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
+			JCuda.cudaDeviceSynchronize();
 			// int[] alpha = { 1 };
 			// int[] beta = { 1 };
 			// Pointer C = JCudaObject.allocate(size);
@@ -627,7 +633,11 @@ void copyFromHostToDevice()
 			int rowPtr[] = null;
 			int colInd[] = null;
 			double[] values = null;
-
+
+			tmp.recomputeNonZeros();
+			long nnz = tmp.getNonZeros();
+			mat.getMatrixCharacteristics().setNonZeros(nnz);
+
 			SparseBlock block = tmp.getSparseBlock();
 			boolean copyToDevice = true;
 			if(block == null && tmp.getNonZeros() == 0) {

diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysml.runtime.matrix.data;
 
+import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
+import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
 import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
 import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
 import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
@@ -47,6 +49,7 @@
 import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
 import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
 import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
+import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
 import static jcuda.runtime.JCuda.cudaFree;
 import static jcuda.runtime.JCuda.cudaMalloc;
 import static jcuda.runtime.JCuda.cudaMemcpy;
@@ -64,12 +67,12 @@
 import org.apache.sysml.runtime.functionobjects.GreaterThanEquals;
 import org.apache.sysml.runtime.functionobjects.LessThan;
 import org.apache.sysml.runtime.functionobjects.LessThanEquals;
+import org.apache.sysml.runtime.functionobjects.Minus;
+import org.apache.sysml.runtime.functionobjects.Multiply;
 import org.apache.sysml.runtime.functionobjects.Multiply2;
 import org.apache.sysml.runtime.functionobjects.NotEquals;
 import org.apache.sysml.runtime.functionobjects.Or;
 import org.apache.sysml.runtime.functionobjects.Plus;
-import org.apache.sysml.runtime.functionobjects.Minus;
-import org.apache.sysml.runtime.functionobjects.Multiply;
 import org.apache.sysml.runtime.functionobjects.Power;
 import org.apache.sysml.runtime.functionobjects.Power2;
 import org.apache.sysml.runtime.functionobjects.ValueFunction;
@@ -97,9 +100,6 @@
 import jcuda.jcudnn.cudnnTensorDescriptor;
 import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
-import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
-import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
 
 //FIXME move could to respective instructions, this is not a block library
 public class LibMatrixCUDA {
@@ -583,7 +583,7 @@ protected static void sparseMatrixDenseVectorMult(MatrixObject output, CSRPointe
 		double[] alpha = { 1 };
 		double[] beta = { 0 };
 		cusparseDcsrmv(cusparseHandle, transA, m, k, (int)A.nnz, Pointer.to(alpha), A.descr, A.val, A.rowPtr, A.colInd, B_dense, Pointer.to(beta), C_dense);
-
+		cudaDeviceSynchronize(); 	// Since cusparseDcsrmv is asynchronously executed
 		((JCudaObject)(output.getGPUObject())).setDenseMatrixCudaPointer(C_dense);
 		output.getGPUObject().setDeviceModify(size);
 	}