Skip to content

Commit

Permalink
Merge pull request #3 from nakul02/gpu_kernel_nj
Browse files Browse the repository at this point in the history
some bug fixes
  • Loading branch information
niketanpansare authored Sep 28, 2016
2 parents 09baf45 + c182b8c commit 1509db7
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseMatDescr;
import jcuda.jcusparse.cusparsePointerMode;
import jcuda.runtime.JCuda;

public class JCudaObject extends GPUObject {

Expand Down Expand Up @@ -150,17 +151,21 @@ public static long estimateSize(long nnz2, long rows) {
* @throws DMLRuntimeException
*/
public static CSRPointer allocateEmpty(long nnz2, long rows) throws DMLRuntimeException {
assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
CSRPointer r = new CSRPointer();
r.nnz = nnz2;
if(nnz2 != 0) {
ensureFreeSpace(Sizeof.DOUBLE * nnz2 + Sizeof.INT * (rows + 1) + Sizeof.INT * nnz2);
long t0 = System.nanoTime();
cudaMalloc(r.val, Sizeof.DOUBLE * nnz2);
cudaMalloc(r.rowPtr, Sizeof.INT * (rows + 1));
cudaMalloc(r.colInd, Sizeof.INT * nnz2);
Statistics.cudaAllocTime.addAndGet(System.nanoTime()-t0);
Statistics.cudaAllocCount.addAndGet(3);
if(nnz2 == 0) {
// The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
// with no memory allocated on the GPU.
return r;
}
ensureFreeSpace(Sizeof.DOUBLE * nnz2 + Sizeof.INT * (rows + 1) + Sizeof.INT * nnz2);
long t0 = System.nanoTime();
cudaMalloc(r.val, Sizeof.DOUBLE * nnz2);
cudaMalloc(r.rowPtr, Sizeof.INT * (rows + 1));
cudaMalloc(r.colInd, Sizeof.INT * nnz2);
Statistics.cudaAllocTime.addAndGet(System.nanoTime()-t0);
Statistics.cudaAllocCount.addAndGet(3);
return r;
}

Expand Down Expand Up @@ -363,6 +368,7 @@ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHan
Pointer A = JCudaObject.allocate(size);
// Note: cusparseDcsr2dense method cannot handle empty blocks
cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
JCuda.cudaDeviceSynchronize();
// int[] alpha = { 1 };
// int[] beta = { 1 };
// Pointer C = JCudaObject.allocate(size);
Expand Down Expand Up @@ -627,7 +633,11 @@ void copyFromHostToDevice()
int rowPtr[] = null;
int colInd[] = null;
double[] values = null;


tmp.recomputeNonZeros();
long nnz = tmp.getNonZeros();
mat.getMatrixCharacteristics().setNonZeros(nnz);

SparseBlock block = tmp.getSparseBlock();
boolean copyToDevice = true;
if(block == null && tmp.getNonZeros() == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

package org.apache.sysml.runtime.matrix.data;

import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
Expand Down Expand Up @@ -47,6 +49,7 @@
import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
import static jcuda.runtime.JCuda.cudaFree;
import static jcuda.runtime.JCuda.cudaMalloc;
import static jcuda.runtime.JCuda.cudaMemcpy;
Expand All @@ -64,12 +67,12 @@
import org.apache.sysml.runtime.functionobjects.GreaterThanEquals;
import org.apache.sysml.runtime.functionobjects.LessThan;
import org.apache.sysml.runtime.functionobjects.LessThanEquals;
import org.apache.sysml.runtime.functionobjects.Minus;
import org.apache.sysml.runtime.functionobjects.Multiply;
import org.apache.sysml.runtime.functionobjects.Multiply2;
import org.apache.sysml.runtime.functionobjects.NotEquals;
import org.apache.sysml.runtime.functionobjects.Or;
import org.apache.sysml.runtime.functionobjects.Plus;
import org.apache.sysml.runtime.functionobjects.Minus;
import org.apache.sysml.runtime.functionobjects.Multiply;
import org.apache.sysml.runtime.functionobjects.Power;
import org.apache.sysml.runtime.functionobjects.Power2;
import org.apache.sysml.runtime.functionobjects.ValueFunction;
Expand Down Expand Up @@ -97,9 +100,6 @@
import jcuda.jcudnn.cudnnTensorDescriptor;
import jcuda.jcusparse.JCusparse;
import jcuda.jcusparse.cusparseHandle;
import jcuda.runtime.JCuda;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;

//FIXME move could to respective instructions, this is not a block library
public class LibMatrixCUDA {
Expand Down Expand Up @@ -583,7 +583,7 @@ protected static void sparseMatrixDenseVectorMult(MatrixObject output, CSRPointe
double[] alpha = { 1 };
double[] beta = { 0 };
cusparseDcsrmv(cusparseHandle, transA, m, k, (int)A.nnz, Pointer.to(alpha), A.descr, A.val, A.rowPtr, A.colInd, B_dense, Pointer.to(beta), C_dense);

cudaDeviceSynchronize(); // Since cusparseDcsrmv is asynchronously executed
((JCudaObject)(output.getGPUObject())).setDenseMatrixCudaPointer(C_dense);
output.getGPUObject().setDeviceModify(size);
}
Expand Down

0 comments on commit 1509db7

Please sign in to comment.