-
Notifications
You must be signed in to change notification settings - Fork 224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CUSPARSE error when multiplying sparse matrix by a vector #2555
Comments
I was able to confirm the same behavior without Julia / CUDA.jl. Roughly similar (using arbitrary, regular element positions in sparse matrix) complete example that shows difference between
#include <cstdint>
#include <cstdio>
#include <cstdlib>

#include <cuda.h>
#include <cusparse_v2.h>
// Abort the process with a file/line diagnostic when a CUDA runtime call fails.
// err: status returned by the runtime API call; fname/line: call site
// (supplied by the CUDA_CALL macro below).
void cudaCall(cudaError_t const &err, char const *fname, int const &line) {
    if (err == cudaSuccess)
        return;
    printf("CUDA error at %s:%d: %s\n", fname, line, cudaGetErrorString(err));
    exit(1);
}
// Wrap a CUDA runtime API call so failures are reported with their call site.
#define CUDA_CALL(x) cudaCall((x), __FILE__, __LINE__)
// Map a cuSPARSE status code to its symbolic name for diagnostics.
// Returns "<unknown>" for values outside the enumerators known to this build.
char const * cusparseGetStatusName(cusparseStatus_t const &err) {
    switch (err) {
        case CUSPARSE_STATUS_SUCCESS:                   return "CUSPARSE_STATUS_SUCCESS";
        case CUSPARSE_STATUS_NOT_INITIALIZED:           return "CUSPARSE_STATUS_NOT_INITIALIZED";
        case CUSPARSE_STATUS_ALLOC_FAILED:              return "CUSPARSE_STATUS_ALLOC_FAILED";
        case CUSPARSE_STATUS_INVALID_VALUE:             return "CUSPARSE_STATUS_INVALID_VALUE";
        case CUSPARSE_STATUS_ARCH_MISMATCH:             return "CUSPARSE_STATUS_ARCH_MISMATCH";
        case CUSPARSE_STATUS_MAPPING_ERROR:             return "CUSPARSE_STATUS_MAPPING_ERROR";
        case CUSPARSE_STATUS_EXECUTION_FAILED:          return "CUSPARSE_STATUS_EXECUTION_FAILED";
        case CUSPARSE_STATUS_INTERNAL_ERROR:            return "CUSPARSE_STATUS_INTERNAL_ERROR";
        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        case CUSPARSE_STATUS_ZERO_PIVOT:                return "CUSPARSE_STATUS_ZERO_PIVOT";
        case CUSPARSE_STATUS_NOT_SUPPORTED:             return "CUSPARSE_STATUS_NOT_SUPPORTED";
        case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES:    return "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES";
        default:                                        return "<unknown>";
    }
}
// Abort the process with a readable status name when a cuSPARSE call fails.
// err: status returned by the cuSPARSE call; fname/line: call site
// (supplied by the CUSPARSE_CALL macro below).
void cusparseCall(cusparseStatus_t const &err, char const *fname, int const &line) {
    if (err == CUSPARSE_STATUS_SUCCESS)
        return;
    printf("CUSPARSE error at %s:%d: status %s\n", fname, line, cusparseGetStatusName(err));
    exit(1);
}
// Wrap a cuSPARSE call so failures are reported with their call site.
#define CUSPARSE_CALL(x) cusparseCall((x), __FILE__, __LINE__)
// Index representation for the CSR row-offset and column-index arrays.
// CUSPARSE_INDEX_64I selects 64-bit indices so nnz can exceed 2^31 - 1,
// which is exactly the regime this reproducer targets.
cusparseIndexType_t const indType = CUSPARSE_INDEX_64I;
typedef int64_t idx_t;
// Value representation: 32-bit IEEE float. Must stay in sync with the
// float-specific cuMemsetD32 bit-pattern fill used in mwe().
cudaDataType const valueType = CUDA_R_32F; // float
typedef float val_t;
void mwe(int64_t N, int64_t C) {
printf("Init CUSPARSE\n");
cusparseHandle_t cusparseHandle;
CUSPARSE_CALL(cusparseCreate(&cusparseHandle));
printf("Prepare CSR (host)\n");
idx_t *csrRowOffsetsHost = (idx_t *)(malloc(N * sizeof(*csrRowOffsetsHost)));
idx_t *csrColIndHost = (idx_t *)(malloc(C * sizeof(*csrColIndHost)));
val_t *csrValuesHost = (val_t *)(malloc(C * sizeof(*csrValuesHost)));
if (! (csrRowOffsetsHost && csrColIndHost && csrValuesHost)) {
perror("Host malloc failed\n");
exit(1);
}
idx_t pos = 0;
idx_t row = 0;
size_t step = (N * N) / C;
for (; pos < C; ++pos) {
idx_t i = (pos * step) / N;
idx_t j = (pos * step) % N;
val_t v = 1.0;
for (idx_t r = row; r <= i; ++r) {
csrRowOffsetsHost[row++] = pos;
}
csrColIndHost[pos] = j;
csrValuesHost[pos] = v;
++pos;
}
printf("Allocate CSR data (device)\n");
idx_t *csrRowOffsetsDev; CUDA_CALL(cudaMalloc((void **)(&csrRowOffsetsDev), N * sizeof(*csrRowOffsetsDev)));
idx_t *csrColIndDev; CUDA_CALL(cudaMalloc((void **)(&csrColIndDev), C * sizeof(*csrColIndDev)));
val_t *csrValuesDev; CUDA_CALL(cudaMalloc((void **)(&csrValuesDev), C * sizeof(*csrValuesDev)));
cudaDeviceSynchronize();
printf("CSR data: host -> device\n");
CUDA_CALL(cudaMemcpy(csrRowOffsetsDev, csrRowOffsetsHost, N * sizeof(*csrRowOffsetsDev), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(csrColIndDev, csrColIndHost, C * sizeof(*csrColIndDev), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(csrValuesDev, csrValuesHost, C * sizeof(*csrValuesDev), cudaMemcpyHostToDevice));
free(csrRowOffsetsHost); csrRowOffsetsHost = nullptr;
free(csrColIndHost); csrColIndHost = nullptr;
free(csrValuesHost); csrValuesHost = nullptr;
printf("CUSPARSE: Create CSR descriptor\n");
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
cusparseSpMatDescr_t spMat;
CUSPARSE_CALL(cusparseCreateCsr(&spMat, N, N, C, csrRowOffsetsDev, csrColIndDev, csrValuesDev, indType, indType, idxBase, valueType));
printf("CUSPARSE: Prepare data for SpMM\n");
val_t alpha = 1.0;
val_t beta = 0.0;
val_t *dnMatValues; CUDA_CALL(cudaMalloc((void **)(&dnMatValues), N * sizeof(*dnMatValues)));
val_t *dnMatCValues; CUDA_CALL(cudaMalloc((void **)(&dnMatCValues), N * sizeof(*dnMatCValues)));
val_t *out; CUDA_CALL(cudaMalloc((void **)(&out), N * sizeof(*out)));
cudaDeviceSynchronize();
cuMemsetD32(CUdeviceptr(dnMatValues), 0x3F800000, N); // very float-specific
CUDA_CALL(cudaMemset(dnMatCValues, 0, N * sizeof(*dnMatValues)));
cusparseConstDnMatDescr_t dnMat;
CUSPARSE_CALL(cusparseCreateConstDnMat(&dnMat, N, 1, N, dnMatValues, valueType, CUSPARSE_ORDER_ROW));
cusparseDnMatDescr_t dnMatC;
CUSPARSE_CALL(cusparseCreateDnMat(&dnMatC, N, 1, N, dnMatCValues, valueType, CUSPARSE_ORDER_ROW));
printf("CUSPARSE: SpMM\n");
CUSPARSE_CALL(cusparseSpMM(
cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha,
spMat,
dnMat,
&beta,
dnMatC,
valueType,
CUSPARSE_SPMM_CSR_ALG1,
out
));
printf("Cleanup\n");
CUSPARSE_CALL(cusparseDestroySpMat(spMat));
CUSPARSE_CALL(cusparseDestroyDnMat(dnMat));
CUSPARSE_CALL(cusparseDestroyDnMat(dnMatC));
CUSPARSE_CALL(cusparseDestroy(cusparseHandle));
}
// Entry point: run the reproducer with an nnz just past the 2^31 boundary.
int main() {
    int64_t const nnzOk = (INT64_C(1) << 31) - 128; // this size works
    // mwe(1000000, nnzOk);
    mwe(1000000, nnzOk + 1); // one more nonzero triggers the failure
    return 0;
}
Reported upstream as https://developer.nvidia.com/bugs/4966852 In response, I have received confirmation that it can be reproduced and it was routed to CUSPARSE team. |
Thanks for looking into this! |
Describe the bug
For sparse matrices with a relatively large number of nonzero elements I get the following error
To reproduce
The Minimal Working Example (MWE) for this bug. If I change
nz = 2^31 - 2^7
the code completes without error. This seems to be independent of dim. Tested for a few choices of dim between 10^6 and 10^7.
Manifest.toml
Expected behavior
The code completes as for nz = 2^31 - 2^7.
Version info
Details on Julia:
Details on CUDA:
The text was updated successfully, but these errors were encountered: