I am trying to make an existing piece of software that uses hand tuned sparse multiplication of special CSC matrices that have exactly k nonzero elements per column. I decided to use cusparse for the job, but unfortunately I encounter that the matrix multiplication takes over 7 seconds in some cases, which is much slower than the CPU version of the code. (largest sparse matrix concerned is 19871x1000 largest dense matrix concerned is 1000*150, nnz = 101000).
When trying to reproduce the problem in a self contained example, I always encounter an "illegal memory access error" when nnz != sparse_cols.
After some investigation turns out that if I increase the size of matrices 10fold the problem disappears. If I make the matrices small enough I don't experience crashes. However with large matrices the sparse matrix has to not cross over some degree of denseness, otherwise multiplication produces a bunch of illegal memory accesses. Here is the code that exibits the problem:
#include <cuda.h> #include <cusparse.h> #include <iostream> #include <stdlib.h> #define CALL_CUDA( err ) \ { if (err != cudaSuccess) \ {std::cout<<"cuda Error "<< cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<<__LINE__<<"\n"; exit(EXIT_FAILURE); }\ } int main(){ //cusparse status and handle cusparseStatus_t status; cusparseHandle_t handle = 0; status = cusparseCreate(&handle); if (status != CUSPARSE_STATUS_SUCCESS){ std::cout << "Error creating handle: " << status << std::endl; } //Set matrix description cusparseMatDescr_t descr; //Describe the matrices cusparseCreateMatDescr(&descr); cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); //Sparse matrix properties int sparse_rows = 19871; int sparse_cols = 1000; int nnz_new = 101000; //int nnz_new = 1000; //Works with that value //Dense matrix properties int bmat_rows = 1000; int bmat_cols = 150; //Generate a special type of sparse matrix that has exactly k nonzero elements in each column in CSC format float * amat_vals; CALL_CUDA(cudaMallocHost((void **)&amat_vals, nnz_new*sizeof(float))); int * amat_idx; CALL_CUDA(cudaMallocHost((void **)&amat_idx, nnz_new*sizeof(int))); int * crccolptr; CALL_CUDA(cudaMallocHost((void **)&crccolptr, (sparse_cols+1)*sizeof(int))); //Fill in values with random values for (int i = 0; i < nnz_new; i++){ amat_vals[i] = (float)rand()/(float)RAND_MAX; } //Generate indexes for those rows for (int i = 0; i < nnz_new; i++){ amat_idx[i] = rand() % (sparse_rows - 1); } //generate crccolptr int k = (int)(nnz_new/sparse_cols); //Number of elements per row for (int i = 0; i < sparse_cols; i++){ crccolptr[i] = k*i; } crccolptr[sparse_cols] = nnz_new; //Generate bmat_array with random floats float * bmat_array; CALL_CUDA(cudaMallocHost((void **)&bmat_array, bmat_rows*bmat_cols*sizeof(float))); for (int i = 0; i < bmat_rows*bmat_cols; i++){ bmat_array[i] = (float)rand()/(float)RAND_MAX; } //generate array for output float * outmatrix_test; CALL_CUDA(cudaMallocHost((void **)&outmatrix_test, sparse_rows*bmat_cols*sizeof(float))); //Allocate and copy device memory for sparse matrix float * cudavals; int * colIdx; int * colPtr; CALL_CUDA(cudaMalloc((void **)&colPtr, (sparse_cols + 1)*sizeof(int))); CALL_CUDA(cudaMemcpy(colPtr, crccolptr, (sparse_cols + 1)*sizeof(int), cudaMemcpyHostToDevice)); CALL_CUDA(cudaMalloc((void **)&cudavals, nnz_new*sizeof(float))); CALL_CUDA(cudaMalloc((void **)&colIdx, nnz_new*sizeof(int))); CALL_CUDA(cudaMemcpy(cudavals, amat_vals, nnz_new*sizeof(float), cudaMemcpyHostToDevice)); CALL_CUDA(cudaMemcpy(colIdx, amat_idx, nnz_new*sizeof(int), cudaMemcpyHostToDevice)); //Allocate and copy device memory for dense matrix float * B_gpumatrix; CALL_CUDA(cudaMalloc((void **)&B_gpumatrix, bmat_rows*bmat_cols*sizeof(float))); CALL_CUDA(cudaMemcpy(B_gpumatrix, bmat_array, bmat_rows*bmat_cols*sizeof(float), cudaMemcpyHostToDevice)); //Allocate output matrix float * outmatrix_gpu; CALL_CUDA(cudaMalloc((void **)&outmatrix_gpu, (sparse_rows*bmat_cols)*sizeof(float))); //sparse_cols is passed as sparse_rows, because we're multiplying a CSC matrix instead of a CSR so we need // to transpose it and invert the rows and columns. const float alpha = 1.0; const float beta = 0.0; /* float * outmatrix_gpu2; CALL_CUDA(cudaMalloc((void **)&outmatrix_gpu2, (sparse_rows*sparse_cols)*sizeof(float))); cusparseStatus_t mat_mul = cusparseScsc2dense(handle, sparse_rows, sparse_cols, descr, cudavals, colIdx, colPtr, outmatrix_gpu2, sparse_rows); float * outmatrix_test2; CALL_CUDA(cudaMallocHost((void **)&outmatrix_test2, sparse_rows*sparse_cols*sizeof(float))); CALL_CUDA(cudaMemcpy(outmatrix_test2, outmatrix_gpu2, (sparse_rows*sparse_cols)*sizeof(float), cudaMemcpyDeviceToHost)); */ cusparseStatus_t mat_mul = cusparseScsrmm(handle, //Cusparse handle CUSPARSE_OPERATION_TRANSPOSE, //Transposing the matrix sparse_cols, //Number of sparse rows. Since we're using CSC matrix it's the columns. bmat_cols, //Number of columns of the dense matrix sparse_rows, //Number of sparse cols, Since we're using CSC matrix it's the rows nnz_new, //Non zero elements &alpha, //Pointer to alpha (1.0) descr, //Description of the matrix cudavals, //The values vector colPtr, //The column pointer colIdx, //The indexes of the sparse matrix B_gpumatrix, //Dense matrix array bmat_rows, //ldb - the rows of the dense matrix &beta, //Pointer to beta. It's 0 outmatrix_gpu, //Pointer to the output matrix sparse_rows); //ldc - leading dimensions of the output matrix. if (mat_mul != CUSPARSE_STATUS_SUCCESS){ std::cout << "MULTIPLICATION ERROR: " << mat_mul << std::endl; } cudaThreadSynchronize(); //Syncs before copy back to memory should not be necessary cudaDeviceSynchronize(); //Copy matrix back to host CALL_CUDA(cudaMemcpy(outmatrix_test, outmatrix_gpu, (sparse_rows*bmat_cols)*sizeof(float), cudaMemcpyDeviceToHost)); CALL_CUDA(cudaFree(outmatrix_gpu)); CALL_CUDA(cudaFree(cudavals)); CALL_CUDA(cudaFree(colPtr)); CALL_CUDA(cudaFree(colIdx)); CALL_CUDA(cudaFree(B_gpumatrix)); CALL_CUDA(cudaFreeHost(crccolptr)); CALL_CUDA(cudaFreeHost(amat_vals)); CALL_CUDA(cudaFreeHost(amat_idx)); CALL_CUDA(cudaFreeHost(bmat_array)); CALL_CUDA(cudaFreeHost(outmatrix_test)); return 1; } I believe i am generating a valid sparse matrix, because I can convert it to dense one using the appropariate cusparse function without triggering any invalid memory accesses.
When running the above code under cuda-memcheck you can see many illegal accesses from within the cusparseScsrmm. Running without cuda-memcheck you would see an error in the first cuda operation after the matrix multiplication.
Any ideas what I am doing wrong? I hope that if I can solve this problem, I would be able to diagnoze (or at least isolate) a self contained example that exhibits the painfully slow matrix multiplications.
EDIT:
Using smaller matrices I don't experience the problem. sparse matrix with 50*200 works fine for NNZ until about 1000, but takes forever with NNZ = 5000 (I killed it after half a minute). Increasing matrix size to 200*500 works performs instantaneously with NNZ = 5000.... Strange.
EDIT2:
The original number of nnz works if I increase the size of the matrices 10fold.