I am trying to run a matrix inversion from the device. This logic works fine if called from the host.
Compilation line is as follows (Linux):
nvcc -ccbin g++ -arch=sm_35 -rdc=true simple-inv.cu -o simple-inv -lcublas_device -lcudadevrt I get the following warning that I cannot seem to resolve. (My GPU is Kepler. I don't know why it is trying to link to Maxwell routines. I have Cuda 6.5-14):
nvlink warning : SM Arch ('sm_35') not found in '/usr/local/cuda/bin/../targets/x86_64-linux/lib/libcublas_device.a:maxwell_sm50_sgemm.o' The program runs with:
handle 0 n = 3 simple-inv.cu:63 Error [an illegal memory access was encountered] The test program is as follows:
#include <stdio.h> #include <stdlib.h> #include <math.h> #include <cuda_runtime.h> #include <cublas_v2.h> #define PERR(call) \ if (call) {\ fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\ cudaGetErrorString(cudaGetLastError()));\ exit(1);\ } #define ERRCHECK \ if (cudaPeekAtLastError()) { \ fprintf(stderr, "%s:%d Error [%s]\n", __FILE__, __LINE__,\ cudaGetErrorString(cudaGetLastError()));\ exit(1);\ } __global__ void inv_kernel(float *a_i, float *c_o, int n) { int p[3], info[1], batch; cublasHandle_t hdl; cublasStatus_t status = cublasCreate_v2(&hdl); printf("handle %d n = %d\n", status, n); info[0] = 0; batch = 1; float *a[] = {a_i}; const float *aconst[] = {a_i}; float *c[] = {c_o}; // See // http://docs.nvidia.com/cuda/pdf/CUDA_Dynamic_Parallelism_Programming_Guide.pdf //http://stackoverflow.com/questions/27094612/cublas-matrix-inversion-from-device status = cublasSgetrfBatched(hdl, n, a, n, p, info, batch); __syncthreads(); printf("rf %d info %d\n", status, info[0]); status = cublasSgetriBatched(hdl, n, aconst, n, p, c, n, info, batch); __syncthreads(); printf("ri %d info %d\n", status, info[0]); cublasDestroy_v2(hdl); printf("done\n"); } static void run_inv(float *in, float *out, int n) { float *a_d, *c_d; PERR(cudaMalloc(&a_d, n*n*sizeof(float))); PERR(cudaMalloc(&c_d, n*n*sizeof(float))); PERR(cudaMemcpy(a_d, in, n*n*sizeof(float), cudaMemcpyHostToDevice)); inv_kernel<<<1, 1>>>(a_d, c_d, n); cudaDeviceSynchronize(); ERRCHECK; PERR(cudaMemcpy(out, c_d, n*n*sizeof(float), cudaMemcpyDeviceToHost)); PERR(cudaFree(a_d)); PERR(cudaFree(c_d)); } int main(int argc, char **argv) { float c[9]; float a[] = { 1, 2, 3, 0, 4, 5, 1, 0, 6 }; run_inv(a, c, 3); return 0; } I have followed the guide at http://docs.nvidia.com/cuda/cublas/index.html#device-api section 2.1.9, but I suspect I have overlooked something.
Note: Edited on 11/24 to use correct pointer inputs. This still reports illegal memory access inside the kernel.
(float**)a_ilooks very suspicious. Surely you mean to pass the address ofa_iand not its value?*a[] = {a_i}; cublasSgetrfBatched(..., a, ....)andcublasSgetrfBatched(..., (float**)a_i, ...)are not equivalent, and if you think they are, then you need to revise the theory of pointers in C++.