2

I'm trying to inverse a matrix using linear equation solver through cublas CUDA library.

The original equation is in form of:

Ax = B = I I - identity matrix A - The matrix I'm trying to inverse x - (hopefully) the inverse(A) matrix 

I'd like to perform LU decomposition, receive the following:

LUx = B L - is a lower triangle matrix U - is a upper triangle matrix 

Here is a good example who can show what I'm trying to do:

http://www.youtube.com/watch?v=dza5JTvMpzk

For the sake of discussion, let's assume I already have LU decomposition of A. (A = LU). I'm trying to find the inverse in a two steps series:

Ax = B = I LUx = B = I 1st step: Declare y: **Ly = I** solve 1st linear equation 2nd step, Solve the following linear equation **Ux = y** find X = inverse(A) - *Bingo!@!* 

For now I have no idea what am I doing wrong here. There might be two assumptions, either I'm not using cublas properly or cublas throws an exception for no reason.

See my code attached, there is Matlab code at the end:

 #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> //#include "cublas.h" #include "cublas_v2.h" int main() { cudaError_t cudaStat; cublasStatus_t stat; cublasHandle_t handle; //cublasInit(); stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); return -1; } unsigned int size = 3; // Allowcate device matrixes float* p_h_LowerTriangle; float* p_d_LowerTriangle; p_h_LowerTriangle = new float[size*size]; p_h_LowerTriangle[0] = 1.f; p_h_LowerTriangle[1] = 0.f; p_h_LowerTriangle[2] = 0.f; p_h_LowerTriangle[3] = 2.56f; p_h_LowerTriangle[4] = 1.f; p_h_LowerTriangle[5] = 0.f; p_h_LowerTriangle[6] = 5.76f; p_h_LowerTriangle[7] = 3.5f; p_h_LowerTriangle[8] = 1.f; float* p_h_UpperTriangle; float* p_d_UpperTriangle; p_h_UpperTriangle = new float[size*size]; p_h_UpperTriangle[0] = 25.f; p_h_UpperTriangle[1] = 5.f; p_h_UpperTriangle[2] = 1.f; p_h_UpperTriangle[3] = 0.f; p_h_UpperTriangle[4] = -4.8f; p_h_UpperTriangle[5] = -1.56f; p_h_UpperTriangle[6] = 0.f; p_h_UpperTriangle[7] = 0.f; p_h_UpperTriangle[8] = 0.7f; float* p_h_IdentityMatrix; float* p_d_IdentityMatrix; p_h_IdentityMatrix = new float[size*size]; p_h_IdentityMatrix[0] = 1.f; p_h_IdentityMatrix[1] = 0.f; p_h_IdentityMatrix[2] = 0.f; p_h_IdentityMatrix[3] = 0.f; p_h_IdentityMatrix[4] = 1.f; p_h_IdentityMatrix[5] = 0.f; p_h_IdentityMatrix[6] = 0.f; p_h_IdentityMatrix[7] = 0.f; p_h_IdentityMatrix[8] = 1.f; cudaMalloc ((void**)&p_d_LowerTriangle, size*size*sizeof(float)); cudaMalloc ((void**)&p_d_UpperTriangle, size*size*sizeof(float)); cudaMalloc ((void**)&p_d_IdentityMatrix, size*size*sizeof(float)); float* p_d_tempMatrix; cudaMalloc ((void**)&p_d_tempMatrix, size*size*sizeof(float)); stat = cublasSetMatrix(size,size,sizeof(float),p_h_LowerTriangle,size,p_d_LowerTriangle,size); stat = cublasSetMatrix(size,size,sizeof(float),p_h_UpperTriangle,size,p_d_UpperTriangle,size); stat = cublasSetMatrix(size,size,sizeof(float),p_h_IdentityMatrix,size,p_d_IdentityMatrix,size); cudaDeviceSynchronize(); // 1st phase - solve Ly = b const float alpha = 1.f; // Function solves the triangulatr linear system with multiple right hand sides, function overrides b as a result stat = cublasStrsm(handle, cublasSideMode_t::CUBLAS_SIDE_LEFT, cublasFillMode_t::CUBLAS_FILL_MODE_LOWER, cublasOperation_t::CUBLAS_OP_N, CUBLAS_DIAG_UNIT, size, size, &alpha, p_d_LowerTriangle, size, p_d_IdentityMatrix, size); //////////////////////////////////// // TODO: printf 1st phase the results cudaDeviceSynchronize(); cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost); cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost); cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost); stat =cublasGetMatrix(size,size,sizeof(float),p_d_IdentityMatrix,size,p_h_IdentityMatrix,size); stat =cublasGetMatrix(size,size,sizeof(float),p_d_UpperTriangle,size,p_h_UpperTriangle,size); stat =cublasGetMatrix(size,size,sizeof(float),p_d_LowerTriangle,size,p_h_LowerTriangle,size); //////////////////////////////////// // 2nd phase - solve Ux = b stat = cublasStrsm(handle, cublasSideMode_t::CUBLAS_SIDE_LEFT, cublasFillMode_t::CUBLAS_FILL_MODE_UPPER, cublasOperation_t::CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, size, &alpha, p_d_UpperTriangle, size, p_d_IdentityMatrix, size); // TODO print the results cudaDeviceSynchronize(); //////////////////////////////////// cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost); cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost); cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost); //////////////////////////////////// cublasDestroy(handle); //cublasShutdown(); cudaFree(p_d_LowerTriangle); cudaFree(p_d_UpperTriangle); cudaFree(p_d_IdentityMatrix); free(p_h_LowerTriangle); free(p_h_UpperTriangle); free(p_h_IdentityMatrix); return 0; } 

Matlab code - works perfect:

 clear all UpperMatrix = [ 25 5 1 ; 0 -4.8 -1.56 ; 0 0 0.7 ] LowerMatrix = [1 0 0 ; 2.56 1 0 ; 5.76 3.5 1 ] IdentityMatrix = eye(3) % 1st phase solution otps1.LT = true; y = linsolve(LowerMatrix,IdentityMatrix,otps1) %2nd phase solution otps2.UT = true; y = linsolve(UpperMatrix,y,otps2) 

MATLAB output

UpperMatrix =

25.0000 5.0000 1.0000 0 -4.8000 -1.5600 0 0 0.7000 

LowerMatrix =

1.0000 0 0 2.5600 1.0000 0 5.7600 3.5000 1.0000 

IdentityMatrix =

 1 0 0 0 1 0 0 0 1 

y =

 1.0000 0 0 -2.5600 1.0000 0 3.2000 -3.5000 1.0000 

y =

 0.0476 -0.0833 0.0357 -0.9524 1.4167 -0.4643 4.5714 -5.0000 1.4286 

UpperMatrix =

25.0000 5.0000 1.0000 0 -4.8000 -1.5600 0 0 0.7000 

LowerMatrix =

1.0000 0 0 2.5600 1.0000 0 5.7600 3.5000 1.0000 

IdentityMatrix =

 1 0 0 0 1 0 0 0 1 

y =

 1.0000 0 0 -2.5600 1.0000 0 3.2000 -3.5000 1.0000 >> >> >> >> Inverse_LU_UT 

UpperMatrix =

25.0000 5.0000 1.0000 0 -4.8000 -1.5600 0 0 0.7000 

LowerMatrix =

1.0000 0 0 2.5600 1.0000 0 5.7600 3.5000 1.0000 

IdentityMatrix =

 1 0 0 0 1 0 0 0 1 

y =

 1.0000 0 0 -2.5600 1.0000 0 3.2000 -3.5000 1.0000 

y =

 0.0476 -0.0833 0.0357 -0.9524 1.4167 -0.4643 4.5714 -5.0000 1.4286 

I have no idea what am I doing wrong here, I suspect the cublasCreate operation. Every time I hit the command:

 cublasStatus_t stat; cublasHandle_t handle; stat = cublasCreate(&handle); 

stat and handle variables are both valid.

But VS10 output several error messages specified the following:

First chance exception at 0x... microsoft C++ exception cudaError_enum at memory location 0x...

Some may say it's an internal cublas error message, handled by the library itself.

7
  • Can you edit your question to show the absolute shortest complete example which shows the problem. Three lines of code taken out of context and an incomplete error message isn't enough information to help you. Commented Jun 16, 2013 at 9:04
  • I'll give a whole example shortly Commented Jun 16, 2013 at 9:07
  • 1
    Is that really the shortest example you can offer showing the cublas runtime exception your original question was asking about? Commented Jun 16, 2013 at 12:32
  • LOL started by solving linear equation using LU decomposition, couldn't make it run properly yet... this is killing me Commented Jun 16, 2013 at 12:57
  • I might have some confusion between row and column leading matrix, not sure about it yet Commented Jun 16, 2013 at 13:00

1 Answer 1

1

You know what, cuBlas stores matrix in column-major way, but Matlab and C use row-major way.

Rearrange your initialization and run. Result should be good.

Sign up to request clarification or add additional context in comments.

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.