I'm trying to inverse a matrix using linear equation solver through cublas CUDA library.
The original equation is in form of:
Ax = B = I I - identity matrix A - The matrix I'm trying to inverse x - (hopefully) the inverse(A) matrix I'd like to perform LU decomposition, receive the following:
LUx = B L - is a lower triangle matrix U - is a upper triangle matrix Here is a good example who can show what I'm trying to do:
http://www.youtube.com/watch?v=dza5JTvMpzk
For the sake of discussion, let's assume I already have LU decomposition of A. (A = LU). I'm trying to find the inverse in a two steps series:
Ax = B = I LUx = B = I 1st step: Declare y: **Ly = I** solve 1st linear equation 2nd step, Solve the following linear equation **Ux = y** find X = inverse(A) - *Bingo!@!* For now I have no idea what am I doing wrong here. There might be two assumptions, either I'm not using cublas properly or cublas throws an exception for no reason.
See my code attached, there is Matlab code at the end:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> //#include "cublas.h" #include "cublas_v2.h" int main() { cudaError_t cudaStat; cublasStatus_t stat; cublasHandle_t handle; //cublasInit(); stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); return -1; } unsigned int size = 3; // Allowcate device matrixes float* p_h_LowerTriangle; float* p_d_LowerTriangle; p_h_LowerTriangle = new float[size*size]; p_h_LowerTriangle[0] = 1.f; p_h_LowerTriangle[1] = 0.f; p_h_LowerTriangle[2] = 0.f; p_h_LowerTriangle[3] = 2.56f; p_h_LowerTriangle[4] = 1.f; p_h_LowerTriangle[5] = 0.f; p_h_LowerTriangle[6] = 5.76f; p_h_LowerTriangle[7] = 3.5f; p_h_LowerTriangle[8] = 1.f; float* p_h_UpperTriangle; float* p_d_UpperTriangle; p_h_UpperTriangle = new float[size*size]; p_h_UpperTriangle[0] = 25.f; p_h_UpperTriangle[1] = 5.f; p_h_UpperTriangle[2] = 1.f; p_h_UpperTriangle[3] = 0.f; p_h_UpperTriangle[4] = -4.8f; p_h_UpperTriangle[5] = -1.56f; p_h_UpperTriangle[6] = 0.f; p_h_UpperTriangle[7] = 0.f; p_h_UpperTriangle[8] = 0.7f; float* p_h_IdentityMatrix; float* p_d_IdentityMatrix; p_h_IdentityMatrix = new float[size*size]; p_h_IdentityMatrix[0] = 1.f; p_h_IdentityMatrix[1] = 0.f; p_h_IdentityMatrix[2] = 0.f; p_h_IdentityMatrix[3] = 0.f; p_h_IdentityMatrix[4] = 1.f; p_h_IdentityMatrix[5] = 0.f; p_h_IdentityMatrix[6] = 0.f; p_h_IdentityMatrix[7] = 0.f; p_h_IdentityMatrix[8] = 1.f; cudaMalloc ((void**)&p_d_LowerTriangle, size*size*sizeof(float)); cudaMalloc ((void**)&p_d_UpperTriangle, size*size*sizeof(float)); cudaMalloc ((void**)&p_d_IdentityMatrix, size*size*sizeof(float)); float* p_d_tempMatrix; cudaMalloc ((void**)&p_d_tempMatrix, size*size*sizeof(float)); stat = cublasSetMatrix(size,size,sizeof(float),p_h_LowerTriangle,size,p_d_LowerTriangle,size); stat = cublasSetMatrix(size,size,sizeof(float),p_h_UpperTriangle,size,p_d_UpperTriangle,size); stat = cublasSetMatrix(size,size,sizeof(float),p_h_IdentityMatrix,size,p_d_IdentityMatrix,size); cudaDeviceSynchronize(); // 1st phase - solve Ly = b const float alpha = 1.f; // Function solves the triangulatr linear system with multiple right hand sides, function overrides b as a result stat = cublasStrsm(handle, cublasSideMode_t::CUBLAS_SIDE_LEFT, cublasFillMode_t::CUBLAS_FILL_MODE_LOWER, cublasOperation_t::CUBLAS_OP_N, CUBLAS_DIAG_UNIT, size, size, &alpha, p_d_LowerTriangle, size, p_d_IdentityMatrix, size); //////////////////////////////////// // TODO: printf 1st phase the results cudaDeviceSynchronize(); cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost); cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost); cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost); stat =cublasGetMatrix(size,size,sizeof(float),p_d_IdentityMatrix,size,p_h_IdentityMatrix,size); stat =cublasGetMatrix(size,size,sizeof(float),p_d_UpperTriangle,size,p_h_UpperTriangle,size); stat =cublasGetMatrix(size,size,sizeof(float),p_d_LowerTriangle,size,p_h_LowerTriangle,size); //////////////////////////////////// // 2nd phase - solve Ux = b stat = cublasStrsm(handle, cublasSideMode_t::CUBLAS_SIDE_LEFT, cublasFillMode_t::CUBLAS_FILL_MODE_UPPER, cublasOperation_t::CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, size, &alpha, p_d_UpperTriangle, size, p_d_IdentityMatrix, size); // TODO print the results cudaDeviceSynchronize(); //////////////////////////////////// cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost); cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost); cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost); //////////////////////////////////// cublasDestroy(handle); //cublasShutdown(); cudaFree(p_d_LowerTriangle); cudaFree(p_d_UpperTriangle); cudaFree(p_d_IdentityMatrix); free(p_h_LowerTriangle); free(p_h_UpperTriangle); free(p_h_IdentityMatrix); return 0; } Matlab code - works perfect:
clear all UpperMatrix = [ 25 5 1 ; 0 -4.8 -1.56 ; 0 0 0.7 ] LowerMatrix = [1 0 0 ; 2.56 1 0 ; 5.76 3.5 1 ] IdentityMatrix = eye(3) % 1st phase solution otps1.LT = true; y = linsolve(LowerMatrix,IdentityMatrix,otps1) %2nd phase solution otps2.UT = true; y = linsolve(UpperMatrix,y,otps2) MATLAB output
UpperMatrix =
25.0000 5.0000 1.0000 0 -4.8000 -1.5600 0 0 0.7000 LowerMatrix =
1.0000 0 0 2.5600 1.0000 0 5.7600 3.5000 1.0000 IdentityMatrix =
1 0 0 0 1 0 0 0 1 y =
1.0000 0 0 -2.5600 1.0000 0 3.2000 -3.5000 1.0000 y =
0.0476 -0.0833 0.0357 -0.9524 1.4167 -0.4643 4.5714 -5.0000 1.4286 UpperMatrix =
25.0000 5.0000 1.0000 0 -4.8000 -1.5600 0 0 0.7000 LowerMatrix =
1.0000 0 0 2.5600 1.0000 0 5.7600 3.5000 1.0000 IdentityMatrix =
1 0 0 0 1 0 0 0 1 y =
1.0000 0 0 -2.5600 1.0000 0 3.2000 -3.5000 1.0000 >> >> >> >> Inverse_LU_UT UpperMatrix =
25.0000 5.0000 1.0000 0 -4.8000 -1.5600 0 0 0.7000 LowerMatrix =
1.0000 0 0 2.5600 1.0000 0 5.7600 3.5000 1.0000 IdentityMatrix =
1 0 0 0 1 0 0 0 1 y =
1.0000 0 0 -2.5600 1.0000 0 3.2000 -3.5000 1.0000 y =
0.0476 -0.0833 0.0357 -0.9524 1.4167 -0.4643 4.5714 -5.0000 1.4286 I have no idea what am I doing wrong here, I suspect the cublasCreate operation. Every time I hit the command:
cublasStatus_t stat; cublasHandle_t handle; stat = cublasCreate(&handle); stat and handle variables are both valid.
But VS10 output several error messages specified the following:
First chance exception at 0x... microsoft C++ exception cudaError_enum at memory location 0x...
Some may say it's an internal cublas error message, handled by the library itself.