
I'm relatively new to CUDA programming. I understand the programming model and have already written a few basic kernels. I know how to apply a kernel to each element of a matrix (stored as a 1D array), but now I'm trying to figure out how to apply the same operation to each row/column of the input matrix.

Let's say I have an MxN matrix and a vector of length N. I would like to add the vector (though it could be any other math operation) to each row of the matrix. The serial code for this operation is:

    for (int c = 0; c < columns; c++) {
        for (int r = 0; r < rows; r++) {
            M[r * rows + c] += V[c];
        }
    }

Now the CUDA code for this operation should be quite straightforward: I spawn as many CUDA threads as there are elements and apply this kernel:

    __global__ void kernel(const unsigned int size, float* matrix, const float* vector)
    {
        // get the current element index for the thread
        unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < size) {
            // sum the current element with the corresponding vector element
            matrix[idx] += vector[threadIdx.x];
        }
    }

It runs, but the result is not correct. Actually, it's correct if I transpose the matrix after the kernel completes its work. Unfortunately, I have no clue why it works this way. Could you help me figure out this problem? Thanks in advance.

EDIT #1

I launch the kernel using:

    int block_size = 64;
    int grid_size = (M * N + block_size - 1) / block_size;
    kernel<<<grid_size, block_size>>>(M * N, matrix, vector);
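(As an illustration with made-up sizes: for M = 20 rows and N = 10 columns, grid_size = (200 + 63) / 64 = 4, so the launch runs 4 × 64 = 256 threads, and the idx < size guard filters out the 56 excess threads.)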

EDIT #2

I solved the problem by fixing the CPU code as suggested by @RobertCrovella:

M[r * columns + c] += V[c]; 

The row offset must be r * columns: in row-major storage each row holds columns elements, so the stride between consecutive rows is the number of columns, not the number of rows.
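For reference, the corrected serial loop in full:

    for (int c = 0; c < columns; c++) {
        for (int r = 0; r < rows; r++) {
            M[r * columns + c] += V[c]; // row offset uses the column count (row-major)
        }
    }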

6 Comments

  • Could you include how you launch the kernel? On what grid and block sizes? Commented Apr 11, 2015 at 16:21
  • Your kernel should work if you launch one threadblock per row, and as many threads per block as there are columns in your matrix. If you want to sum the vector to each row of your matrix, and assuming C-style row-major storage, your CPU code is actually incorrect; it should be M[r * columns + c]. "Questions seeking debugging help ("why isn't this code working?") must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself. See: MCVE" Commented Apr 11, 2015 at 16:34
  • I believe variables block_size and grid_size should be of dim3 type, which is a struct, instead of int. Not sure whether it should work with int too. Commented Apr 11, 2015 at 16:48
  • Here is a completely worked example, demonstrating (I believe) proper behavior, with your exact kernel. @chrk one-dimensional block and grid dimensions can be int instead of dim3. Commented Apr 11, 2015 at 16:50
  • @gr1ph00n if you're happy with what you've got, why don't you post an answer indicating what you did. I don't believe your changes are correct, by the way, unless your vector length happens to be 64 (i.e. 64 columns in your matrix); vector[threadIdx.x] will span 64 elements if your block_size is 64. Commented Apr 11, 2015 at 16:54

2 Answers


The kernel shown in the question could be used without modification to sum a vector to each of the rows of a matrix (assuming C-style row-major storage), subject to certain limitations. A demonstration is here.

The main limitation of that approach is that the maximum vector length and therefore matrix width that can be handled is equal to the maximum number of threads per block, which on current CUDA 7-supported GPUs is 1024.

We can eliminate that limitation with a slight modification to the vector indexing, passing the row width (number of columns) as a parameter to the kernel. With this modification, we should be able to handle arbitrary matrix (and vector) sizes.
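In isolation, the modified indexing might look like this (a minimal sketch; the complete, compilable example below is the authoritative version):

    __global__ void vec_mat_row_add(const unsigned int height, const unsigned int width,
                                    float* matrix, const float* vector)
    {
        unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < height * width)
            matrix[idx] += vector[idx % width]; // column index = idx % width (row-major)
    }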

EDIT: Based on the discussion in the comments, the OP wants to know how to handle row-major or column-major underlying storage. The following example uses a templated kernel to select either row-major or column-major underlying storage, and also shows one possible CUBLAS method for doing an add-vector-to-each-matrix-row operation using the rank-1 update function:

    $ cat t712.cu
    #include <iostream>
    #include <cublas_v2.h>
    #define ROWS 20
    #define COLS 10
    #define nTPB 64
    #define ROW_MAJOR 0
    #define COL_MAJOR 1

    template <int select, typename T>
    __global__ void vec_mat_row_add(const unsigned int height, const unsigned int width, T* matrix, const T* vector)
    {
      // get the current element index for the thread
      unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < height*width) {
        // sum the current element with the appropriate vector element
        if (select == ROW_MAJOR)
          matrix[idx] += vector[idx%width];
        else // COL_MAJOR
          matrix[idx] += vector[idx/height];
      }
    }

    int main(){
      float *h_mat, *d_mat, *h_vec, *d_vec;
      const unsigned int msz = ROWS*COLS*sizeof(float);
      const unsigned int vsz = COLS*sizeof(float);
      h_mat = (float *)malloc(msz);
      h_vec = (float *)malloc(vsz);
      cudaMalloc(&d_mat, msz);
      cudaMalloc(&d_vec, vsz);
      for (int i = 0; i < COLS; i++) h_vec[i] = i; // set vector to 0,1,2, ...
      cudaMemcpy(d_vec, h_vec, vsz, cudaMemcpyHostToDevice);

      // test row-major case
      cudaMemset(d_mat, 0, msz); // set matrix to zero
      vec_mat_row_add<ROW_MAJOR><<<(ROWS*COLS + nTPB - 1)/nTPB, nTPB>>>(ROWS, COLS, d_mat, d_vec);
      cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
      std::cout << "Row-major result: " << std::endl;
      for (int i = 0; i < ROWS; i++){
        for (int j = 0; j < COLS; j++) std::cout << h_mat[i*COLS+j] << " ";
        std::cout << std::endl;}

      // test column-major case
      cudaMemset(d_mat, 0, msz); // set matrix to zero
      vec_mat_row_add<COL_MAJOR><<<(ROWS*COLS + nTPB - 1)/nTPB, nTPB>>>(ROWS, COLS, d_mat, d_vec);
      cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
      std::cout << "Column-major result: " << std::endl;
      for (int i = 0; i < ROWS; i++){
        for (int j = 0; j < COLS; j++) std::cout << h_mat[j*ROWS+i] << " ";
        std::cout << std::endl;}

      // test CUBLAS, doing matrix-vector add using <T>ger
      cudaMemset(d_mat, 0, msz); // set matrix to zero
      float *d_ones, *h_ones;
      h_ones = (float *)malloc(ROWS*sizeof(float));
      for (int i = 0; i < ROWS; i++) h_ones[i] = 1.0f;
      cudaMalloc(&d_ones, ROWS*sizeof(float));
      cudaMemcpy(d_ones, h_ones, ROWS*sizeof(float), cudaMemcpyHostToDevice);
      cublasHandle_t ch;
      cublasCreate(&ch);
      float alpha = 1.0f;
      cublasStatus_t stat = cublasSger(ch, ROWS, COLS, &alpha, d_ones, 1, d_vec, 1, d_mat, ROWS);
      if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
      cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
      std::cout << "CUBLAS Column-major result: " << std::endl;
      for (int i = 0; i < ROWS; i++){
        for (int j = 0; j < COLS; j++) std::cout << h_mat[j*ROWS+i] << " ";
        std::cout << std::endl;}
      return 0;
    }
    $ nvcc -o t712 t712.cu -lcublas
    $ ./t712
    Row-major result:
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    Column-major result:
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    CUBLAS Column-major result:
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    $
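Why the rank-1 update works: <T>ger computes A ← αxyᵀ + A for a column-major m×n matrix A. Choosing x as a length-ROWS vector of ones, y as the length-COLS vector to be added, and α = 1, the outer product xyᵀ replicates y in every row, so the update adds y to each row of the matrix.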

For brevity of presentation, I've not included proper CUDA error checking, but that is always a good idea any time you are having trouble with CUDA code. As a proxy/shortcut, you can run your code with cuda-memcheck as a quick check to see if there are any CUDA errors.
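A minimal sketch of such error checking (the macro name is illustrative, not from the code above):

    #include <cstdio>
    #include <cstdlib>

    // --- wrap every CUDA runtime call; report and abort on failure
    #define cudaCheck(call)                                           \
      do {                                                            \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
          fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                  cudaGetErrorString(err), __FILE__, __LINE__);       \
          exit(EXIT_FAILURE);                                         \
        }                                                             \
      } while (0)

    // usage: cudaCheck(cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost));
    // after a kernel launch: cudaCheck(cudaGetLastError());
    //                        cudaCheck(cudaDeviceSynchronize());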

Note that we expect all 3 printouts to be identical because that is actually the correct way to display the matrix, regardless of whether the underlying storage is row-major or column-major. The difference in underlying storage is accounted for in the for-loops handling the display output.


5 Comments

And what if I would like to store the data using Fortran-style column-major storage? To explain: I am working with cuBLAS, and the data is already stored Fortran-style. Should I just use the column-major index calculation, or is there something I need to change in the kernel invocation? @RobertCrovella: thanks for your tips!
Why not take a stab at it yourself? If you need help, ask a new question. I don't know if you want to add a vector to rows in column major storage or to columns in column major storage. By the way, if you're using CUBLAS, you may be able to use CUBLAS to do these operations. That might be easier/faster/better.
First of all, sorry if I have confused you. I am trying to understand how things work and then implement them in the way I need. Actually, I am trying to develop a Java matrix library using jcublas. I have implemented many operations, yet I still need to implement a few CUDA kernels for operations which are not available in jcublas, for example adding a vector to each row of a matrix. I'd like to keep all data stored Fortran-style in order to use cuBLAS, thus I need to adapt "my" CUDA kernels to work with column-major matrices. I hope you get what I am trying to achieve.
I get what you're trying to achieve. I've updated my answer to show row-major method, column-major method, and a CUBLAS (column-major) method. If you try and work some of this out yourself, you may improve your skill set. Given a worked row-major solution, and an understanding of the underlying storage difference between row-major and column-major, it's a good test of your understanding (or skill builder) to work out how to convert a row-major example to a column-major example yourself.
Yeah, I did the same column-major method you did and I wanted to post it, but you were faster :) I figured out how to handle column-major in kernels; it's quite easy once you get how stuff works! About <T>ger, I think I missed it when I read the cuBLAS documentation, yet I think it's better to use the custom kernel for memory optimization, since cuBLAS needs one more vector. Thanks for your time and patience.

Robert Crovella has already answered this question providing examples using explicit CUDA kernels and cuBLAS.

I find it useful, for future reference, to also show an example of how to perform row-wise or column-wise operations using CUDA Thrust. In particular, I'm focusing on two problems:

  1. Summing a column vector to all matrix columns;
  2. Summing a row vector to all matrix rows.

The generality of thrust::transform makes it possible to adapt the example below to elementwise operations other than addition (e.g., multiplication, division, subtraction, etc.).
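For instance, a minimal self-contained sketch (toy data, not part of the example below) showing that only the functor changes:

    #include <thrust/device_vector.h>
    #include <thrust/transform.h>
    #include <thrust/functional.h>
    #include <iostream>

    int main() {
        // --- two small device vectors; compute c = a * b elementwise
        thrust::device_vector<float> a(4, 2.0f), b(4, 3.0f), c(4);
        // --- same thrust::transform pattern as in the full example below;
        //     thrust::multiplies replaces thrust::plus to switch the operation
        thrust::transform(a.begin(), a.end(), b.begin(), c.begin(),
                          thrust::multiplies<float>());
        for (int i = 0; i < 4; i++) std::cout << c[i] << " "; // prints: 6 6 6 6
        std::cout << std::endl;
        return 0;
    }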

    #include <thrust/device_vector.h>
    #include <thrust/reduce.h>
    #include <thrust/random.h>
    #include <thrust/sort.h>
    #include <thrust/unique.h>
    #include <thrust/equal.h>
    #include <iostream> // for std::cout
    #include <cstdio>   // for printf

    using namespace thrust::placeholders;

    /*************************************/
    /* CONVERT LINEAR INDEX TO ROW INDEX */
    /*************************************/
    template <typename T>
    struct linear_index_to_row_index : public thrust::unary_function<T,T> {

        T Ncols; // --- Number of columns

        __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

        __host__ __device__ T operator()(T i) { return i / Ncols; }
    };

    /********/
    /* MAIN */
    /********/
    int main()
    {
        /**************************/
        /* SETTING UP THE PROBLEM */
        /**************************/
        const int Nrows = 10; // --- Number of rows
        const int Ncols = 3;  // --- Number of columns

        // --- Random uniform integer distribution between 0 and 100
        thrust::default_random_engine rng;
        thrust::uniform_int_distribution<int> dist1(0, 100);

        // --- Random uniform integer distribution between 1 and 4
        thrust::uniform_int_distribution<int> dist2(1, 4);

        // --- Matrix allocation and initialization
        thrust::device_vector<float> d_matrix(Nrows * Ncols);
        for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist1(rng);

        // --- Column vector allocation and initialization
        thrust::device_vector<float> d_column(Nrows);
        for (size_t i = 0; i < d_column.size(); i++) d_column[i] = (float)dist2(rng);

        // --- Row vector allocation and initialization
        thrust::device_vector<float> d_row(Ncols);
        for (size_t i = 0; i < d_row.size(); i++) d_row[i] = (float)dist2(rng);

        printf("\n\nOriginal matrix\n");
        for (int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for (int j = 0; j < Ncols; j++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "]\n";
        }

        printf("\n\nColumn vector\n");
        for (int i = 0; i < Nrows; i++) std::cout << d_column[i] << "\n";

        printf("\n\nRow vector\n");
        for (int i = 0; i < Ncols; i++) std::cout << d_row[i] << " ";

        /*******************************************************/
        /* ADDING THE SAME COLUMN VECTOR TO ALL MATRIX COLUMNS */
        /*******************************************************/
        thrust::device_vector<float> d_matrix2(d_matrix);

        thrust::transform(d_matrix.begin(), d_matrix.end(),
                          thrust::make_permutation_iterator(
                              d_column.begin(),
                              thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols))),
                          d_matrix2.begin(),
                          thrust::plus<float>());

        printf("\n\nColumn + Matrix -> Result matrix\n");
        for (int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for (int j = 0; j < Ncols; j++)
                std::cout << d_matrix2[i * Ncols + j] << " ";
            std::cout << "]\n";
        }

        /*************************************************/
        /* ADDING THE SAME ROW VECTOR TO ALL MATRIX ROWS */
        /*************************************************/
        thrust::device_vector<float> d_matrix3(d_matrix);

        thrust::transform(thrust::make_permutation_iterator(
                              d_matrix.begin(),
                              thrust::make_transform_iterator(thrust::make_counting_iterator(0), (_1 % Nrows) * Ncols + _1 / Nrows)),
                          thrust::make_permutation_iterator(
                              d_matrix.begin(),
                              thrust::make_transform_iterator(thrust::make_counting_iterator(0), (_1 % Nrows) * Ncols + _1 / Nrows)) + Nrows * Ncols,
                          thrust::make_permutation_iterator(
                              d_row.begin(),
                              thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows))),
                          thrust::make_permutation_iterator(
                              d_matrix3.begin(),
                              thrust::make_transform_iterator(thrust::make_counting_iterator(0), (_1 % Nrows) * Ncols + _1 / Nrows)),
                          thrust::plus<float>());

        printf("\n\nRow + Matrix -> Result matrix\n");
        for (int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for (int j = 0; j < Ncols; j++)
                std::cout << d_matrix3[i * Ncols + j] << " ";
            std::cout << "]\n";
        }

        return 0;
    }

