Reducing the rows of a matrix can be accomplished with CUDA Thrust in three ways (they may not be the only ones, but addressing that point is beyond the scope of this answer). As the OP also recognized, CUDA Thrust is preferable for this kind of problem. An approach using cuBLAS is possible as well.
APPROACH #1 - reduce_by_key
This is the approach suggested at this Thrust example page. It includes a variant using make_discard_iterator.
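For readers who want the core call in isolation, here is a minimal self-contained sketch of the idea (my distillation, with timing, random initialization and error checking stripped out): a transform_iterator over a counting_iterator generates the row index of each element on the fly, so all elements of a row share the same key and reduce_by_key sums them; the keys output is dropped via make_discard_iterator.

    #include <thrust/device_vector.h>
    #include <thrust/reduce.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <iostream>

    // --- Functor turning a linear (row-major) index into its row index
    struct linear_index_to_row_index {
        int Ncols;
        __host__ __device__ linear_index_to_row_index(int Ncols) : Ncols(Ncols) {}
        __host__ __device__ int operator()(int i) const { return i / Ncols; }
    };

    int main() {
        const int Nrows = 2, Ncols = 3;
        const float h[Nrows * Ncols] = {1, 2, 3, 4, 5, 6};        // --- Row-major matrix
        thrust::device_vector<float> d_matrix(h, h + Nrows * Ncols);
        thrust::device_vector<float> d_row_sums(Nrows);

        // --- Elements of the same row share the same key, so reduce_by_key
        //     collapses each row to its sum; the keys output is discarded.
        thrust::reduce_by_key(
            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index(Ncols)),
            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index(Ncols)) + Nrows * Ncols,
            d_matrix.begin(),
            thrust::make_discard_iterator(),
            d_row_sums.begin());

        for (int i = 0; i < Nrows; i++) std::cout << d_row_sums[i] << std::endl;   // --- 6 15
        return 0;
    }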
APPROACH #2 - transform
This is the approach suggested by Robert Crovella at CUDA Thrust: reduce_by_key on only some values in an array, based off values in a “key” array.
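A minimal sketch of this idea follows (my simplification: the matrix pointer is passed through the functor rather than through the __device__ symbol used in the full code below): thrust::transform effectively assigns one thread per row, and each thread serially accumulates the Ncols elements of its row.

    #include <thrust/device_vector.h>
    #include <thrust/transform.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <iostream>

    // --- One thread per row: the functor receives a row index and
    //     serially accumulates that row's Ncols elements.
    struct row_reduction {
        const float *vals;      // --- Raw pointer to the row-major matrix data
        const int Ncols;
        row_reduction(const float *vals, int Ncols) : vals(vals), Ncols(Ncols) {}
        __device__ float operator()(const int y) const {
            float temp = 0.f;
            for (int i = 0; i < Ncols; i++) temp += vals[i + y * Ncols];
            return temp;
        }
    };

    int main() {
        const int Nrows = 2, Ncols = 3;
        const float h[Nrows * Ncols] = {1, 2, 3, 4, 5, 6};        // --- Row-major matrix
        thrust::device_vector<float> d_matrix(h, h + Nrows * Ncols);
        thrust::device_vector<float> d_row_sums(Nrows);

        thrust::transform(thrust::counting_iterator<int>(0),
                          thrust::counting_iterator<int>(Nrows),
                          d_row_sums.begin(),
                          row_reduction(thrust::raw_pointer_cast(d_matrix.data()), Ncols));

        for (int i = 0; i < Nrows; i++) std::cout << d_row_sums[i] << std::endl;   // --- 6 15
        return 0;
    }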
APPROACH #3 - inclusive_scan_by_key
This is the approach suggested by Eric at How to normalize matrix columns in CUDA with max performance?.
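Again a minimal sketch (my distillation, with the functors named as in the full code below): a segmented inclusive scan leaves each row's total in the row's last position, and a permutation_iterator whose index map has stride Ncols gathers exactly those positions.

    #include <thrust/device_vector.h>
    #include <thrust/scan.h>
    #include <thrust/copy.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <iostream>

    // --- Functor turning a linear (row-major) index into its row index
    struct linear_index_to_row_index {
        int Ncols;
        __host__ __device__ linear_index_to_row_index(int Ncols) : Ncols(Ncols) {}
        __host__ __device__ int operator()(int i) const { return i / Ncols; }
    };

    // --- Functor multiplying its argument by a constant (used as a strided index map)
    struct MulC {
        int C;
        __host__ __device__ MulC(int c) : C(c) {}
        __host__ __device__ int operator()(int x) const { return x * C; }
    };

    int main() {
        const int Nrows = 2, Ncols = 3;
        const float h[Nrows * Ncols] = {1, 2, 3, 4, 5, 6};        // --- Row-major matrix
        thrust::device_vector<float> d_matrix(h, h + Nrows * Ncols);
        thrust::device_vector<float> d_temp(Nrows * Ncols);
        thrust::device_vector<float> d_row_sums(Nrows);

        // --- Segmented prefix sum: within each row, element j holds the sum of
        //     elements 0..j, so the last element of each row is the row sum.
        thrust::inclusive_scan_by_key(
            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index(Ncols)),
            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index(Ncols)) + Nrows * Ncols,
            d_matrix.begin(), d_temp.begin());

        // --- Gather the row-final elements at positions (Ncols - 1) + i * Ncols
        thrust::copy(
            thrust::make_permutation_iterator(d_temp.begin() + Ncols - 1,
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC(Ncols))),
            thrust::make_permutation_iterator(d_temp.begin() + Ncols - 1,
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC(Ncols))) + Nrows,
            d_row_sums.begin());

        for (int i = 0; i < Nrows; i++) std::cout << d_row_sums[i] << std::endl;   // --- 6 15
        return 0;
    }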
APPROACH #4 - cublas<t>gemv
It uses cuBLAS gemv to multiply the relevant matrix by a column vector of ones.
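Since cuBLAS assumes column-major storage, a row-major Nrows x Ncols matrix is read by cuBLAS as its column-major Ncols x Nrows transpose; CUBLAS_OP_T therefore recovers the original matrix, and gemv against a vector of ones yields the row sums. A minimal sketch (my distillation; error checking omitted for brevity):

    #include <cublas_v2.h>
    #include <thrust/device_vector.h>
    #include <iostream>

    int main() {
        const int Nrows = 2, Ncols = 3;
        const float h[Nrows * Ncols] = {1, 2, 3, 4, 5, 6};        // --- Row-major matrix
        thrust::device_vector<float> d_matrix(h, h + Nrows * Ncols);
        thrust::device_vector<float> d_ones(Ncols, 1.f);
        thrust::device_vector<float> d_row_sums(Nrows);

        cublasHandle_t handle;
        cublasCreate(&handle);

        // --- cuBLAS sees the row-major Nrows x Ncols matrix as a column-major
        //     Ncols x Nrows matrix A = M^T, so y = A^T * ones = M * ones gives
        //     the row sums of M.
        float alpha = 1.f, beta = 0.f;
        cublasSgemv(handle, CUBLAS_OP_T, Ncols, Nrows, &alpha,
                    thrust::raw_pointer_cast(d_matrix.data()), Ncols,
                    thrust::raw_pointer_cast(d_ones.data()), 1, &beta,
                    thrust::raw_pointer_cast(d_row_sums.data()), 1);

        for (int i = 0; i < Nrows; i++) std::cout << d_row_sums[i] << std::endl;   // --- 6 15
        cublasDestroy(handle);
        return 0;
    }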
THE FULL CODE
Here is the code condensing the four approaches. The Utilities.cu and Utilities.cuh files are maintained here and omitted from this listing. The TimingGPU.cu and TimingGPU.cuh files are likewise maintained here and omitted as well.
    #include <cublas_v2.h>

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/generate.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    #include <thrust/random.h>
    #include <thrust/sequence.h>

    #include <stdio.h>
    #include <iostream>

    #include "Utilities.cuh"
    #include "TimingGPU.cuh"

    // --- Required for approach #2
    __device__ float *vals;

    /**************************************************************/
    /* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
    /**************************************************************/
    template <typename T>
    struct linear_index_to_row_index : public thrust::unary_function<T,T> {

        T Ncols; // --- Number of columns

        __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

        __host__ __device__ T operator()(T i) { return i / Ncols; }
    };

    /******************************************/
    /* ROW_REDUCTION - NEEDED FOR APPROACH #2 */
    /******************************************/
    struct row_reduction {

        const int Ncols;    // --- Number of columns

        row_reduction(int _Ncols) : Ncols(_Ncols) {}

        __device__ float operator()(float& x, int& y) {
            float temp = 0.f;
            for (int i = 0; i < Ncols; i++)
                temp += vals[i + (y * Ncols)];
            return temp;
        }
    };

    /**************************/
    /* NEEDED FOR APPROACH #3 */
    /**************************/
    template<typename T>
    struct MulC: public thrust::unary_function<T, T>
    {
        T C;
        __host__ __device__ MulC(T c) : C(c) { }
        __host__ __device__ T operator()(T x) { return x * C; }
    };

    /********/
    /* MAIN */
    /********/
    int main()
    {
        const int Nrows = 5;     // --- Number of rows
        const int Ncols = 8;     // --- Number of columns

        // --- Random uniform integer distribution between 10 and 99
        thrust::default_random_engine rng;
        thrust::uniform_int_distribution<int> dist(10, 99);

        // --- Matrix allocation and initialization
        thrust::device_vector<float> d_matrix(Nrows * Ncols);
        for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);

        TimingGPU timerGPU;

        /***************/
        /* APPROACH #1 */
        /***************/
        timerGPU.StartCounter();
        // --- Allocate space for row sums and indices
        thrust::device_vector<float> d_row_sums(Nrows);
        thrust::device_vector<int>   d_row_indices(Nrows);

        // --- Compute row sums by summing values with equal row indices
        //thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Ncols)),
        //                      thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Ncols)) + (Nrows*Ncols),
        //                      d_matrix.begin(),
        //                      d_row_indices.begin(),
        //                      d_row_sums.begin(),
        //                      thrust::equal_to<int>(),
        //                      thrust::plus<float>());

        thrust::reduce_by_key(
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols)),
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols)) + (Nrows*Ncols),
                    d_matrix.begin(),
                    thrust::make_discard_iterator(),
                    d_row_sums.begin());

        printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());

        // --- Print result
        for(int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for(int j = 0; j < Ncols; j++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_row_sums[i] << "\n";
        }

        /***************/
        /* APPROACH #2 */
        /***************/
        timerGPU.StartCounter();
        thrust::device_vector<float> d_row_sums_2(Nrows, 0);
        float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]);
        gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
        thrust::transform(d_row_sums_2.begin(), d_row_sums_2.end(), thrust::counting_iterator<int>(0), d_row_sums_2.begin(), row_reduction(Ncols));

        printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());

        for(int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for(int j = 0; j < Ncols; j++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_row_sums_2[i] << "\n";
        }

        /***************/
        /* APPROACH #3 */
        /***************/
        timerGPU.StartCounter();
        thrust::device_vector<float> d_row_sums_3(Nrows, 0);
        thrust::device_vector<float> d_temp(Nrows * Ncols);
        thrust::inclusive_scan_by_key(
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols)),
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols)) + (Nrows*Ncols),
                    d_matrix.begin(),
                    d_temp.begin());
        thrust::copy(
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Ncols - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Ncols))),
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Ncols - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Ncols))) + Nrows,
                    d_row_sums_3.begin());

        printf("Timing for approach #3 = %f\n", timerGPU.GetCounter());

        for(int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for(int j = 0; j < Ncols; j++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_row_sums_3[i] << "\n";
        }

        /***************/
        /* APPROACH #4 */
        /***************/
        cublasHandle_t handle;

        timerGPU.StartCounter();
        cublasSafeCall(cublasCreate(&handle));

        thrust::device_vector<float> d_row_sums_4(Nrows);
        thrust::device_vector<float> d_ones(Ncols, 1.f);

        float alpha = 1.f;
        float beta  = 0.f;
        cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols,
                                   thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_row_sums_4.data()), 1));

        printf("Timing for approach #4 = %f\n", timerGPU.GetCounter());

        for(int i = 0; i < Nrows; i++) {
            std::cout << "[ ";
            for(int j = 0; j < Ncols; j++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_row_sums_4[i] << "\n";
        }

        return 0;
    }
TIMING RESULTS (in ms, tested on a Kepler K20c)

    Matrix size    #1      #1-v2   #2      #3      #4      #4 (no plan)
    100  x 100     0.63    1.00    0.10    0.18    139.4   0.098
    1000 x 1000    1.25    1.12    3.25    1.04    101.3   0.12
    5000 x 5000    8.38    15.3    16.05   13.8    111.3   1.14
    100  x 5000    1.25    1.52    2.92    1.75    101.2   0.40
    5000 x 100     1.35    1.99    0.37    1.74    139.2   0.14
It seems that approaches #1 and #3 outperform approach #2, except for small numbers of columns. The best approach, however, is approach #4, which is significantly faster than the others, provided that the time needed to create the cuBLAS handle (the "plan" in the table above) can be amortized over the rest of the computation.