
I am quite new to CUDA programming, and I wanted to try an implementation of matrix multiplication using parallel reduction. I came up with the code below and would like clarification on:

  1. Why the code is returning incorrect results.
  2. Why it is taking much longer to run than the method using shared memory described in Chapter 3, page 25 of the CUDA C Programming Guide.

For reference, I am running it on an NVIDIA GeForce GTX 675M, which has compute capability 2.1.

#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include "cuda_runtime.h"
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>

#define BLOCK_OPT 1024

typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;

__global__ void MatMulOPT(Matrix A, Matrix B, Matrix C)
{
    __shared__ float result[BLOCK_OPT];
    int e = threadIdx.x;
    int row = blockIdx.x;
    int col = blockIdx.y;
    result[e] = A.elements[row * A.stride + e] * B.elements[e * B.stride + col];
    __syncthreads();

    if (e < 512) { result[e] += result[e + 512]; }
    __syncthreads();
    if (e < 256) { result[e] += result[e + 256]; }
    __syncthreads();
    if (e < 128) { result[e] += result[e + 128]; }
    __syncthreads();
    if (e < 64) { result[e] += result[e + 64]; }
    __syncthreads();
    if (e < 32)
    {
        result[e] += result[e + 32];
        result[e] += result[e + 16];
        result[e] += result[e + 8];
        result[e] += result[e + 4];
        result[e] += result[e + 2];
        result[e] += result[e + 1];
    }

    if (e == 0) C.elements[row * C.stride + col] = result[0];
}

void MatMulCPU(Matrix A, Matrix B, Matrix C)
{
    for (int i = 0; i < A.height; i++)
        for (int j = 0; j < B.width; j++)
            for (int k = 0; k < B.height; k++)
                C.elements[i * C.stride + j] += A.elements[i * A.stride + k] * B.elements[k * B.stride + j];
}

float randomFloat()
{
    return (float)rand() / (float)RAND_MAX;
}

int main()
{
    clock_t begin, end;
    srand(time(NULL));

    //Common Setup
    float cpu_t = 0, gpu_t = 0;
    int x = 1024;
    int N = x * x;
    size_t size = N * sizeof(float);

    Matrix A;
    A.width = x; A.stride = x; A.height = x;
    A.elements = (float*)malloc(size);
    for (int i = 0; i < N; i++)
        A.elements[i] = randomFloat();

    Matrix B;
    B.width = x; B.stride = x; B.height = x;
    B.elements = (float*)malloc(size);
    for (int j = 0; j < N; j++)
        B.elements[j] = randomFloat();

    Matrix C;
    C.width = x; C.stride = x; C.height = x;
    C.elements = (float*)malloc(size);
    for (int k = 0; k < N; k++)
        C.elements[k] = 0;

    Matrix D;
    D.width = x; D.stride = x; D.height = x;
    D.elements = (float*)malloc(size);
    for (int l = 0; l < N; l++)
        D.elements[l] = 0;

    //Execute CPU code & time it
    begin = clock();
    MatMulCPU(A, B, D);
    end = clock();
    cpu_t = (float)end - begin;

    //GPU setup
    Matrix d_A, d_B, d_C;
    d_A.width = x; d_A.stride = x; d_A.height = x;
    d_B.width = x; d_B.stride = x; d_B.height = x;
    d_C.width = x; d_C.stride = x; d_C.height = x;

    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_C.elements, size);
    cudaMemcpy(d_C.elements, C.elements, size, cudaMemcpyHostToDevice);

    //Special Parameters
    int optBlockSize = BLOCK_OPT;
    dim3 optDimGrid(x, x);

    //Execute GPU Kernel and time it
    begin = clock();
    cudaThreadSynchronize();
    MatMulOPT<<<optDimGrid, optBlockSize>>>(d_A, d_B, d_C);
    cudaThreadSynchronize();
    end = clock();
    gpu_t = (float)end - begin;

    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

    //Do a memcheck
    bool passed = true;
    for (int k = 0; k < N; k++)
    {
        //printf("%f ", C.elements[k]);
        if (fabs(C.elements[k] - D.elements[k]) > 1e-6)
        {
            passed = false;
            printf("\nFAIL\n");
            printf("C[%d] = %f, D[%d] = %f\n", k, C.elements[k], k, D.elements[k]);
            break;
        }
    }
    printf("\n");
    if (passed)
        printf("PASSED\n");

    printf("CPU Elapsed Time: %f\n", cpu_t);
    printf("GPU Elapsed Time: %f\n", gpu_t);

    //Clear all GPU memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);

    //Clear all CPU memory
    free(A.elements);
    free(B.elements);
    free(C.elements);
}

1 Answer

  1. Why the code is returning incorrect results.

In the last step of your reduction (e < 32) you break with the pattern of the previous steps. This leads to race conditions on the same element of result, e.g.

result[e] += result[e + 16]; 

For e == 0 this statement reads result[16], while at the same time, in the same step, the thread with e == 16 writes result[16].

To avoid the race condition you have two options:

  • Use a pointer declared volatile, like in the document you linked (p. 78); a sketch of this fix follows the list below.
  • Continue with the if (e < ...) and __syncthreads() pattern as before, or transform all the ifs into a loop:

    for (int size = blockDim.x / 2; size > 0; size /= 2)
    {
        if (e < size)
            result[e] += result[e + size];
        __syncthreads();
    }
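A minimal sketch of the first option, mirroring the volatile technique from the reduction document (the pointer name vres is mine, not from your code or the document):

    // Warp-synchronous tail using a volatile pointer. volatile forces every
    // read and write to go through shared memory instead of being cached in
    // a register, so each line sees the result written by the line before it.
    if (e < 32)
    {
        volatile float *vres = result;
        vres[e] += vres[e + 32];
        vres[e] += vres[e + 16];
        vres[e] += vres[e + 8];
        vres[e] += vres[e + 4];
        vres[e] += vres[e + 2];
        vres[e] += vres[e + 1];
    }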
  2. Why it is taking much longer to run than the method using shared memory described in Chapter 3, page 25 of the CUDA C Programming Guide.

Accessing shared memory is much faster than accessing global memory. You are storing only your intermediate result in shared memory, whereas the example you reference stages the parts of the matrices to be read there. In the example this is combined with loop tiling: every thread loads just one element of each tile from global memory, but later reads TILE_WIDTH * 2 elements from shared memory.

The example's higher performance comes directly from spending less time waiting for data to arrive from global memory.
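To make the contrast concrete, here is a minimal sketch of such a tiled kernel, reusing the Matrix struct from your code (assumptions: square matrices with dimensions divisible by TILE_WIDTH; the name MatMulTiled and the 16-wide tile are my choices, not the Guide's verbatim code):

    #define TILE_WIDTH 16

    // Each thread computes one element of C. Per tile iteration it performs
    // exactly two global-memory loads (one element of each input tile), while
    // the 2 * TILE_WIDTH reads of the inner product hit fast shared memory.
    __global__ void MatMulTiled(Matrix A, Matrix B, Matrix C)
    {
        __shared__ float As[TILE_WIDTH][TILE_WIDTH];
        __shared__ float Bs[TILE_WIDTH][TILE_WIDTH];

        int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
        int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
        float sum = 0.0f;

        for (int t = 0; t < A.width / TILE_WIDTH; t++)
        {
            // Stage one tile of A and one tile of B in shared memory.
            As[threadIdx.y][threadIdx.x] = A.elements[row * A.stride + t * TILE_WIDTH + threadIdx.x];
            Bs[threadIdx.y][threadIdx.x] = B.elements[(t * TILE_WIDTH + threadIdx.y) * B.stride + col];
            __syncthreads();

            // All reads in the inner product now come from shared memory.
            for (int k = 0; k < TILE_WIDTH; k++)
                sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
            __syncthreads();
        }

        C.elements[row * C.stride + col] = sum;
    }

A launch shaped like dim3 grid(x / TILE_WIDTH, x / TILE_WIDTH), block(TILE_WIDTH, TILE_WIDTH) would pair with this, so a 1024x1024 multiply uses 64x64 blocks of 256 threads instead of one 1024-thread block per output element.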


2 Comments

In the parallel reduction reference, they stated that, once you reach the last 32 threads, they behave as if they all belong to the same warp, and there is no need for synchronization. I am using almost the same code for reduction.
I have updated the answer; the difference between your code and the one in the Guide is the volatile pointer.
