I have two programs whose only difference is that one stores the input in constant memory while the other uses global memory. Both compute the dot product of two vectors. Why is the global memory version faster than the constant memory one?
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define intMin(a,b) ((a) < (b) ? (a) : (b))
// Threads per block
#define TPB 128
// Blocks per grid
#define BPG intMin(128, ((n + TPB - 1) / TPB))

const int n = 4;
__constant__ float deva[n], devb[n];

__global__ void addVal(float *c) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Use shared memory to temporarily store per-thread partial sums
    __shared__ float cache[TPB];
    float temp = 0;
    while (tid < n) {
        temp += deva[tid] * devb[tid];
        tid += gridDim.x * blockDim.x;
    }
    cache[threadIdx.x] = temp;
    __syncthreads();

    // Tree reduction within the block (TPB must be a power of two)
    int i = blockDim.x / 2;
    while (i != 0) {
        if (threadIdx.x < i) {
            cache[threadIdx.x] += cache[threadIdx.x + i];
        }
        __syncthreads();
        i /= 2;
    }
    if (threadIdx.x == 0) {
        c[blockIdx.x] = cache[0];
    }
}

int main() {
    float a[n], b[n], c[BPG];
    //float *deva, *devb, *devc;
    float *devc;
    int i;

    // Fill with test values
    for (i = 0; i < n; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    //cudaMalloc((void**)&deva, n * sizeof(float));
    //cudaMalloc((void**)&devb, n * sizeof(float));
    cudaMalloc((void**)&devc, BPG * sizeof(float));

    //cudaMemcpy(deva, a, n * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(devb, b, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(deva, a, n * sizeof(float));
    cudaMemcpyToSymbol(devb, b, n * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // Launch the dot-product kernel
    addVal<<<BPG, TPB>>>(devc);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    printf("The elapsed time is: %f\n", time);

    // Copy the per-block partial sums back and finish the reduction on the host
    cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
    float sum = 0;
    for (i = 0; i < BPG; i++) {
        sum += c[i];
    }

    // Display the answer
    printf("%f\n", sum);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(devc);
    getchar();
    return 0;
}

Below is the global memory version.
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define intMin(a,b) ((a) < (b) ? (a) : (b))
// Threads per block
#define TPB 128
// Blocks per grid
#define BPG intMin(128, ((n + TPB - 1) / TPB))

const int n = 4;

__global__ void addVal(float *a, float *b, float *c) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Use shared memory to temporarily store per-thread partial sums
    __shared__ float cache[TPB];
    float temp = 0;
    while (tid < n) {
        temp += a[tid] * b[tid];
        tid += gridDim.x * blockDim.x;
    }
    cache[threadIdx.x] = temp;
    __syncthreads();

    // Tree reduction within the block (TPB must be a power of two)
    int i = blockDim.x / 2;
    while (i != 0) {
        if (threadIdx.x < i) {
            cache[threadIdx.x] += cache[threadIdx.x + i];
        }
        __syncthreads();
        i /= 2;
    }
    if (threadIdx.x == 0) {
        c[blockIdx.x] = cache[0];
    }
}

int main() {
    float a[n], b[n], c[BPG];
    float *deva, *devb, *devc;
    int i;

    // Fill with test values
    for (i = 0; i < n; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    printf("Not using constant memory\n");

    cudaMalloc((void**)&deva, n * sizeof(float));
    cudaMalloc((void**)&devb, n * sizeof(float));
    cudaMalloc((void**)&devc, BPG * sizeof(float));

    cudaMemcpy(deva, a, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(devb, b, n * sizeof(float), cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // Launch the dot-product kernel
    addVal<<<BPG, TPB>>>(deva, devb, devc);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    printf("The elapsed time is: %f\n", time);

    // Copy the per-block partial sums back and finish the reduction on the host
    cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
    float sum = 0;
    for (i = 0; i < BPG; i++) {
        sum += c[i];
    }

    // Display the answer
    printf("%f\n", sum);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(deva);
    cudaFree(devb);
    cudaFree(devc);
    getchar();
    return 0;
}
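For context on what I expected: as far as I understand, the constant cache is optimized for the broadcast case, where all threads of a warp read the same address in the same instruction, whereas in my kernel each thread reads a different element (deva[tid]). Here is a sketch of the two access patterns as I understand them (the kernel names are my own, purely for illustration; this reuses the deva/devb constant arrays and n from the first listing):

// Illustration only; kernel names are hypothetical.
// Broadcast-friendly: every thread in the warp reads the SAME constant
// address, which the constant cache can serve to the whole warp at once.
__global__ void scaleByConst(float *out, int len) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < len) out[tid] = out[tid] * deva[0];
}

// What my addVal kernel does: each thread reads a DIFFERENT constant
// address in the same instruction (assumes len <= n so reads stay in bounds).
__global__ void perThreadConst(float *out, int len) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < len) out[tid] = deva[tid] * devb[tid];
}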
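One thing I'm not sure about in my measurement: the timed region is a single launch over only n = 4 elements, so one-time launch overhead could dominate. Below is a minimal sketch of how I think the timing could be made more robust (shown against the constant-memory kernel; NITER is just an illustrative count I picked): warm up once, average over several launches, and check for launch errors.

// Sketch only: assumes the constant-memory program above (addVal, devc,
// BPG, TPB already set up). NITER is an arbitrary illustrative count.
#define NITER 100

addVal<<<BPG, TPB>>>(devc);      // warm-up launch; the first launch in a
cudaDeviceSynchronize();         // process pays one-time initialization costs

cudaEvent_t start2, stop2;
cudaEventCreate(&start2);
cudaEventCreate(&stop2);
cudaEventRecord(start2, 0);
for (int iter = 0; iter < NITER; iter++) {
    addVal<<<BPG, TPB>>>(devc);  // timed launches
}
cudaEventRecord(stop2, 0);
cudaEventSynchronize(stop2);

float ms;
cudaEventElapsedTime(&ms, start2, stop2);
printf("Average time per launch: %f ms\n", ms / NITER);

// Surface any launch errors instead of silently timing a failed kernel
printf("CUDA status: %s\n", cudaGetErrorString(cudaGetLastError()));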