
I have two programs whose only difference is that one uses constant memory to store the input while the other uses global memory. I want to know why the global-memory one is faster than the constant-memory one. They both compute the dot product of two vectors, by the way.

#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))

const int n = 4;
__constant__ float deva[n], devb[n];

__global__ void addVal(float *c){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    //Using shared memory to temporary store results
    __shared__ float cache[TPB];
    float temp = 0;
    while(tid < n){
        temp += deva[tid] * devb[tid];
        tid += gridDim.x * blockDim.x;
    }
    cache[threadIdx.x] = temp;
    __syncthreads();
    int i = blockDim.x/2;
    while(i != 0){
        if(threadIdx.x < i){
            cache[threadIdx.x] = cache[threadIdx.x] + cache[threadIdx.x + i];
        }
        __syncthreads();
        i = i/2;
    }
    if(threadIdx.x == 1){
        c[blockIdx.x] = cache[0];
    }
}

int main(){
    float a[n], b[n], c[BPG];
    //float *deva, *devb, *devc;
    float *devc;
    int i;

    //Filling with random values to test
    for(i = 0; i < n; i++){
        a[i] = i;
        b[i] = i*2;
    }

    //cudaMalloc((void**)&deva, n * sizeof(float));
    //cudaMalloc((void**)&devb, n * sizeof(float));
    cudaMalloc((void**)&devc, BPG * sizeof(float));
    //cudaMemcpy(deva, a, n * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(devb, b, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(deva, a, n * sizeof(float));
    cudaMemcpyToSymbol(devb, b, n * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    //Call function to do dot product
    addVal<<<BPG, TPB>>>(devc);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    printf("The elapsed time is: %f\n", time);

    //copy result back
    cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
    float sum = 0;
    for(i = 0; i < BPG; i++){
        sum += c[i];
    }

    //display answer
    printf("%f\n", sum);
    getchar();
    return 0;
}

Below is the global memory version.

#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))

const int n = 4;

__global__ void addVal(float *a, float *b, float *c){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    //Using shared memory to temporary store results
    __shared__ float cache[TPB];
    float temp = 0;
    while(tid < n){
        temp += a[tid] * b[tid];
        tid += gridDim.x * blockDim.x;
    }
    cache[threadIdx.x] = temp;
    __syncthreads();
    int i = blockDim.x/2;
    while(i != 0){
        if(threadIdx.x < i){
            cache[threadIdx.x] = cache[threadIdx.x] + cache[threadIdx.x + i];
        }
        __syncthreads();
        i = i/2;
    }
    if(threadIdx.x == 1){
        c[blockIdx.x] = cache[0];
    }
}

int main(){
    float a[n], b[n], c[BPG];
    float *deva, *devb, *devc;
    int i;

    //Filling with random values to test
    for(i = 0; i < n; i++){
        a[i] = i;
        b[i] = i*2;
    }

    printf("Not using constant memory\n");
    cudaMalloc((void**)&deva, n * sizeof(float));
    cudaMalloc((void**)&devb, n * sizeof(float));
    cudaMalloc((void**)&devc, BPG * sizeof(float));
    cudaMemcpy(deva, a, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(devb, b, n * sizeof(float), cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    //Call function to do dot product
    addVal<<<BPG, TPB>>>(deva, devb, devc);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    printf("The elapsed time is: %f\n", time);

    //copy result back
    cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
    float sum = 0;
    for(i = 0; i < BPG; i++){
        sum += c[i];
    }

    //display answer
    printf("%f\n", sum);
    getchar();
    return 0;
}
  • To me it seems that the two versions you posted are exactly identical, so you might want to check that you actually posted both versions. If I'm wrong about them being identical, it would be very helpful if you could highlight where exactly the differences lie, to make them easier to find. Furthermore, the different generations of CUDA devices vary wildly in their performance characteristics, so it might be helpful if you told us on which device you are experiencing this behaviour ("might", since I don't remember the details of CUDA constant memory, I'm not sure). Commented Jul 9, 2011 at 22:40
  • Thanks for pointing that out. I just edited the code. Commented Jul 9, 2011 at 22:51

1 Answer


You are not taking advantage of constant memory.

  • A single read from constant memory can be broadcast to a half-warp (not your case, as every thread loads from its own tid).
  • Constant memory is cached (which doesn't help in your case, as you read each position of the constant-memory array only once).

As each thread in a half-warp performs a single read of different data, the 16 different reads get serialized, taking 16 times as long to place the request.

If the threads read from global memory instead, the requests are issued at the same time and coalesced. That's why your global-memory example is faster than the constant-memory one.
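To make the contrast concrete, here is a minimal sketch of an access pattern that does benefit from constant memory (this kernel and the coef array are illustrative, not from your code): every thread reads the same constant element on each loop iteration, so a single fetch is broadcast to the whole half-warp instead of being serialized.

// Illustrative only: evaluates a polynomial with Horner's rule.
// All threads read coef[k] for the SAME k at the same time, so the
// constant-memory read is broadcast to the (half-)warp.
#define NCOEF 8
__constant__ float coef[NCOEF]; // filled from the host with cudaMemcpyToSymbol

__global__ void polyEval(const float *x, float *y, int len){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid < len){
        float xv = x[tid];
        float acc = 0.0f;
        for(int k = NCOEF - 1; k >= 0; --k){
            acc = acc * xv + coef[k]; // uniform index -> broadcast read
        }
        y[tid] = acc;
    }
}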

Of course, this conclusion can vary on devices of compute capability 2.x, which have L1 and L2 caches.

Regards!


1 Comment

For compute capability 2.0 (sm_20) GPUs and later, replace "half-warp" above with "warp", and "16" with "32".
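As an aside (not part of the original comment): rather than hard-coding 16 or 32, you can query the warp size and compute capability of your device at runtime, for example:

#include <cuda_runtime.h>
#include <stdio.h>

int main(){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // properties of device 0
    printf("Compute capability %d.%d, warp size %d\n",
           prop.major, prop.minor, prop.warpSize);
    return 0;
}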
