(Updated) I am trying to understand how CUDA parallelism works in CuFFT while learning CUDA coding.
I wrote my version of 1-D FFT in CUDA C++ and compared it with cuFFT. Below are the throughputs I got when I compared the two methods for a single block FFT tests.
1-D FFT Size: throughput-1 (time) in my version, throughput-2 (time) in CuFFT. Gf = GFlops (using cudaEventElapsedTime, GTX 1050, includes cufftPlan1d in cuFFT - see code below)
(1)2^16: 1.3 Gf(3.8 msec), 0.19 Gf(27 msec) (2)2^17: 1.6 Gf(6.7 msec), 0.5 Gf(23 msec) (3)2^18: 1.8 Gf(13 msec), 1.1 Gf(22 msec) (4)2^19: 1.8 Gf(27 msec), 2.1 Gf(23 msec) (5)2^20: 1.8 Gf(56 msec), 4.5 Gf(23 msec) (6)2^21: 1.8 Gf(121 msec), 9.6 Gf(22 msec) (7)2^23: 1.8 Gf(523 msec), 31 Gf (30 msec) (8)2^25: 1.8 Gf(2298 msec), 106 Gf(39 msec) I am wondering why the throughput from CuFFT increases almost linearly as the transform size? Since the throughput is computed by computational complexity of the transform divided by computation time, I thought it should stay almost constant as my version shows if everything remain same. What technique should I explore to improve my version behave like 1-D CuFFT (grid size strategy)? My code is included below ( I think the code works fine since FFT-IFFT shows the errors less than 1e-8 where the computation is double precision with the input vector data[i] = Complex(i.0, 0))
#include <iostream> #include <vector> #include <cuda_runtime.h> #include <thrust/complex.h> #include <cmath> #include <iomanip> // For std::fixed and std::setprecision #include <cufft.h> using Complex = thrust::complex<double>; #define USE_SHARED_MEM 0 // disabled: using shared memory didn't improve performance #define BLOCK_SIZE 1024 // #define BLOCK_SIZE 512 // #define BLOCK_SIZE 32 #if USE_SHARED_MEM #define SHARED_MEM_SIZE_FACTOR 4 // relative to BLOCK_SIZE #endif unsigned int mixedRadixReverse(unsigned int x, int numBits) { unsigned int rev = 0; int numRadix4Stages = numBits / 2; // Number of radix-4 stages int useRadix2 = numBits % 2; // If numBits is odd, we have an extra radix-2 stage // First, reverse the radix-2 bit if present (apply to LSB first) if (useRadix2) { rev = (rev << 1) | (x & 1); // Extract LSB for radix-2 x >>= 1; } // Then, reverse the radix-4 groups (higher bits) for (int i = 0; i < numRadix4Stages; i++) { rev = (rev << 2) | (x & 3); // Extract 2 bits at a time x >>= 2; } return rev; } __global__ void scale(Complex* d_data, float scale, int N) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < N) { d_data[idx] *= scale; } } __global__ void fft_radix2(Complex* data, int sizeFFT, int inFFTSize, int direction) { int k = blockIdx.x * blockDim.x + threadIdx.x; if (k >= sizeFFT / 2) return; int outFFTSize = 2*inFFTSize; float sign = direction == 1 ? -1.0: 1.0; Complex tw = Complex(cos(2*M_PI*k/outFFTSize), sign*sin(2*M_PI*k/outFFTSize)); Complex t = data[k + inFFTSize]*tw; Complex tmp = data[k]; data[k] = tmp + t; data[k + inFFTSize] = tmp - t; } __global__ void fft_radix4(Complex* data, int sizeFFT, int inFFTSize, int direction) { float sign = direction == 1 ? -1.0: 1.0; int outFFTSize = 4*inFFTSize; int k = blockIdx.x * blockDim.x + threadIdx.x; if (k >= sizeFFT / 4) return; // max number of threads is sizeFFT /4 for radix-4 stage // k = 0, 1, 2, ..., 7 when sizeFFT = 32 int k_offset_inFFTSize = k%inFFTSize; // k_inFFTSize= 0, 1 ..., 3 when inFFTSize = 4 // printf("k=%d: inFFTSize=%d, outFFTSize=%d\n",k, inFFTSize,outFFTSize); int blockOffset = (k/inFFTSize)*outFFTSize; // replaces outer loop int offset = blockOffset + k_offset_inFFTSize; // replaces inner loop // dragonfly inputs int idx0 = offset; int idx1 = offset + inFFTSize; int idx2 = offset + 2 * inFFTSize; int idx3 = offset + 3 * inFFTSize; Complex tw = Complex(cos(2.0*M_PI*k_offset_inFFTSize/outFFTSize), sign*sin(2.0*M_PI*k_offset_inFFTSize/outFFTSize)); Complex t0 = data[idx0]; Complex t1 = data[idx1]*tw; Complex t2 = data[idx2]*tw*tw; Complex t3 = data[idx3]*tw*tw*tw; Complex a = t0 + t2; Complex b = t1 + t3; Complex c = t0 - t2; Complex d = (t1 - t3) * Complex(0, 1.0*sign); // in-place computation data[idx0] = a + b; data[idx1] = c + d; data[idx2] = a - b; data[idx3] = c - d; } __global__ void fft_radix2_shared(Complex* data, int sizeFFT, int inFFTSize, int direction) { extern __shared__ Complex s_data[]; int k = blockIdx.x * blockDim.x + threadIdx.x; if (k >= sizeFFT / 2) return; int outFFTSize = 2 * inFFTSize; float sign = (direction == 1) ? -1.0 : 1.0; Complex tw = Complex(cos(2 * M_PI * k / outFFTSize), sign * sin(2 * M_PI * k / outFFTSize)); // Load data into shared memory int idx1 = k + inFFTSize; s_data[threadIdx.x] = data[k]; s_data[threadIdx.x + blockDim.x] = data[idx1]; __syncthreads(); // Perform FFT operation in shared memory Complex t = s_data[threadIdx.x + blockDim.x] * tw; Complex tmp = s_data[threadIdx.x]; s_data[threadIdx.x] = tmp + t; s_data[threadIdx.x + blockDim.x] = tmp - t; __syncthreads(); // Write back to global memory data[k] = s_data[threadIdx.x]; data[idx1] = s_data[threadIdx.x + blockDim.x]; } __global__ void fft_radix4_shared(Complex* data, int sizeFFT, int inFFTSize, int direction) { extern __shared__ Complex s_data[]; float sign = (direction == 1) ? -1.0 : 1.0; int outFFTSize = 4 * inFFTSize; int k = blockIdx.x * blockDim.x + threadIdx.x; if (k >= sizeFFT / 4) return; int k_offset_inFFTSize = k % inFFTSize; int blockOffset = (k / inFFTSize) * outFFTSize; int offset = blockOffset + k_offset_inFFTSize; // Indices for dragonfly inputs int idx0 = offset; int idx1 = offset + inFFTSize; int idx2 = offset + 2 * inFFTSize; int idx3 = offset + 3 * inFFTSize; // Load into shared memory s_data[threadIdx.x] = data[idx0]; s_data[threadIdx.x + blockDim.x] = data[idx1]; s_data[threadIdx.x + 2 * blockDim.x] = data[idx2]; s_data[threadIdx.x + 3 * blockDim.x] = data[idx3]; __syncthreads(); // Twiddle factors Complex tw = Complex(cos(2.0 * M_PI * k_offset_inFFTSize / outFFTSize), sign * sin(2.0 * M_PI * k_offset_inFFTSize / outFFTSize)); Complex t0 = s_data[threadIdx.x]; Complex t1 = s_data[threadIdx.x + blockDim.x] * tw; Complex t2 = s_data[threadIdx.x + 2 * blockDim.x] * tw * tw; Complex t3 = s_data[threadIdx.x + 3 * blockDim.x] * tw * tw * tw; // Compute FFT butterfly operations Complex a = t0 + t2; Complex b = t1 + t3; Complex c = t0 - t2; Complex d = (t1 - t3) * Complex(0, sign); // Store results back to shared memory s_data[threadIdx.x] = a + b; s_data[threadIdx.x + blockDim.x] = c + d; s_data[threadIdx.x + 2 * blockDim.x] = a - b; s_data[threadIdx.x + 3 * blockDim.x] = c - d; __syncthreads(); // Write results back to global memory data[idx0] = s_data[threadIdx.x]; data[idx1] = s_data[threadIdx.x + blockDim.x]; data[idx2] = s_data[threadIdx.x + 2 * blockDim.x]; data[idx3] = s_data[threadIdx.x + 3 * blockDim.x]; } void fft_1d(Complex* data, int sizeFFT, int direction) { //(1) Host memory allocation for radix reversal and DMA Complex * h_pinned_data; cudaMallocHost((void**)&h_pinned_data, sizeFFT*sizeof(Complex)); // radix-reverse in the host memory int numBits = __builtin_ctz(sizeFFT); #pragma omp parallel for // Parallel processing for CPU speedup for (int i = 0; i < sizeFFT; i++) { unsigned int revIdx = mixedRadixReverse(i, numBits); h_pinned_data[revIdx] = data[i]; } //(2) device memory allocaiton Complex* d_data; cudaError_t err; err = cudaMalloc((void**)&d_data, sizeFFT * sizeof(Complex)); if (err != cudaSuccess) { std::cerr << "cudaMalloc failed: " << std::endl; return; } // Asynchronous copy using pinned memory for performance boost cudaMemcpyAsync(d_data, h_pinned_data, sizeFFT * sizeof(Complex), cudaMemcpyHostToDevice, 0); // (3) plan // dim3 blockSize(256); dim3 blockSize(BLOCK_SIZE); dim3 gridSize((sizeFFT + blockSize.x - 1) / blockSize.x); int numStagesRadix4 = 0; int temp_sizeFFT = sizeFFT; while (temp_sizeFFT % 4 == 0) { temp_sizeFFT /= 4; numStagesRadix4++; } // std::cout << "number of radix4 stages = " << numStagesRadix4 << std::endl; #if USE_SHARED_MEM // shared memory allocation size_t sharedMemSize = blockSize.x * sizeof(Complex) *SHARED_MEM_SIZE_FACTOR;//256;//4; #endif // (4) Call radix computations int inFFTSize = 1; for (int i = 0; i < numStagesRadix4; ++i) { printf("radix-4 stage = %d\n", i); #if USE_SHARED_MEM fft_radix4_shared<<<gridSize, blockSize, sharedMemSize>>>(d_data, sizeFFT, inFFTSize, direction); #else fft_radix4<<<gridSize, blockSize>>>(d_data, sizeFFT, inFFTSize, direction); #endif err = cudaDeviceSynchronize(); if (err != cudaSuccess) { std::cerr << "fft_radix4 execution at stage " << i << " failed!" << std::endl; return; } inFFTSize *= 4; // display_device_array_cmplx(d_data, sizeFFT, "d_data ="); } if (temp_sizeFFT == 2) { std::cout << "final radix2 stage: inFFTSize = " << inFFTSize << std::endl; #if USE_SHARED_MEM fft_radix2_shared<<<gridSize, blockSize, sharedMemSize>>>(d_data, sizeFFT, inFFTSize, direction); #else fft_radix2<<<gridSize, blockSize>>>(d_data, sizeFFT, inFFTSize, direction); #endif err = cudaDeviceSynchronize(); if (err != cudaSuccess) { std::cerr << "fft_radix2 execution failed!" << std::endl; return; } } // (5) scale if necessary if (direction == -1) scale<<<gridSize, blockSize>>>(d_data, 1.0/(float) sizeFFT, sizeFFT); // send the result back to host err = cudaMemcpy(data, d_data, sizeFFT * sizeof(Complex), cudaMemcpyDeviceToHost); if (err != cudaSuccess) { std::cerr << "cudaMemcpyDeviceToHost failed!" << std::endl; return; } cudaFree(d_data); } int main() { cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); float milliseconds = 0; // define input data ================== // int sizeFFT = 1<< 28; //Not working: cudaMalloc fails // int sizeFFT = 1<< 27; // int sizeFFT = 1<< 26; // int sizeFFT = 1<< 25; // int sizeFFT = 1<< 23; // int sizeFFT = 1<< 21; // int sizeFFT = 1<< 20; // int sizeFFT = 1<< 19; // int sizeFFT = 1<< 18; // int sizeFFT = 1<< 17; int sizeFFT = 1<< 16; // test vector std::vector<Complex> data(sizeFFT, Complex(1.0, 0.0)); std::vector<Complex> data1(sizeFFT, Complex(0.0, 0.0)); for (int i = 0; i < sizeFFT; ++i) {data[i] = Complex(i,0);} data1 = data; cudaEventRecord(start); // =============> // FFT int direction = 1; // 1: forward, -1: reverse fft_1d(data.data(), sizeFFT, direction); cudaEventRecord(stop); // <=============== cudaEventSynchronize(stop); cudaEventElapsedTime(&milliseconds, start, stop); double N = sizeFFT; double gflops = (5.0 * N * log2(N)) / (milliseconds * 1.0e6); std::cout << "Throughput: " << gflops << " GFLOPS, time:" << milliseconds<< ", BLOCK_SIZE:" << BLOCK_SIZE << std::endl; /////////////////////////////////////////// // Comparison with cuFFT /////////////////////////////////////////// //(1) copy data cufftComplex* d_data; cudaError_t err; err = cudaMalloc((void**)&d_data, sizeFFT * sizeof(cufftComplex)); if (err != cudaSuccess) { std::cerr << "cudaMalloc failed: " << std::endl; return; } // Asynchronous copy using pinned memory for performance boost cudaMemcpyAsync(d_data, data.data(), sizeFFT * sizeof(cufftComplex), cudaMemcpyHostToDevice, 0); //(2) Create cuFFT plan cudaEventRecord(start); // =============> cufftHandle plan; cufftPlan1d(&plan, sizeFFT, CUFFT_C2C, 1); size_t workspace_size; cufftGetSize(plan, &workspace_size); std::cout << "Workspace size: " << workspace_size/1024 << " K bytes" << std::endl; //(3) Execute FFT cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD); cudaEventRecord(stop); // <=============== cufftDestroy(plan); cudaFree(d_data); cudaEventSynchronize(stop); cudaEventElapsedTime(&milliseconds, start, stop); N = sizeFFT; gflops = (5.0 * N * log2(N)) / (milliseconds * 1.0e6); std::cout << "Throughput (cuFFT): " << gflops << " GFLOPS, time:" << milliseconds << " msec" <<std::endl; // IFFT direction = -1; // inverse fft_1d(data.data(), sizeFFT, direction); double maxDiff = 0; for (int i = 0; i < sizeFFT; ++i) { thrust::complex<double> err = data[i] - data1[i]; double errSquare = err.real()*err.real() + err.imag()*err.imag() ; maxDiff = fmax(maxDiff, sqrt(errSquare)); } std::cout << "Max difference: " << maxDiff << std::endl; return 0; }
sin(pi/2^N)and2*(sin(pi/2^(N+1))^2. Traditionaly using the identitycos(x) = 1 - 2*sin(x/2)^2to preserve accuracy whenxis small.