I'm trying to do differential evolution on CUDA, but the problem is that kernel which is responsible for "Mutation, Crossover, Evaluation, Selection" never gets launched.

Any help?

Here's the entire code:

 	#include <iostream>
	#include <curand_kernel.h>
	using namespace std;

	/**** ERROR HANDLING ****/
	static void HandleError(cudaError_t err,const char *file, int line )
	{
		if (err != cudaSuccess) {
			printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
					file, line );
			system("pause");
			exit( EXIT_FAILURE );
		}
	}
	#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


	/**** HOST AND DEVICE CONSTANTS****/
	const int hNP=100, hD=31, hN=10;
	__constant__ int NP, D, N;
	__constant__ float Cr, F;


	/*** EVAL FUNCTION******/
	__device__ float lennardJones(float a[3], float b[3]) {

		float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
							 + (a[1] - b[1]) * (a[1] - b[1])
							 + (a[2] - b[2]) * (a[2] - b[2]));
		float distance6 = distance * distance * distance
						 * distance * distance * distance;
		float distance12 = distance6 * distance6;
		return 1/distance12 - 2/distance6;
	}

	/**** RANDOM GENERATORS***/
	__device__ float rndFloat(curandState* globalState, int id) 
	{
		curandState localState = globalState[id];
		float RANDOM = curand_uniform(&localState);
		globalState[id] = localState; 
		return RANDOM;
	}
	__device__ int rndInt(curandState* globalState, int id, int max) 
	{
		curandState localState = globalState[id];
		float RANDOM = curand_uniform(&localState);
		globalState[id] = localState; 
		return RANDOM*max;
	}
	__device__ float rndFloat(curandState* globalState, int id, int max) 
	{
		curandState localState = globalState[id];
		float RANDOM = curand_uniform(&localState);
		globalState[id] = localState; 
		return RANDOM*max;
	}
	__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
	{
		curandState localState = globalState[id];
		float RANDOM = curand_uniform(&localState);
		globalState[id] = localState; 
		return min+RANDOM*(max-min);
	}
	/*** SEEDS ****/
	__global__ void setup_kernel (curandState * state, unsigned long seed)
	{
		int id= threadIdx.x+blockIdx.x*blockDim.x;
		if(id < NP)
			curand_init(seed, id, 0,&state[id]);
	} 

	/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
	__global__ void kernelE(curandState* globalState, float *population)
	{
		int id= threadIdx.x+blockIdx.x*blockDim.x;
		if(id < NP)
		{
			//init, just populating array with some specific numbers
			population[D*id]=0;
			population[D*id+N]=0;
			population[D*id +2*N]=0;
			
			population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
			population[D*id+N+1]=0;
			population[D*id +2*N+1]=0;

			for(int i=2; i<N; i++){
				float min= -4 - 1/4*abs((int)((i-4)/3));
				float max= 4 + 1/4*abs((int)((i-4)/3));
				if(i==2)
				{
					population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
					population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
					population[D*id +2*N+2]=0;
				}
				else
				{
					population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
					population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
					population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
				}
			}

			//eval
			float e=0;
			for(int i=0; i<N; i++)
			{
				for(int j=0; j<i; j++)
				{
					float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
					e += lennardJones(a,b);
				}
			}
			population[D*id + D-1]=e;
		}
	}
	/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
	__global__ void kernelP(curandState* globalState, int *mutation)
	{
		int id= threadIdx.x+blockIdx.x*blockDim.x;
		if(id<NP)
		{
			int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
			while(a == id){a = rndInt(globalState, id, NP);}
			while(b == a && b==id){b=rndInt(globalState, id, NP);}
			while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
			mutation[3*id+0]=a;
			mutation[3*id+1]=b;
			mutation[3*id+2]=c;
		}
	}
	/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
	__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
	{
		int id= threadIdx.x+blockIdx.x*blockDim.x;	
		if(id<NP)
		{
			int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];

			//DE mutation and crossover
			int j=rndInt(globalState, id, NP);
			for(int i=0; i<D-1; i++)
			{
				//DE mutation
				pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
				//DE crossover
				if(Cr > rndFloat(globalState, id) && i!= j)
					pop[D*id+i]=population[D*id +i];
			}
			// Eval
			pop[D*id+D-1]=0;
			for(int i=0; i<N; i++)
			{
				for(int j=0; j<i; j++)
				{
					float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
					pop[D*id+D-1] += lennardJones(a,b);
				}
			}
			
			__syncthreads();
			//DE selection
			if(pop[D*id+D-1] < population[D*id +D-1])
			{
				for(int i=0; i<D; i++)
					population[D*id +i]=pop[D*id+i];
			}
		}
	}

	void getBestScore(float *hpopulation)
	{
		int max=0;
		for(int i=1; i<hNP; i++)
		{
			if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
				max=i;
		}
		for(int j=0; j<hN; j++)
			cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
		cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
	}

	int main()
	{
		cudaEvent_t start,stop;
		HANDLE_ERROR(cudaEventCreate(&start));
		HANDLE_ERROR(cudaEventCreate(&stop)); 
		HANDLE_ERROR(cudaEventRecord(start,0));

		int device, st=12;
		float hCr=0.6, hF=0.8;
		cudaDeviceProp prop;
		HANDLE_ERROR(cudaGetDevice(&device));
		HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
		int SN = prop.maxThreadsPerBlock; //512 threads per block
		int SB = (hNP+(SN-1))/SN;
		

		//constants NP, D, N, Cr, F
		HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
		HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
		HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
		HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
		HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
		
		//seeds
		curandState* devStates;
		HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
		setup_kernel <<< 1, hNP>>> (devStates, 50);

		//population
		float *population, *pop;
		float hpopulation[hNP*hD];
		HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
		HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
		
		//mutation
		int *mutation, *mutation1;
		int *hmutation;
		HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
		HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
		HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
		
		//stream
		cudaStream_t stream_i, stream_j;
		HANDLE_ERROR(cudaStreamCreate(&stream_i));
		HANDLE_ERROR(cudaStreamCreate(&stream_j));
		
		kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
		kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);

		while(st != 0)
		{
			/*** COPYING MUTATION INDICES***/
			HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
			HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
			
			/**** CALLING KERNELS****/
			kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
			kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
			st--;
			
			//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
			//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
			//getBestScore(hpopulation);
			//cin.get();
		}
		
		HANDLE_ERROR(cudaStreamSynchronize(stream_i));
		HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
		getBestScore(hpopulation);

		cudaEventRecord(stop,0);
		cudaEventSynchronize(stop);
		float time;
		HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
		cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

		HANDLE_ERROR(cudaEventDestroy(start));
		HANDLE_ERROR(cudaEventDestroy(stop));
		HANDLE_ERROR(cudaStreamDestroy(stream_i));
		HANDLE_ERROR(cudaStreamDestroy(stream_j));
		HANDLE_ERROR(cudaFree(population));
		HANDLE_ERROR(cudaFree(pop));
		HANDLE_ERROR(cudaFreeHost(hmutation));
		HANDLE_ERROR(cudaFree(mutation1));
		HANDLE_ERROR(cudaFree(devStates));

		system("pause");
		return 0;
	}