CUDATestFunction = CUDAFunctionLoad[...,{"Float", 2, "Input""InputOutput"},{16,16}]; Real Code for the actual problem with some modification to simplify the input (which does not change the occurring problem):
(*data initialization*) imageData = Table[RandomReal[], {i, 200}, {j, 300}]; (*this would usually be some GrayScale ImageData*) f = Table[RandomReal[], {i, 200}, {j, 300}]; (*this would be some function evaluated on the imageData*) maxiter = 10000; testImageData = Table[{RandomReal[], RandomReal[]}, {i, 200}, {j, 300}]; (*would be the gradient of the imageData*) (*the for loop that leads to the memory overflow*) For[i = 1, i < maxiter, i++, test = First@CUDATestFunction[testImageData, 0.1, f, imageGradientNormalized, 1, Sequence @@ Dimensions[imageData]]; testImageData = test; ] (*CUDACode*) CUDATestFunction = CUDAFunctionLoad[" __device__ float length(const float2& a) { return sqrtf(a.x*a.x+a.y*a.y); } __device__ float2 operator+(const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); } __device__ float2 operator-(const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); } __device__ float2 operator*(const float& a, const float2& b) { return make_float2(a * b.x, a * b.y); } __device__ float2 operator/(const float2& a, const float& b) { return make_float2(a.x / b, a.y / b); } __global__ void resolvFs(float* p, float sigma, float* f, float* imageGradientNormalized, float lambda1, mint width, mint height) { int xIndex = threadIdx.x + blockIdx.x * blockDim.x; int yIndex = threadIdx.y + blockIdx.y * blockDim.y; int index = 2*(xIndex + yIndex * width); if(xIndex < width && yIndex < height) { float2 vecP = make_float2(p[index], p[index+1]); float2 vecN = make_float2(imageGradientNormalized[index], imageGradientNormalized[index+1]); vecP = vecP + 2 * sqrtf(lambda1) * vecN; vecP = vecP/fmaxf(1, length(vecP)/(2*sqrtf(f[index/2]+lambda1))); vecP = vecP - 2 * sqrtf(lambda1) * vecN; p[index] = vecP.x; p[index+1] = vecP.y; } }", "resolvFs", {{"Float", 3, "InputOutput"}, "Float", {"Float", 2, "Input"}, {"Float", 3, "Input"}, "Float", _Integer, _Integer}, {16, 16}]