I'm using Unified Memory to simplify access to data on the CPU and GPU. As far as I know, cudaMallocManaged should allocate memory on the device. I wrote a simple program to check that:
#define TYPE float
#define BDIMX 16
#define BDIMY 16

#include <cuda.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Evaluates a CUDA runtime call and aborts with file/line context if it does
// not return cudaSuccess, so a failure is reported where it happens instead
// of surfacing later as an unrelated error.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            std::abort();                                                  \
        }                                                                  \
    } while (0)

/**
 * Stages each input element in a shared-memory tile and writes it back to
 * g_output multiplied by 1.3.
 *
 * Launch layout: 2D grid of 2D thread blocks, one thread per element.
 * Precondition: blockDim.x <= BDIMX and blockDim.y <= BDIMY, because the tile
 * is statically sized BDIMY x BDIMX and indexed with threadIdx.
 *
 * @param g_output destination array of dimx * dimy elements, row-major
 * @param g_input  source array of dimx * dimy elements, row-major
 * @param dimx     row length (number of columns)
 * @param dimy     number of rows
 */
__global__ void kernel(TYPE *g_output, TYPE *g_input, const int dimx, const int dimy)
{
    // Tile uses TYPE (not a hard-coded float) so it follows the element type.
    __shared__ TYPE s_data[BDIMY][BDIMX];

    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const int iy = blockIdx.y * blockDim.y + threadIdx.y;
    const int in_idx = iy * dimx + ix;  // index for reading input

    const int tx = threadIdx.x;  // thread's x-index into the shared memory tile
    const int ty = threadIdx.y;  // thread's y-index into the shared memory tile

    // Threads past the array edge (grid does not divide the data evenly) must
    // not touch global memory, but they still have to reach the barrier.
    const bool in_bounds = (ix < dimx) && (iy < dimy);

    if (in_bounds) {
        s_data[ty][tx] = g_input[in_idx];
    }

    // Kept outside the conditional: every thread of the block must hit it.
    __syncthreads();

    if (in_bounds) {
        // Cast the factor to TYPE so a float build multiplies in single
        // precision instead of being promoted to double by a bare 1.3 literal.
        g_output[in_idx] = s_data[ty][tx] * static_cast<TYPE>(1.3);
    }
}

/**
 * Allocates two managed size_x * size_y arrays, runs `kernel` over them once,
 * waits for completion, and releases the arrays.
 *
 * The host intentionally never reads or writes the arrays; the input contents
 * are therefore unspecified and only the memory traffic is of interest.
 */
int main()
{
    const int size_x = 16, size_y = 16;

    // Integer ceil-division: enough blocks to cover sizes that are not a
    // multiple of the block dimensions, without going through floating point.
    const dim3 tbSize(BDIMX, BDIMY);
    const dim3 numTB((size_x + BDIMX - 1) / BDIMX,
                     (size_y + BDIMY - 1) / BDIMY);

    const size_t bytes = static_cast<size_t>(size_x) * size_y * sizeof(TYPE);

    TYPE *a = NULL;
    TYPE *a_out = NULL;
    CUDA_CHECK(cudaMallocManaged((void **)&a, bytes));
    CUDA_CHECK(cudaMallocManaged((void **)&a_out, bytes));

    kernel<<<numTB, tbSize>>>(a_out, a, size_x, size_y);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(a_out));
    return 0;
}

/*
 * So I'm not even accessing the data on the CPU to avoid any page faults, so
 * the memory should supposedly be on the device memory. However, when I run
 * nvprof on this code, I get the following results:
 */
Invocations  Metric Name                 Metric Description                 Min  Max  Avg
Device "Tesla K40c (0)"
    Kernel: kernel(float*, float*, int, int)
          1  local_load_transactions     Local Load Transactions              0    0    0
          1  local_store_transactions    Local Store Transactions             0    0    0
          1  shared_load_transactions    Shared Load Transactions             8    8    8
          1  shared_store_transactions   Shared Store Transactions            8    8    8
          1  gld_transactions            Global Load Transactions             8    8    8
          1  gst_transactions            Global Store Transactions            8    8    8
          1  sysmem_read_transactions    System Memory Read Transactions     32   32   32
          1  sysmem_write_transactions   System Memory Write Transactions    34   34   34
          1  tex_cache_transactions      Texture Cache Transactions           0    0    0
          1  dram_read_transactions      Device Memory Read Transactions      0    0    0
          1  dram_write_transactions     Device Memory Write Transactions     0    0    0

So apparently the arrays are allocated in system memory rather than in device memory. What am I missing here?