
I'm using Unified Memory to simplify access to data from both the CPU and the GPU. As far as I know, cudaMallocManaged should allocate memory on the device. I wrote a simple program to check that:

#define TYPE float
#define BDIMX 16
#define BDIMY 16

#include <cuda.h>
#include <cmath>
#include <cstdio>
#include <iostream>

__global__ void kernel(TYPE *g_output, TYPE *g_input, const int dimx, const int dimy)
{
    __shared__ float s_data[BDIMY][BDIMX];

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int in_idx = iy * dimx + ix; // index for reading input

    int tx = threadIdx.x; // thread's x-index into corresponding shared memory tile
    int ty = threadIdx.y; // thread's y-index into corresponding shared memory tile

    s_data[ty][tx] = g_input[in_idx];
    __syncthreads();

    g_output[in_idx] = s_data[ty][tx] * 1.3;
}

int main()
{
    int size_x = 16, size_y = 16;

    dim3 numTB;
    numTB.x = (int)ceil((double)(size_x)/(double)BDIMX);
    numTB.y = (int)ceil((double)(size_y)/(double)BDIMY);
    dim3 tbSize;
    tbSize.x = BDIMX;
    tbSize.y = BDIMY;

    float* a, *a_out;
    cudaMallocManaged((void**)&a, size_x * size_y * sizeof(TYPE));
    cudaMallocManaged((void**)&a_out, size_x * size_y * sizeof(TYPE));

    kernel<<<numTB, tbSize>>>(a_out, a, size_x, size_y);
    cudaDeviceSynchronize();

    return 0;
}

I'm deliberately not accessing the data on the CPU, to avoid any page faults, so the memory should supposedly reside in device memory. However, when I run nvprof on this code, I get the following results:

Invocations                 Metric Name                  Metric Description                Min   Max   Avg
Device "Tesla K40c (0)"
    Kernel: kernel(float*, float*, int, int)
          1   local_load_transactions      Local Load Transactions             0     0     0
          1   local_store_transactions     Local Store Transactions            0     0     0
          1   shared_load_transactions     Shared Load Transactions            8     8     8
          1   shared_store_transactions    Shared Store Transactions           8     8     8
          1   gld_transactions             Global Load Transactions            8     8     8
          1   gst_transactions             Global Store Transactions           8     8     8
          1   sysmem_read_transactions     System Memory Read Transactions    32    32    32
          1   sysmem_write_transactions    System Memory Write Transactions   34    34    34
          1   tex_cache_transactions       Texture Cache Transactions          0     0     0
          1   dram_read_transactions       Device Memory Read Transactions     0     0     0
          1   dram_write_transactions      Device Memory Write Transactions    0     0     0

So apparently the array is allocated in system memory rather than device memory. What am I missing here?

  • Do you have multiple GPUs in your system? UM behaves differently when there are multiple GPUs in the system that are not P2P capable. If that is the case, try profiling your code with CUDA_VISIBLE_DEVICES="0" (a quick check for this is sketched after these comments). Commented Apr 21, 2016 at 21:15
  • You should provide some basic info about your hardware and environment ;) Commented Apr 22, 2016 at 8:36
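
To check whether the multi-GPU / P2P situation from the first comment applies, a minimal sketch along these lines (using only the standard runtime calls cudaGetDeviceCount and cudaDeviceCanAccessPeer) would report the visible devices and their pairwise peer capability:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int n = 0;
    cudaGetDeviceCount(&n);
    printf("Visible GPUs: %d\n", n);

    // Check pairwise peer-to-peer capability between all visible devices
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            if (i == j) continue;
            int p2p = 0;
            cudaDeviceCanAccessPeer(&p2p, i, j);
            printf("P2P %d -> %d: %s\n", i, j, p2p ? "yes" : "no");
        }
    }
    return 0;
}

Running it with and without CUDA_VISIBLE_DEVICES="0" should show whether the profiled process actually sees more than one GPU.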

2 Answers


Managed memory really does allocate physical memory on the GPU. You can confirm this yourself by modifying your code along the following lines:

#include <iostream>

void report_gpu_mem()
{
    size_t free, total;
    cudaMemGetInfo(&free, &total);
    std::cout << "Free = " << free << " Total = " << total << std::endl;
}

int main()
{
    float* a, *a_out;
    size_t sz = 1 << 24; // 16MB

    report_gpu_mem();
    cudaMallocManaged((void**)&a, sz);
    report_gpu_mem();
    cudaMallocManaged((void**)&a_out, sz);
    report_gpu_mem();
    cudaFree(a);
    report_gpu_mem();
    cudaFree(a_out);
    report_gpu_mem();

    return cudaDeviceReset();
}

This allocates 16MB for each of two managed allocations and then frees them. No host or device access occurs, so there should be no triggered transfers or synchronisation. The size is large enough to exceed the minimum granularity of the GPU memory manager and trigger visible changes in free memory. Compiling and running it does this:

$ nvcc -arch=sm_52 sleepy.cu
$ CUDA_VISIBLE_DEVICES="0" ./a.out
Free = 4211929088 Total = 4294770688
Free = 4194869248 Total = 4294770688
Free = 4178092032 Total = 4294770688
Free = 4194869248 Total = 4294770688
Free = 4211654656 Total = 4294770688

The physical free memory on the GPU is clearly being decremented and incremented by 16MB at each alloc/free.



Things have changed in newer CUDA versions. On my RTX 3070 with CUDA 12.3, the code above results in

Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344

If I actually use the allocated memory with a simple assignment in a kernel,

#include <iostream>

void report_gpu_mem()
{
    size_t free, total;
    cudaMemGetInfo(&free, &total);
    std::cout << "Free = " << free << " Total = " << total << std::endl;
}

__global__ void usememory(float* a)
{
    a[0] = 1;
}

int main()
{
    float* a, *a_out;
    size_t sz = 1 << 24; // 16MB

    report_gpu_mem();
    cudaMallocManaged((void**)&a, sz);
    usememory<<<1, 1>>>(a);
    report_gpu_mem();
    cudaMallocManaged((void**)&a_out, sz);
    report_gpu_mem();
    cudaFree(a);
    report_gpu_mem();
    cudaFree(a_out);
    report_gpu_mem();

    return cudaDeviceReset();
}

I got

Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8173191168 Total = 8361017344
Free = 8041070592 Total = 8361017344
Free = 8041070592 Total = 8361017344

It seems that there is something like a lazy copy going on? And I don't know why the free GPU memory becomes smaller at the step where the first array is freed.
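
That pattern is consistent with the managed pages being populated lazily. One way to test this, sketched below and not verified on this exact setup (cudaMemPrefetchAsync is a standard runtime call, but it requires a GPU that reports concurrentManagedAccess), is to explicitly migrate the range to the device before measuring:

#include <iostream>
#include <cuda_runtime.h>

void report_gpu_mem()
{
    size_t free, total;
    cudaMemGetInfo(&free, &total);
    std::cout << "Free = " << free << " Total = " << total << std::endl;
}

int main()
{
    float* a;
    size_t sz = 1 << 24; // 16MB

    report_gpu_mem();
    cudaMallocManaged((void**)&a, sz);
    report_gpu_mem();               // may be unchanged if pages are not yet populated

    cudaMemPrefetchAsync(a, sz, 0); // ask the driver to migrate the whole range to device 0
    cudaDeviceSynchronize();
    report_gpu_mem();               // free memory should now drop by roughly 16MB

    cudaFree(a);
    report_gpu_mem();
    return 0;
}

If population is indeed lazy, the first report after cudaMallocManaged should be unchanged, while the report after the prefetch should show roughly 16MB less free memory.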

