A slight improvement over @orthopteroid's answer: this version pretty much ensures that a unique temporary file is generated, and it requires only one temporary file instead of two.
The following goes into scripts/get_cuda_sm.sh:
#!/bin/bash
#
# Prints the compute capability of the first CUDA device installed
# on the system, or alternatively the device whose index is the
# first command-line argument.
#
# Usage:   get_cuda_sm.sh [device_index]
# Output:  the capability in XY form (major * 10 + minor) on stdout,
#          e.g. "61" for compute capability 6.1.
# Exit:    0 on success; non-zero if the index is invalid, the helper
#          cannot be compiled, or the CUDA runtime reports an error.
#
# Environment (all optional):
#   CMAKE_CXX_COMPILER   C++ compiler to use (falls back to g++ on PATH)
#   CUDA_DIR             CUDA installation root (default: /usr/local/cuda)
#   CUDA_INCLUDE_DIRS    directory containing cuda_runtime_api.h
#   CUDA_CUDART_LIBRARY  full path of the CUDA runtime library to link

device_index=${1:-0}

# The index is pasted verbatim into the C++ source below, so accept digits only.
case "$device_index" in
    *[!0-9]*)
        echo "Device index must be a non-negative integer, got: '$device_index'" >&2
        exit 1
        ;;
esac

# Prefer the compiler handed over by the build system; otherwise look for g++.
cxx_binary=${CMAKE_CXX_COMPILER:-}
if [ -z "$cxx_binary" ]; then
    cxx_binary=$(command -v g++ || true)
    cxx_binary=${cxx_binary:-g++}
fi

cuda_root=${CUDA_DIR:-/usr/local/cuda}
CUDA_INCLUDE_DIRS=${CUDA_INCLUDE_DIRS:-${cuda_root}/include}
CUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY:-${cuda_root}/lib64/libcudart.so}

# A private directory from mktemp gives a unique, unpredictable location and
# lets the linker create the binary with its own (executable) permissions.
work_dir=$(mktemp -d "${TMPDIR:-/tmp}/cuda-compute-version-helper-XXXXXX") || {
    echo "Could not create a temporary directory" >&2
    exit 1
}
trap 'rm -rf "$work_dir"' EXIT
generated_binary="$work_dir/probe"

# Feed the probe program to the compiler through a 'here document'.
# "-x none" switches input-language detection back to automatic, so the
# CUDA runtime library that follows is treated as a library, not as C++.
"$cxx_binary" -x c++ -I"$CUDA_INCLUDE_DIRS" -o "$generated_binary" - \
    -x none "$CUDA_CUDART_LIBRARY" << EOF
#include <stdio.h>
#include <cuda_runtime_api.h>

int main()
{
    cudaDeviceProp prop;
    cudaError_t status;
    int device_count;

    status = cudaGetDeviceCount(&device_count);
    if (status != cudaSuccess) {
        fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", cudaGetErrorString(status));
        return -1;
    }
    if (${device_index} >= device_count) {
        fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", ${device_index}, device_count);
        return -1;
    }
    status = cudaGetDeviceProperties(&prop, ${device_index});
    if (status != cudaSuccess) {
        fprintf(stderr,"cudaGetDeviceProperties() for device ${device_index} failed: %s\n", cudaGetErrorString(status));
        return -1;
    }
    int v = prop.major * 10 + prop.minor;
    printf("%d\n", v);
    return 0;
}
EOF
compile_status=$?
if [ "$compile_status" -ne 0 ]; then
    echo "Failed to compile the CUDA probe with '$cxx_binary'" >&2
    exit 1
fi

# Probe the card; the EXIT trap removes the temporary directory afterwards.
"$generated_binary"
exit $?
and the following goes into CMakeLists.txt or a CMake module:
# Determine the CUDA compute capability to compile device code for, and add a
# matching -gencode option to CUDA_NVCC_FLAGS.
#
# Precedence:
#   1. CUDA_TARGET_COMPUTE_CAPABILITY, if already set (e.g. -D on the command
#      line or a previously cached value);
#   2. the CUDA_SM environment variable;
#   3. the result of running scripts/get_cuda_sm.sh, which probes the first
#      CUDA device on the machine running CMake.
#
# Sets: CUDA_TARGET_COMPUTE_CAPABILITY (cache), FORMATTED_COMPUTE_CAPABILITY,
#       CUDA_GENCODE, and appends to CUDA_NVCC_FLAGS.
if(NOT CUDA_TARGET_COMPUTE_CAPABILITY)
  set(cuda_sm_probe_result 0)
  if("$ENV{CUDA_SM}" STREQUAL "")
    # Hand the toolchain locations to the probe script through its environment.
    set(ENV{CUDA_INCLUDE_DIRS} "${CUDA_INCLUDE_DIRS}")
    set(ENV{CUDA_CUDART_LIBRARY} "${CUDA_CUDART_LIBRARY}")
    set(ENV{CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}")
    # Pass the script as an argument to bash (not through "bash -c") so that a
    # source directory containing spaces is not split by the shell.
    execute_process(
      COMMAND bash "${CMAKE_CURRENT_SOURCE_DIR}/scripts/get_cuda_sm.sh"
      RESULT_VARIABLE cuda_sm_probe_result
      OUTPUT_VARIABLE CUDA_TARGET_COMPUTE_CAPABILITY_
      OUTPUT_STRIP_TRAILING_WHITESPACE)
  else()
    string(STRIP "$ENV{CUDA_SM}" CUDA_TARGET_COMPUTE_CAPABILITY_)
  endif()

  # Fail at configure time with a usable hint, rather than emitting a broken
  # "arch=compute_" flag that only surfaces when nvcc runs.
  if(NOT "${cuda_sm_probe_result}" STREQUAL "0"
     OR "${CUDA_TARGET_COMPUTE_CAPABILITY_}" STREQUAL "")
    message(FATAL_ERROR
      "Could not determine the CUDA compute capability "
      "(probe result: '${cuda_sm_probe_result}'). Pass "
      "-DCUDA_TARGET_COMPUTE_CAPABILITY=XY or set the CUDA_SM environment "
      "variable, e.g. 61 for compute capability 6.1.")
  endif()

  # The value is stripped before it is cached, so later configure runs (which
  # skip this whole block) read a clean value from the cache.
  string(CONCAT cuda_capability_doc
    "CUDA compute capability of the (first) CUDA device on "
    "the system, in XY format (like the X.Y format but no dot); see table "
    "of features and capabilities by capability X.Y value at "
    "https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications")
  set(CUDA_TARGET_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY_}"
      CACHE STRING "${cuda_capability_doc}")

  # Display form: the last leading digit is the minor version, everything
  # before it the major one (61 -> 6.1, 100 -> 10.0).
  string(REGEX REPLACE "^([0-9]+)([0-9])" "\\1.\\2"
         FORMATTED_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY}")
  message(STATUS "CUDA device-side code will assume compute capability ${FORMATTED_COMPUTE_CAPABILITY}")
endif()

# Tolerate values carrying stray whitespace, e.g. a cache entry written by an
# earlier revision of this snippet that cached the script output unstripped.
string(STRIP "${CUDA_TARGET_COMPUTE_CAPABILITY}" CUDA_TARGET_COMPUTE_CAPABILITY)

# nvcc expects "arch=...,code=..." as a single token without whitespace.
set(CUDA_GENCODE
    "arch=compute_${CUDA_TARGET_COMPUTE_CAPABILITY},code=compute_${CUDA_TARGET_COMPUTE_CAPABILITY}")
list(APPEND CUDA_NVCC_FLAGS -gencode "${CUDA_GENCODE}")
nvidia-smi), then build a list of `-arch` flags based on the results? Especially in a cluster, the build system may contain a completely different GPU than the GPU-enabled cluster nodes, or even no GPU at all.