The preface to this question is that, quoting the CUDA C Programming Guide:
the fewer registers a kernel uses, the more threads and thread blocks are likely to reside on a multiprocessor, which can improve performance.
__launch_bounds__ and maxrregcount both limit register usage, but through two different mechanisms.
__launch_bounds__
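nvcc decides how many registers a __global__ function uses by balancing performance against the generality of the kernel launch configuration. In other words, the chosen register count "guarantees effectiveness" across different numbers of threads per block and of blocks per multiprocessor. However, if an approximate maximum number of threads per block and (optionally) a minimum number of blocks per multiprocessor are known at compile time, that information can be used to optimize the kernel for such launches. For example: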
/* Launch-bound hints passed to __launch_bounds__ below: the first is the
 * maximum threads-per-block value, the second the minimum number of blocks
 * per multiprocessor. */
#define MAX_THREADS_PER_BLOCK 256
#define MIN_BLOCKS_PER_MP 2
/* Example kernel annotated with launch bounds. The body is intentionally
 * empty in this snippet, so how inArr and outArr are used is not shown. */
__global__ void
__launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
fooKernel(int *inArr, int *outArr)
{
    /* ... kernel computation goes here ... */
}
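informs the compiler of the likely launch configuration, so that nvcc can choose the number of registers for that configuration in an "optimal" way.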
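The MAX_THREADS_PER_BLOCK argument is mandatory, while the MIN_BLOCKS_PER_MP argument is optional. Note also that if the kernel is launched with more threads per block than MAX_THREADS_PER_BLOCK, the kernel launch will fail.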
The limiting mechanism is described in the Programming Guide as follows:
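If launch bounds are specified, the compiler first derives from them the upper limit L on the number of registers the kernel should use to ensure that minBlocksPerMultiprocessor blocks (or a single block if minBlocksPerMultiprocessor is not specified) of maxThreadsPerBlock threads can reside on the multiprocessor. The compiler then optimizes register usage as follows: if the initial register usage is higher than L, the compiler reduces it until it is less than or equal to L, usually at the expense of more local memory usage and/or a higher number of instructions.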
Accordingly, __launch_bounds__ can lead to register spilling.
maxrregcount
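maxrregcount is a compiler flag that simply hard-limits the number of registers used to a value set by the user, in contrast to __launch_bounds__, by forcing the compiler to rearrange its register usage. When the compiler cannot stay below the imposed limit, it spills the excess to local memory, which physically resides in DRAM. Although these local variables are stored in DRAM, they can be cached in L1 and L2.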