Latency of randomly indexed shuffle instructions in CUDA

About __shfl() instruction latency:

Does the following statement

 c = __shfl(c, indi); /* where indi is an arbitrary integer (possibly random, < 32) and differs from lane to lane */ 

have the same latency as:

 c = __shfl_down(c, 1); 
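For concreteness, the two calls could sit in a kernel roughly like the minimal sketch below (the kernel name, the indices array, and the way indi is obtained are made up for illustration; the legacy pre-CUDA-9 __shfl form from the question is used):

__global__ void shuffle_demo(const int *indices, float *out)
{
    int lane = threadIdx.x % 32;
    float c  = (float)threadIdx.x;

    int indi = indices[lane] & 31;        // arbitrary (possibly random) source lane, different per lane
    float gathered = __shfl(c, indi);     // arbitrary gather across the warp
    float shifted  = __shfl_down(c, 1);   // fixed shift by one lane

    out[threadIdx.x] = gathered + shifted;
}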

All warp-shuffle instructions have the same performance.

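One way to check this empirically is a rough clock64()-based timing kernel like the sketch below (the kernel name, loop count, and measurement setup are my assumptions; the legacy pre-CUDA-9 __shfl form from the question is used). Launched with a single warp, the two dependent shuffle chains should report essentially the same cycle count:

__global__ void time_shuffles(const int *indices, long long *cycles, float *sink)
{
    int lane = threadIdx.x % 32;
    int indi = indices[lane] & 31;   // arbitrary per-lane source index
    float c  = (float)lane;

    long long t0 = clock64();
    #pragma unroll 1
    for (int i = 0; i < 1000; ++i) c = __shfl(c, indi);    // random-index shuffle, dependent chain
    long long t1 = clock64();
    #pragma unroll 1
    for (int i = 0; i < 1000; ++i) c = __shfl_down(c, 1);  // fixed-offset shuffle, dependent chain
    long long t2 = clock64();

    if (threadIdx.x == 0) { cycles[0] = t1 - t0; cycles[1] = t2 - t1; }
    sink[threadIdx.x] = c;   // keep the compiler from removing the loops
}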

To give a "quantitative" flavor to Robert's answer, let's look at Mark Harris' reduction approach using the CUDA shuffle operations, detailed in Faster Parallel Reductions on Kepler.

In this approach, the warp reduction is done with __shfl_down. An alternative warp-reduction approach uses __shfl_xor, following Lecture 4: warp shuffles, and reduction / scan operations. Below I report the complete code implementing both approaches. When tested on a Kepler K20c, both take 0.044 ms to reduce an array of N=200000 float elements. Both approaches also outperform Thrust's reduce by more than an order of magnitude, since the execution time for the Thrust case in the same test is 1.06 ms.
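For reference, the __shfl_xor (butterfly) variant of the warp reduction appears only as a commented-out line in the full listing below; written out as its own function (the function name is mine, warpSize is the 32-lane warp width as in the listing), it looks roughly like this:

__forceinline__ __device__ float warpReduceSumXor(float val) {
    // Butterfly exchange: after the loop every lane holds the full warp sum,
    // whereas the __shfl_down version leaves the complete sum only in lane 0.
    for (int i = 1; i < warpSize; i *= 2) val += __shfl_xor(val, i);
    return val;
}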

Here is the complete code:

#include <cstdio>
#include <algorithm>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>

#define warpSize 32

/***********************************************/
/* warpReduceSum PERFORMING REDUCTION PER WARP */
/***********************************************/
__forceinline__ __device__ float warpReduceSum(float val) {

    for (int offset = warpSize / 2; offset > 0; offset /= 2) val += __shfl_down(val, offset);
    //for (int i = 1; i < warpSize; i *= 2) val += __shfl_xor(val, i);
    return val;
}

/*************************************************/
/* blockReduceSum PERFORMING REDUCTION PER BLOCK */
/*************************************************/
__forceinline__ __device__ float blockReduceSum(float val) {

    // --- The shared memory holds the per-warp reduction results. Since the maximum number of threads per block is
    //     1024, there are at most 32 warps per block.
    static __shared__ float shared[32];

    int lane = threadIdx.x % warpSize;   // Thread index within the warp
    int wid  = threadIdx.x / warpSize;   // Warp ID

    // --- Performing the warp reduction. Only the threads with lane index 0 hold the warp reduction result in "val"
    val = warpReduceSum(val);

    // --- Only the threads with lane index 0 write the warp result to shared memory
    if (lane == 0) shared[wid] = val;

    // --- Wait for all warp reductions
    __syncthreads();

    // --- There are at most 1024 threads within a block and at most 1024 blocks within a grid. The partial sum is read
    //     from shared memory only if the corresponding warp existed, otherwise the partial sum is set to zero.
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

    // --- The first warp performs the final reduction of the partial warp sums.
    if (wid == 0) val = warpReduceSum(val);

    return val;
}

/********************/
/* REDUCTION KERNEL */
/********************/
__global__ void deviceReduceKernel(float *in, float *out, int N) {

    float sum = 0.f;

    // --- Reduce multiple elements per thread.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) sum += in[i];

    sum = blockReduceSum(sum);

    if (threadIdx.x == 0) out[blockIdx.x] = sum;
}

/********/
/* MAIN */
/********/
int main() {

    const int N = 200000;

    thrust::host_vector<float>   h_out(N, 0.f);

    thrust::device_vector<float> d_in(N, 3.f);
    thrust::device_vector<float> d_out(N);

    int threads = 512;
    int blocks  = std::min((N + threads - 1) / threads, 1024);

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // --- Performs the block reduction. It returns an output vector containing the block reductions as elements
    cudaEventRecord(start, 0);
    deviceReduceKernel<<<blocks, threads>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
    // --- Performs a second reduction with only one block. The input consists of the first "blocks" elements of d_out,
    //     which hold the block reduction results of the previous step.
    deviceReduceKernel<<<1, 1024>>>(thrust::raw_pointer_cast(d_out.data()), thrust::raw_pointer_cast(d_out.data()), blocks);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("CUDA Shuffle - elapsed time: %3.5f ms \n", time);
    h_out = d_out;

    cudaEventRecord(start, 0);
    float sum = thrust::reduce(d_in.begin(), d_in.end(), 0.f, thrust::plus<float>());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("CUDA Thrust - elapsed time: %3.5f ms \n", time);

    printf("Shuffle result = %f\n", h_out[0]);
    printf("Thrust result  = %f\n", sum);

    getchar();

    return 0;
}
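A note for newer toolkits: the listing uses the legacy __shfl / __shfl_down / __shfl_xor intrinsics, which were deprecated in CUDA 9 in favor of the *_sync variants that take an explicit participation mask. A sketch of the warp reduction in that form (same logic, only the synchronized intrinsic swapped in):

__forceinline__ __device__ float warpReduceSum(float val) {
    // 0xffffffff: all 32 lanes of the warp take part in the shuffle
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}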

Source: https://habr.com/ru/post/1497515/

