I am trying to optimize histogram computation in CUDA. The GPU version gives me excellent speedup over the corresponding OpenMP CPU implementation. However, I suspect (and intuition agrees) that in my data most pixels fall into only a few histogram bins — for the sake of argument, suppose that out of 256 possible bins the pixels land in, say, just two of them.
The easiest approach is:
- load the input values into shared memory,
- use vectorized loads (for unsigned char, etc.) where necessary,
- accumulate with atomic additions into shared memory,
- then do a cooperative (coalesced) write of the block's histogram out to global memory.
Something like that:
// Per-block shared-memory histogram, flushed to the global histogram
// with one atomicAdd per bin.
//
// Expects a 1D launch. Preconditions (same as the original code):
//   - NUM_THREADS_PER_BLOCK == blockDim.x
//   - grid covers the data exactly (no tail guard on data[tid])
//   - every data[i] is a valid bin index in [0, NUM_BINS)
__global__ void shmem_atomics_reducer(int *data, int *count){
  uint tid = blockIdx.x*blockDim.x + threadIdx.x;

  // Per-block histogram. Sized by NUM_BINS, not by the thread count:
  // it is indexed by bin *value*, and the original sizing was only
  // correct by accident when NUM_BINS <= NUM_THREADS_PER_BLOCK.
  __shared__ int block_reduced[NUM_BINS];

  // Block-stride zeroing so the whole histogram is initialized even
  // when NUM_BINS > blockDim.x (the original zeroed only one slot per
  // thread).
  for(int i=threadIdx.x; i<NUM_BINS; i+=blockDim.x)
    block_reduced[i] = 0;
  __syncthreads();

  // Shared-memory atomics: contention is confined to the block.
  atomicAdd(&block_reduced[data[tid]], 1);
  __syncthreads();

  // Cooperative flush, block-stride. The original used `i += NUM_BINS`,
  // which silently dropped bins >= blockDim.x whenever
  // NUM_BINS > blockDim.x; `i += blockDim.x` handles any bin count.
  for(int i=threadIdx.x; i<NUM_BINS; i+=blockDim.x)
    atomicAdd(&count[i], block_reduced[i]);
}
This works reasonably well: I measure about 45 GB/s for 32 bins, but only about 10 GB/s for the degenerate 1-bin case, where atomic contention in shared memory serializes everything.
I then read a post on NVIDIA's Parallel Forall blog about warp-aggregated counting using the warp vote functions __ballot and __popc(), and tried the following:
// Histogram via warp ballots: for each bin, every warp counts its
// matching lanes with __ballot_sync/__popc, the per-warp counts are
// tree-reduced in shared memory by warp 0, and thread 0 issues one
// global atomicAdd per bin.
//
// Preconditions (same as the original code):
//   - NUM_THREADS_PER_BLOCK == blockDim.x, a multiple of 32
//   - NUM_WARPS_PER_BLOCK == blockDim.x / 32, a power of two (<= 32)
//   - grid covers the data exactly (no tail guard on data[tid])
//   - every data[i] is in [0, NUM_BINS)
// Requires CUDA 9+ for the *_sync intrinsics.
__global__ void ballot_popc_reducer(int *data, int *count ){
  uint tid = blockIdx.x*blockDim.x + threadIdx.x;
  uint warp_id = threadIdx.x >> 5;
  uint lane_id = threadIdx.x & 31;   // cheaper than %32, same result

  __shared__ uint warp_reduced_count[NUM_WARPS_PER_BLOCK];
  __shared__ uint s_data[NUM_THREADS_PER_BLOCK];

  // Stage the input once; the per-bin loop below re-reads it NUM_BINS
  // times, so shared memory avoids repeated global traffic.
  s_data[threadIdx.x] = data[tid];
  __syncthreads();

  for(int i=0; i<NUM_BINS; i++){
    // One bit per lane whose value equals bin i; __popc counts them.
    // __ballot_sync replaces the legacy mask-less __ballot, which was
    // removed for Volta+ (independent thread scheduling). All 32 lanes
    // are converged here, so the full mask is correct.
    uint warp_set_bits = __ballot_sync(0xffffffffu, s_data[threadIdx.x]==i);
    if(lane_id==0){
      warp_reduced_count[warp_id] = __popc(warp_set_bits);
    }
    __syncthreads();

    // Tree-reduce the per-warp counts. NOTE: the original placed a
    // __syncthreads() *inside* this `if(warp_id==0)` branch — a
    // divergent barrier, which is undefined behavior (deadlock-prone).
    // Only warp 0 participates in the reduction, so the correct
    // per-step barrier is __syncwarp(), which also fences shared
    // memory between steps.
    if(warp_id==0){
      for(int j = NUM_WARPS_PER_BLOCK/2; j>0; j>>=1){
        if(threadIdx.x < j)
          warp_reduced_count[threadIdx.x] += warp_reduced_count[threadIdx.x + j];
        __syncwarp();
      }
      if(threadIdx.x==0){
        atomicAdd(&count[i], warp_reduced_count[0]);
      }
    }
    // Keep the next iteration's writes to warp_reduced_count from
    // racing with warp 0's reduction of this iteration.
    __syncthreads();
  }
}
This works beautifully in the degenerate case — around 133 GB/s, close to raw bandwidth — compared to the shared-atomics version's 35-40 GB/s for 1 bin (and 10-15 GB/s in between). But once I go to 32 bins, performance collapses to roughly 5 GB/s, and I suspect the serial loop over NUM_BINS is to blame.
The serial loop over NUM_BINS is the problem: the ballot/reduce/atomicAdd sequence is repeated once per bin, so the work (and the barriers) scale linearly with the bin count even when most bins are empty. One idea is to parallelize over the bins as well — for example, assigning bins (or groups of bins) to the y-dimension of the thread grid.
I am not sure, though, how best to expose that extra parallelism without breaking the ballot trick, or whether it would actually pay off. Perhaps someone here has tackled this before.
Related article: [link]
Another constraint is shared memory: only 48 kB per SM on Maxwell, which limits how many per-block (or per-bin-slice) histograms can stay resident.
So — is there a better overall approach? I assumed this degenerate few-bin case must have been studied somewhere, but I could not find anything about it. Any pointers or suggestions would be much appreciated.
Edit: some context on the application — these are not ordinary image histograms. I am computing probability density functions (smoothed histograms, i.e. PDFs) via Parzen Windows Kernel Density Estimation, so each sample contributes to a neighborhood of bins; that is also why heavily concentrated (few-bin) distributions are the common case for me. Search keyword: Parzen.