Speeding up CUDA atomics calculation for many bins / few bins

I am trying to optimize histogram calculation in CUDA. It gives me an excellent speedup over the corresponding OpenMP CPU calculation. However, I suspect (in line with intuition) that most of the pixels fall into only a few buckets. For the sake of argument, assume that we have 256 pixels falling into, say, two buckets.
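The CPU baseline is nothing fancy, essentially an OpenMP histogram with per-thread private counts, roughly like this (simplified sketch, not the exact code; names are illustrative):

// Simplified sketch of the OpenMP CPU baseline (illustrative names;
// assumes every value in data[] is already a valid bin index).
#include <vector>
#include <omp.h>

std::vector<int> cpu_histogram(const int *data, int n, int num_bins) {
  std::vector<int> count(num_bins, 0);
  #pragma omp parallel
  {
    std::vector<int> local(num_bins, 0);      // private copy, no contention
    #pragma omp for nowait
    for (int i = 0; i < n; ++i)
      ++local[data[i]];
    #pragma omp critical                      // merge the private copies
    for (int b = 0; b < num_bins; ++b)
      count[b] += local[b];
  }
  return count;
}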

The easiest way to do this appears to be:

  • Load the data into shared memory
    • Use vectorized loads for unsigned char, etc., if necessary (see the uchar4 sketch after the kernel below).
  • Do an atomic add in shared memory
  • Do a coalesced write to global memory.

Something like this:

__global__ void shmem_atomics_reducer(int *data, int *count){
  uint tid = blockIdx.x*blockDim.x + threadIdx.x;

  // per-block histogram in shared memory
  // (assumes NUM_BINS <= NUM_THREADS_PER_BLOCK)
  __shared__ int block_reduced[NUM_THREADS_PER_BLOCK];
  block_reduced[threadIdx.x] = 0;

  __syncthreads();

  // each thread bins one element
  atomicAdd(&block_reduced[data[tid]], 1);
  __syncthreads();

  // coalesced flush of the per-block bins to the global histogram
  // (stride must be blockDim.x, not NUM_BINS)
  for(int i = threadIdx.x; i < NUM_BINS; i += blockDim.x)
    atomicAdd(&count[i], block_reduced[i]);
}
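The second bullet above, about vectorized loads, matters when the input is unsigned char rather than int. A sketch of that variant (assuming the pixel buffer is uchar4-aligned and the pixel values are already valid bin indices):

// Sketch: same shared-memory atomics, but each thread loads four
// unsigned char pixels with a single uchar4 load.
// Assumes NUM_BINS <= NUM_THREADS_PER_BLOCK and pixel values < NUM_BINS.
__global__ void shmem_atomics_reducer_u8(const uchar4 *data, int *count){
  uint tid = blockIdx.x*blockDim.x + threadIdx.x;

  __shared__ int block_reduced[NUM_THREADS_PER_BLOCK];
  block_reduced[threadIdx.x] = 0;
  __syncthreads();

  uchar4 p = data[tid];                 // one 32-bit load, four pixels
  atomicAdd(&block_reduced[p.x], 1);
  atomicAdd(&block_reduced[p.y], 1);
  atomicAdd(&block_reduced[p.z], 1);
  atomicAdd(&block_reduced[p.w], 1);
  __syncthreads();

  for(int i = threadIdx.x; i < NUM_BINS; i += blockDim.x)
    atomicAdd(&count[i], block_reduced[i]);
}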

The shared-memory atomics version gives about 45 GB/s at 32 bins, dropping to about 10 GB/s at 1 bin. With few bins, all the threads in a block keep hitting the same few shared-memory words, so the atomics serialize and throughput collapses.

Next, following the warp-aggregation idea from the parallelforall blog, I tried a warp-level approach: use __ballot to collect, within each warp, which threads fall into a given bin, and then __popc() to count them.

__global__ void ballot_popc_reducer(int *data, int *count){
  uint tid = blockIdx.x*blockDim.x + threadIdx.x;
  uint warp_id = threadIdx.x >> 5;

  //need lane_ids since we are going warp level
  uint lane_id = threadIdx.x % 32;

  //for ballot
  uint warp_set_bits = 0;

  //to store warp level sums
  __shared__ uint warp_reduced_count[NUM_WARPS_PER_BLOCK];
  //shared data
  __shared__ uint s_data[NUM_THREADS_PER_BLOCK];

  //load shared data - could store to registers
  s_data[threadIdx.x] = data[tid];

  __syncthreads();

  //suspicious loop - I think we need more parallelism
  for(int i = 0; i < NUM_BINS; i++){
    //one bit per lane whose element equals bin i
    //(on CUDA 9+ this would be __ballot_sync(0xffffffff, ...))
    warp_set_bits = __ballot(s_data[threadIdx.x] == i);

    if(lane_id == 0){
      warp_reduced_count[warp_id] = __popc(warp_set_bits);
    }

    __syncthreads();

    //do warp level reduce over the per-warp counts; all threads take part,
    //so __syncthreads() is never reached from a divergent branch
    //could use shfl, but it does not change the overall picture
    for(int j = NUM_WARPS_PER_BLOCK/2; j > 0; j >>= 1){
      if(threadIdx.x < j)
        warp_reduced_count[threadIdx.x] += warp_reduced_count[threadIdx.x + j];
      __syncthreads();
    }

    if(threadIdx.x == 0){
      atomicAdd(&count[i], warp_reduced_count[0]);
    }
  }
}

This gives much nicer numbers (on my card, whose peak memory bandwidth is about 133 GB/s) for the single-bin / few-bin case: 35-40 GB/s for 1 bin, as opposed to 10-15 GB/s with the shared-memory atomics version. But performance drops off drastically as the number of bins grows; with 32 bins I get only about 5 GB/s. The culprit, I suspect, is the serial loop over NUM_BINS.

So I have been thinking about how to parallelize over NUM_BINS as well, instead of looping over the bins serially inside every thread block. The most promising idea I have come up with so far is to put the bin index on the y-dimension of the block, so that the x-dimension keeps walking over the pixels while each row of threads handles its own bin.
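For concreteness, here is roughly what that y-dimension variant could look like (an untested sketch: it only works for NUM_BINS <= 32, because blockDim.x * blockDim.y may not exceed 1024 threads, and it processes only 32 elements per block):

// Untested sketch: bins mapped to threadIdx.y, so each warp row tests
// the same 32 elements against its own bin instead of looping over bins.
// Launch with blockDim = (32, NUM_BINS); requires NUM_BINS <= 32.
__global__ void ballot_popc_reducer_2d(int *data, int *count){
  uint elem = blockIdx.x*32 + threadIdx.x;   // 32 elements per block
  uint bin  = threadIdx.y;                   // one bin per warp row

  __shared__ int s_data[32];
  if(threadIdx.y == 0) s_data[threadIdx.x] = data[elem];
  __syncthreads();

  // each row is exactly one warp, so __ballot sees the 32 lanes of this row
  uint warp_set_bits = __ballot(s_data[threadIdx.x] == (int)bin);

  if(threadIdx.x == 0)
    atomicAdd(&count[bin], __popc(warp_set_bits));
}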

So, the question: is there a way to extract more parallelism here, or am I missing a better-known approach to this kind of histogram?

I know this is a well-studied problem, and I have looked at an article on it as well.

Note also that shared memory itself is a constraint: keeping private per-warp copies of the bins (or simply using a large NUM_BINS) quickly runs into the 48 kB shared-memory limit on Maxwell.
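Just to put a number on that limit (example figures, not my actual configuration): per-warp private copies need NUM_BINS * NUM_WARPS_PER_BLOCK counters, which grows fast.

// Back-of-the-envelope shared-memory budget for per-warp private bins
// (example figures only).
constexpr int NUM_BINS            = 256;   // example bin count
constexpr int NUM_WARPS_PER_BLOCK = 32;    // example warps per block
constexpr int SMEM_BYTES = NUM_BINS * NUM_WARPS_PER_BLOCK * sizeof(unsigned int);
static_assert(SMEM_BYTES <= 48 * 1024, "per-warp bins exceed 48 kB of shared memory");
// 256 bins x 32 warps x 4 B = 32 kB -- already two thirds of the 48 kB budget.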

Has anyone run into this situation before? Any pointers or references in the right direction would be much appreciated.

Edit: some context on why I care about the few-bins case. The distributions I am computing are not image histograms but probability density functions (pdf), as in Parzen Windows Kernel Density Estimation, and for such data most of the samples end up in only a few bins, which is exactly the slow regime above. Tag: Parzen

Answer:

I fought the same challenges, and the bottleneck turned out to be the same: the atomic updates. I could not make the problem go away completely, but I managed to reduce its impact, and here is how I went about it.

First, look at what the warps are actually doing before changing the code. I used the "Issue Efficiency" experiment in Nsight, which shows for every cycle whether the scheduler had an eligible warp to issue and, if not, why the warps were stalled (the per-warp stall reasons).

In my case the picture was bleak: around 95% of the cycles had no eligible warp to issue (it never went below roughly 90%).

[Nsight screenshot: issue efficiency, fraction of cycles with no eligible warp]

The stall reasons then tell you where those cycles go. Here the dominant cause is the atomics: whenever several threads hit the same bin, the hardware serializes their updates, and with only a few occupied bins that happens almost all the time. So the practical goal is to reduce the number of atomic operations each warp/block issues, and to spread the remaining ones over as many distinct locations as possible.

[Nsight screenshot: breakdown of warp issue stall reasons]

In the end I could not eliminate the serialization, only reduce its cost.

Here is, more or less, what I ended up with: the data is read as int4, so each load feeds four bin updates, and the bins live in shared memory:

__global__ void hist(int4 *data, int *count, int N, int rem, unsigned int init) {

    // per-block histogram (you may want to declare this one dynamically);
    // assumes blockDim.x >= N_OF_BINS
    __shared__ unsigned int sBins[N_OF_BINS];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    if (threadIdx.x < N_OF_BINS) sBins[threadIdx.x] = 0;
    __syncthreads();

    // grid-stride loop over the N int4 elements of this chunk
    // (init is the chunk offset, in int4 elements); one vectorized
    // load feeds four shared-memory bin updates
    for (int i = idx; i < N; i += stride) {
        int4 v = data[init + i];
        atomicAdd(&sBins[v.x], 1);
        atomicAdd(&sBins[v.y], 1);
        atomicAdd(&sBins[v.z], 1);
        atomicAdd(&sBins[v.w], 1);
    }

    // process the remaining elements if the data size is not a multiple of 4,
    // using a recast and an additional bound check
    if (idx < rem)
        atomicAdd(&sBins[reinterpret_cast<int*>(data)[(init + N) * 4 + idx]], 1);

    __syncthreads();

    // update the global histogram from the per-block copy
    if (threadIdx.x < N_OF_BINS)
        atomicAdd(&count[threadIdx.x], sBins[threadIdx.x]);
}
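For completeness, a rough sketch of the host side (the names h_data, h_count, nTotal and the sizes are illustrative, and I assume a single chunk, i.e. init = 0):

// Illustrative host-side launch; assumes nTotal ints of input whose
// values are already valid bin indices in [0, N_OF_BINS).
int *d_data, *d_count;
cudaMalloc(&d_data,  nTotal * sizeof(int));
cudaMalloc(&d_count, N_OF_BINS * sizeof(int));
cudaMemcpy(d_data, h_data, nTotal * sizeof(int), cudaMemcpyHostToDevice);
cudaMemset(d_count, 0, N_OF_BINS * sizeof(int));

int N   = nTotal / 4;                        // number of int4 elements
int rem = nTotal % 4;                        // leftover ints for the tail path

int threads = 256;                           // must be >= N_OF_BINS
int blocks  = (N + threads - 1) / threads;   // one thread per int4 element
if (blocks == 0) blocks = 1;                 // rem-only corner case

hist<<<blocks, threads>>>(reinterpret_cast<int4*>(d_data), d_count, N, rem, 0);
cudaMemcpy(h_count, d_count, N_OF_BINS * sizeof(int), cudaMemcpyDeviceToHost);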

Source: https://habr.com/ru/post/1654890/

