I am trying to optimize histogram computation in CUDA. The GPU version gives me excellent speedup over the corresponding OpenMP CPU implementation. However, I suspect (and intuition agrees) that in my data most pixels fall into only a few histogram bins — for the sake of argument, suppose that out of 256 possible bins the pixels land in, say, just two of them.
The easiest approach is:
- load the input values into shared memory,
- use vectorized loads (for unsigned char, etc.) where necessary,
- accumulate with atomic additions into shared memory,
- then do a cooperative (coalesced) write of the block's histogram out to global memory.
Something like that:
// Per-block shared-memory histogram, flushed to the global histogram
// with one atomicAdd per bin.
//
// Expects a 1D launch. Preconditions (same as the original code):
//   - NUM_THREADS_PER_BLOCK == blockDim.x
//   - grid covers the data exactly (no tail guard on data[tid])
//   - every data[i] is a valid bin index in [0, NUM_BINS)
__global__ void shmem_atomics_reducer(int *data, int *count){
  uint tid = blockIdx.x*blockDim.x + threadIdx.x;

  // Per-block histogram. Sized by NUM_BINS, not by the thread count:
  // it is indexed by bin *value*, and the original sizing was only
  // correct by accident when NUM_BINS <= NUM_THREADS_PER_BLOCK.
  __shared__ int block_reduced[NUM_BINS];

  // Block-stride zeroing so the whole histogram is initialized even
  // when NUM_BINS > blockDim.x (the original zeroed only one slot per
  // thread).
  for(int i=threadIdx.x; i<NUM_BINS; i+=blockDim.x)
    block_reduced[i] = 0;
  __syncthreads();

  // Shared-memory atomics: contention is confined to the block.
  atomicAdd(&block_reduced[data[tid]], 1);
  __syncthreads();

  // Cooperative flush, block-stride. The original used `i += NUM_BINS`,
  // which silently dropped bins >= blockDim.x whenever
  // NUM_BINS > blockDim.x; `i += blockDim.x` handles any bin count.
  for(int i=threadIdx.x; i<NUM_BINS; i+=blockDim.x)
    atomicAdd(&count[i], block_reduced[i]);
}
This works reasonably well: I measure about 45 GB/s for 32 bins, but only about 10 GB/s for the degenerate 1-bin case, where atomic contention in shared memory serializes everything.
I then read a post on NVIDIA's Parallel Forall blog about warp-aggregated counting using the warp vote functions __ballot and __popc(), and tried the following:
// Histogram via warp ballots: for each bin, every warp counts its
// matching lanes with __ballot_sync/__popc, the per-warp counts are
// tree-reduced in shared memory by warp 0, and thread 0 issues one
// global atomicAdd per bin.
//
// Preconditions (same as the original code):
//   - NUM_THREADS_PER_BLOCK == blockDim.x, a multiple of 32
//   - NUM_WARPS_PER_BLOCK == blockDim.x / 32, a power of two (<= 32)
//   - grid covers the data exactly (no tail guard on data[tid])
//   - every data[i] is in [0, NUM_BINS)
// Requires CUDA 9+ for the *_sync intrinsics.
__global__ void ballot_popc_reducer(int *data, int *count ){
  uint tid = blockIdx.x*blockDim.x + threadIdx.x;
  uint warp_id = threadIdx.x >> 5;
  uint lane_id = threadIdx.x & 31;   // cheaper than %32, same result

  __shared__ uint warp_reduced_count[NUM_WARPS_PER_BLOCK];
  __shared__ uint s_data[NUM_THREADS_PER_BLOCK];

  // Stage the input once; the per-bin loop below re-reads it NUM_BINS
  // times, so shared memory avoids repeated global traffic.
  s_data[threadIdx.x] = data[tid];
  __syncthreads();

  for(int i=0; i<NUM_BINS; i++){
    // One bit per lane whose value equals bin i; __popc counts them.
    // __ballot_sync replaces the legacy mask-less __ballot, which was
    // removed for Volta+ (independent thread scheduling). All 32 lanes
    // are converged here, so the full mask is correct.
    uint warp_set_bits = __ballot_sync(0xffffffffu, s_data[threadIdx.x]==i);
    if(lane_id==0){
      warp_reduced_count[warp_id] = __popc(warp_set_bits);
    }
    __syncthreads();

    // Tree-reduce the per-warp counts. NOTE: the original placed a
    // __syncthreads() *inside* this `if(warp_id==0)` branch — a
    // divergent barrier, which is undefined behavior (deadlock-prone).
    // Only warp 0 participates in the reduction, so the correct
    // per-step barrier is __syncwarp(), which also fences shared
    // memory between steps.
    if(warp_id==0){
      for(int j = NUM_WARPS_PER_BLOCK/2; j>0; j>>=1){
        if(threadIdx.x < j)
          warp_reduced_count[threadIdx.x] += warp_reduced_count[threadIdx.x + j];
        __syncwarp();
      }
      if(threadIdx.x==0){
        atomicAdd(&count[i], warp_reduced_count[0]);
      }
    }
    // Keep the next iteration's writes to warp_reduced_count from
    // racing with warp 0's reduction of this iteration.
    __syncthreads();
  }
}
This works beautifully in the degenerate case — around 133 GB/s, close to raw bandwidth — compared to the shared-atomics version's 35-40 GB/s for 1 bin (and 10-15 GB/s in between). But once I go to 32 bins, performance collapses to roughly 5 GB/s, and I suspect the serial loop over NUM_BINS is to blame.
The serial loop over NUM_BINS is the problem: the ballot/reduce/atomicAdd sequence is repeated once per bin, so the work (and the barriers) scale linearly with the bin count even when most bins are empty. One idea is to parallelize over the bins as well — for example, assigning bins (or groups of bins) to the y-dimension of the thread grid.
I am not sure, though, how best to expose that extra parallelism without breaking the ballot trick, or whether it would actually pay off. Perhaps someone here has tackled this before.
Related article: [link]
Another constraint is shared memory: only 48 kB per SM on Maxwell, which limits how many per-block (or per-bin-slice) histograms can stay resident.
So — is there a better overall approach? I assumed this degenerate few-bin case must have been studied somewhere, but I could not find anything about it. Any pointers or suggestions would be much appreciated.
Edit: some context on the application — these are not ordinary image histograms. I am computing probability density functions (smoothed histograms, i.e. PDFs) via Parzen Windows Kernel Density Estimation, so each sample contributes to a neighborhood of bins; that is also why heavily concentrated (few-bin) distributions are the common case for me. Search keyword: Parzen.