Reducing many small arrays of uneven size in CUDA

I am wondering if anyone can suggest a better approach for calculating the average / standard deviation of a large number of relatively small but differently sized arrays in CUDA?

The parallel reduction example in the SDK works on a single very large array whose size is conveniently a multiple of the number of threads per block, but my case is rather different:

Basically, I have a large number of objects, each of which contains two components, upper and lower, and each of these components has an x and a y coordinate, i.e.

upper.x, lower.x, upper.y, lower.y

Each of these arrays is approximately 800 elements long, but the length varies between objects (not within an object), for example:

Object1.lower.x = 1.1, 2.2, 3.3
Object1.lower.y = 4.4, 5.5, 6.6
Object1.upper.x = 7.7, 8.8, 9.9
Object1.upper.y = 1.1, 2.2, 3.3

Object2.lower.x = 1.0,  2.0,  3.0,  4.0, 5.0 
Object2.lower.y = 6.0,  7.0,  8.0,  9.0, 10.0
Object2.upper.x = 11.0, 12.0, 13.0, 14.0, 15.0 
Object2.upper.y = 16.0, 17.0, 18.0, 19.0, 20.0

Note that the above is just my way of illustrating the data; it does not have to be stored in C structs like this - the data can be organised in whatever way works best. The point is that, for each of these arrays, the mean and the standard deviation need to be calculated.
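
For concreteness, one way I could organise the data for the device (names here are just illustrative) is a single flat buffer per component plus a per-object offsets array, e.g.:

#include <vector>

// Illustrative packing: concatenate one component (e.g. lower.x) of every
// object into a single flat buffer and record where each object starts.
// The same would be repeated for lower.y, upper.x and upper.y.
void pack_component(const std::vector<std::vector<float>> &per_object, // one vector per object
                    std::vector<float> &flat,                          // concatenated values
                    std::vector<int>   &offsets)                       // numObjects + 1 entries
{
    flat.clear();
    offsets.assign(1, 0);
    for (const std::vector<float> &obj : per_object)
    {
        flat.insert(flat.end(), obj.begin(), obj.end());
        offsets.push_back((int)flat.size());
    }
    // flat.data() and offsets.data() would then be copied to the device;
    // object i occupies elements [offsets[i], offsets[i+1]) of the flat buffer.
}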

So how should this be approached on the GPU? Each array is small and the lengths differ between objects, so the standard single-large-array reduction does not seem to map onto the problem directly.


If I understand correctly, you want to reduce Object1.lower.x to a single value, Object1.lower.y to a single value, and so on. For any given object, there are four arrays to be reduced, all of (roughly) the same length.

There are many possible approaches to this; one influencing factor is the total number of objects in your system. I will assume that number is large.

For best performance you want a coalesced memory access pattern and you want to avoid warp divergence. With the naive approach of assigning one array per thread, as below, not only is the memory access pattern poor, but on every iteration the h/w has to check which threads of the warp still need to execute the loop - the ones that do not are effectively disabled, which is wasteful (especially if, say, one array is much longer than the others).

for (int i = 0 ; i < myarraylength ; i++)
    sum += myarray[i];

If instead each warp sums one array, not only is this more efficient, but the memory access pattern is also much better, because adjacent threads read adjacent elements [1].

for (int i = tidwithinwarp ; i < warparraylength ; i += warpsize)
{
    mysum += warparray[i];
}
mysum = warpreduce(mysum);
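
The warpreduce call above is left as a placeholder; a minimal sketch of a shuffle-based warp sum (assuming CUDA 9 or later and a full, converged 32-thread warp) could look like this:

__device__ float warpreduce(float val)
{
    // Each step adds the value held by the lane `offset` positions higher,
    // so after log2(32) = 5 steps lane 0 holds the warp-wide sum.
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}

On older toolkits or hardware the same per-warp reduction would be done through shared memory instead.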

You should also take the alignment of the arrays into account, preferably aligning them on a 64-byte boundary, although if you are targeting compute capability 1.2 or higher this is not as important as on older GPUs.

In this scheme you would launch four warps per block, i.e. 128 threads, and as many blocks as you have objects.

[1] That is, thread [0] reads element [0], thread [1] reads element [1], and so on, so the loads issued by a warp are coalesced into as few memory transactions as possible.
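
Putting these pieces together, a rough sketch of such a kernel (buffer and parameter names are hypothetical; it assumes one flat buffer per component with per-object start offsets, one block per object and one warp per component array) might be:

__global__ void object_stats(const float * __restrict__ lower_x,
                             const float * __restrict__ lower_y,
                             const float * __restrict__ upper_x,
                             const float * __restrict__ upper_y,
                             const int   * __restrict__ offsets,  // numObjects + 1 entries
                             float *mean, float *stddev)          // numObjects * 4 results each
{
    const int obj  = blockIdx.x;            // one block per object
    const int warp = threadIdx.x / 32;      // which of the four arrays this warp handles
    const int lane = threadIdx.x % 32;

    const float *arrays[4] = { lower_x, lower_y, upper_x, upper_y };
    const int begin = offsets[obj];
    const int len   = offsets[obj + 1] - begin;  // same length for all four arrays of this object
    const float *arr = arrays[warp] + begin;

    // Warp-strided loop: adjacent lanes read adjacent elements (coalesced).
    float sum = 0.0f, sumsq = 0.0f;
    for (int i = lane; i < len; i += 32)
    {
        float v = arr[i];
        sum   += v;
        sumsq += v * v;
    }

    // Shuffle-based warp reductions of the two partial sums.
    for (int offset = 16; offset > 0; offset /= 2)
    {
        sum   += __shfl_down_sync(0xffffffff, sum,   offset);
        sumsq += __shfl_down_sync(0xffffffff, sumsq, offset);
    }

    if (lane == 0)
    {
        float m = sum / len;
        mean  [obj * 4 + warp] = m;
        stddev[obj * 4 + warp] = sqrtf(fmaxf(sumsq / len - m * m, 0.0f));
    }
}

// Launched with four warps per block and one block per object, e.g.
// object_stats<<<numObjects, 128>>>(d_lx, d_ly, d_ux, d_uy, d_offsets, d_mean, d_std);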


Following the approach described above, here is how the warp reduction can be performed using CUB.

Here is a fully worked example:

#include <cub/cub.cuh>
#include <cuda.h>

#include "Utilities.cuh"

#include <iostream>

#define WARPSIZE    32
#define BLOCKSIZE   256

const int N = 1024;

/*************************/
/* WARP REDUCTION KERNEL */
/*************************/
__global__ void sum(const float * __restrict__ indata, float * __restrict__ outdata) {

    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    unsigned int warp_id = threadIdx.x / WARPSIZE;

    // --- Specialize WarpReduce for type float. 
    typedef cub::WarpReduce<float, WARPSIZE> WarpReduce;

    // --- Allocate WarpReduce shared memory for (BLOCKSIZE / WARPSIZE) warps per block
    __shared__ typename WarpReduce::TempStorage temp_storage[BLOCKSIZE / WARPSIZE];

    float result;
    if(tid < N) result = WarpReduce(temp_storage[warp_id]).Sum(indata[tid]);

    if(tid % WARPSIZE == 0) outdata[tid / WARPSIZE] = result;
}

/********/
/* MAIN */
/********/
int main() {

    // --- Allocate host-side memory for the input data and the results
    float *h_data       = (float *)malloc(N * sizeof(float));
    float *h_result     = (float *)malloc((N / WARPSIZE) * sizeof(float));

    float *d_data;      gpuErrchk(cudaMalloc(&d_data, N * sizeof(float)));
    float *d_result;    gpuErrchk(cudaMalloc(&d_result, (N / WARPSIZE) * sizeof(float)));

    for (int i = 0; i < N; i++) h_data[i] = (float)i;

    gpuErrchk(cudaMemcpy(d_data, h_data, N * sizeof(float), cudaMemcpyHostToDevice));

    sum<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_data, d_result);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_result, d_result, (N / WARPSIZE) * sizeof(float), cudaMemcpyDeviceToHost));

    std::cout << "output: ";
    for(int i = 0; i < (N / WARPSIZE); i++) std::cout << h_result[i] << " ";
    std::cout << std::endl;

    gpuErrchk(cudaFree(d_data));
    gpuErrchk(cudaFree(d_result));

    return 0;
}

In this example, the N input elements are reduced in chunks of 32 (one warp per chunk), so that

result[0] = data[0] + ... + data[31];
result[1] = data[32] + ... + data[63];
....
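
To adapt this to the variable-length arrays of the question, one possible sketch (reusing the WARPSIZE / BLOCKSIZE definitions above; the offsets layout and the numArrays parameter are assumptions) is a warp-strided accumulation followed by the same CUB warp reduction:

__global__ void sum_varlen(const float * __restrict__ indata,
                           const int   * __restrict__ offsets,   // numArrays + 1 entries
                           float       * __restrict__ outdata,
                           const int numArrays)
{
    // --- One warp per (variable-length) array.
    typedef cub::WarpReduce<float, WARPSIZE> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage[BLOCKSIZE / WARPSIZE];

    const int global_warp = (blockIdx.x * blockDim.x + threadIdx.x) / WARPSIZE;
    const int local_warp  = threadIdx.x / WARPSIZE;
    const int lane        = threadIdx.x % WARPSIZE;

    if (global_warp >= numArrays) return;     // whole warps exit together

    const int begin = offsets[global_warp];
    const int len   = offsets[global_warp + 1] - begin;

    // --- Each thread accumulates a private partial sum over its strided slice.
    float partial = 0.0f;
    for (int i = lane; i < len; i += WARPSIZE)
        partial += indata[begin + i];

    // --- CUB reduces the 32 partial sums of this warp; lane 0 gets the result.
    const float total = WarpReduce(temp_storage[local_warp]).Sum(partial);

    if (lane == 0) outdata[global_warp] = total;
}

// Launched with one warp per array, e.g.
// sum_varlen<<<iDivUp(numArrays * WARPSIZE, BLOCKSIZE), BLOCKSIZE>>>(d_data, d_offsets, d_result, numArrays);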

Source: https://habr.com/ru/post/1723516/
