Effectively find the minimum of a large array using Opencl

I am working on implementing a hierarchical clustering algorithm in opencl. For each step, I find the minimum value in a very large array (approximately 10 ^ 8 entries), so that I know which elements need to be merged into a new cluster. Identification of the minimum value must be performed 9999 times. With my current kernels, it takes about 200 seconds to find the minimum value (accumulated over all iterations). As I approached the problem, dividing the array into 2560 identical fragments (on my Radeon 7970 there are 2560 stream processors) and find the minimum of each fragment separately. I am launching a second core that combines these lows into a global minimum.

Is this an even more effective way to approach this problem? The initial idea was to speed up HCA using OpenCL, but with the amount of time it takes to identify the minimum value, much more than the HCA Matlab on the CPU. What am I doing wrong?

__kernel void findMinValue(__global float * myArray, __global double * mins, __global int * elementsToWorkOn, __global int * arraysize){
int gid = get_global_id(0);
int minloc = 0;
float mymin = INFINITY;
int eltoWorkOn = *elementsToWorkOn;
int offset = gid*eltoWorkOn;
int target = offset + eltoWorkOn;

if (offset<*arraysize){
    //make sure the array size is not exceeded
    if (target > *arraysize){
        target = *arraysize;
    }

    //find minimum for the kernel
    for (int i = offset; i < target; i++){
        if (*(myArray + i) < mymin){
            mymin = *(myArray + i);
            minloc = i;
        }
    }
}
*(mins + gid * 2) = minloc;
*(mins + gid * 2 + 1) = mymin;
}


__kernel void getGlobalMin(__global double * mins, __global double * gmin, __global int * pixelsInImage){
    int nWorkitems = 2560;
    float globalMin = INFINITY;
    double globalMinLoc;
    float tempMin;
    for (int i = 0; i < nWorkitems; i++){
        tempMin = *(mins + 2 * i + 1);
        if (tempMin < globalMin){
            globalMin = tempMin;
            globalMinLoc = *(mins + 2 * i);
        }
    }
    *(gmin + 0) = globalMinLoc;
    *(gmin + 1) = globalMin;
}

UPDATE

I redesigned the findMinValue core based on your suggestions. Access to memory is now coalescent, and I divided the work into working groups so that I could reduce the amount of access to global memory. Previously, each core wrote a minimum value to the global mins buffer. Now, only one core per worg group writes one value (i.e. the minimum size of the group). In addition, I increased the global size of the work to hide the memory latency.

, 200 59 ! !

- , ? ? , setArg(). int (: err = clSetKernelArg(kernel[2], 3, sizeof(int), &variable);). wawl ?

:

__kernel void findMinValue(__global float * myArray, __global double * mins, __global int * arraysize,__global int * elToWorkOn,__global int * dummy){
int gid = get_global_id(0);
int lid = get_local_id(0);
int groupID = get_group_id(0);
int lsize = get_local_size(0);
int gsize = get_global_id(0);
int minloc = 0;
int arrSize = *arraysize;
int elPerGroup = *elToWorkOn;
float mymin = INFINITY;


__local float lmins[128];
//initialize local memory
*(lmins + lid) = INFINITY;
__local int lminlocs[128];

//this private value will reduce global memory access in the for loop (temp = *(myArray + i);)
float temp;

//ofset and target of the for loop
int offset = elPerGroup*groupID + lid;
int target = elPerGroup*(groupID + 1);

//prevent that target<arrsize (may happen due to rounding errors or arrSize not a multiple of elPerGroup
target = min(arrSize, target);

//find minimum for the kernel
//offset is different for each lid, leading to sequential memory access
if (offset < arrSize){
    for (int i = offset; i < target; i += lsize){
        temp = *(myArray + i);
        if (temp < mymin){
            mymin = temp;
            minloc = i;
        }
    }

    //store kernel minimum in local memory
    *(lminlocs + lid) = minloc;
    *(lmins + lid) = mymin;

    //find work group minimum (reduce global memory accesses)
    lsize = lsize >> 1;
    while (lsize > 0){
        if (lid < lsize){
            if (*(lmins + lid)> *(lmins + lid + lsize)){
                *(lmins + lid) = *(lmins + lid + lsize);
                *(lminlocs + lid) = *(lminlocs + lid + lsize);
            }
        }
        lsize = lsize >> 1;
    }
}
//write group minimum to global buffer
if (lid == 0){
    *(mins + groupID * 2 + 0) = *(lminlocs + 0);
    *(mins + groupID * 2 + 1) = *(lmins + 0);
}
}
+4
3

, WI. , , . setArg() ints, . , :

__kernel void findMinValue(__global float * myArray, __global double * mins, __global int arraysize){
    int gid = get_global_id(0);
    int minloc = 0;
    float mymin = INFINITY;

    //find minimum for the kernel
    for (int i = gid ; i < arraysize; i+= get_global_size(0)){
        if (*(myArray + i) < mymin){
            mymin = *(myArray + i);
            minloc = i;
        }
    }

    *(mins + gid * 2) = minloc;
    *(mins + gid * 2 + 1) = mymin;
}
+1

, ZERO . , , .

+1

4%. . .

, (+ ) . , , . , 22500*22500, 22500, 506250000. , , , mimima 200 ( ) 59 () 8 s.

, - : -)

0

Source: https://habr.com/ru/post/1544928/


All Articles