I am working on implementing a hierarchical clustering algorithm in opencl. For each step, I find the minimum value in a very large array (approximately 10 ^ 8 entries), so that I know which elements need to be merged into a new cluster. Identification of the minimum value must be performed 9999 times. With my current kernels, it takes about 200 seconds to find the minimum value (accumulated over all iterations). As I approached the problem, dividing the array into 2560 identical fragments (on my Radeon 7970 there are 2560 stream processors) and find the minimum of each fragment separately. I am launching a second core that combines these lows into a global minimum.
Is this an even more effective way to approach this problem? The initial idea was to speed up HCA using OpenCL, but with the amount of time it takes to identify the minimum value, much more than the HCA Matlab on the CPU. What am I doing wrong?
__kernel void findMinValue(__global float * myArray, __global double * mins, __global int * elementsToWorkOn, __global int * arraysize){
int gid = get_global_id(0);
int minloc = 0;
float mymin = INFINITY;
int eltoWorkOn = *elementsToWorkOn;
int offset = gid*eltoWorkOn;
int target = offset + eltoWorkOn;
if (offset<*arraysize){
if (target > *arraysize){
target = *arraysize;
}
for (int i = offset; i < target; i++){
if (*(myArray + i) < mymin){
mymin = *(myArray + i);
minloc = i;
}
}
}
*(mins + gid * 2) = minloc;
*(mins + gid * 2 + 1) = mymin;
}
__kernel void getGlobalMin(__global double * mins, __global double * gmin, __global int * pixelsInImage){
int nWorkitems = 2560;
float globalMin = INFINITY;
double globalMinLoc;
float tempMin;
for (int i = 0; i < nWorkitems; i++){
tempMin = *(mins + 2 * i + 1);
if (tempMin < globalMin){
globalMin = tempMin;
globalMinLoc = *(mins + 2 * i);
}
}
*(gmin + 0) = globalMinLoc;
*(gmin + 1) = globalMin;
}
UPDATE
I redesigned the findMinValue core based on your suggestions. Access to memory is now coalescent, and I divided the work into working groups so that I could reduce the amount of access to global memory. Previously, each core wrote a minimum value to the global mins buffer. Now, only one core per worg group writes one value (i.e. the minimum size of the group). In addition, I increased the global size of the work to hide the memory latency.
, 200 59 ! !
- , ? ? , setArg(). int (: err = clSetKernelArg(kernel[2], 3, sizeof(int), &variable);). wawl ?
:
__kernel void findMinValue(__global float * myArray, __global double * mins, __global int * arraysize,__global int * elToWorkOn,__global int * dummy){
int gid = get_global_id(0);
int lid = get_local_id(0);
int groupID = get_group_id(0);
int lsize = get_local_size(0);
int gsize = get_global_id(0);
int minloc = 0;
int arrSize = *arraysize;
int elPerGroup = *elToWorkOn;
float mymin = INFINITY;
__local float lmins[128];
*(lmins + lid) = INFINITY;
__local int lminlocs[128];
float temp;
int offset = elPerGroup*groupID + lid;
int target = elPerGroup*(groupID + 1);
target = min(arrSize, target);
if (offset < arrSize){
for (int i = offset; i < target; i += lsize){
temp = *(myArray + i);
if (temp < mymin){
mymin = temp;
minloc = i;
}
}
*(lminlocs + lid) = minloc;
*(lmins + lid) = mymin;
lsize = lsize >> 1;
while (lsize > 0){
if (lid < lsize){
if (*(lmins + lid)> *(lmins + lid + lsize)){
*(lmins + lid) = *(lmins + lid + lsize);
*(lminlocs + lid) = *(lminlocs + lid + lsize);
}
}
lsize = lsize >> 1;
}
}
if (lid == 0){
*(mins + groupID * 2 + 0) = *(lminlocs + 0);
*(mins + groupID * 2 + 1) = *(lmins + 0);
}
}