Improving the OpenCL kernel for a perceptron neural network

I have done a lot of OpenGL and shaders before, and now I decided to try OpenCL. I watched some online tutorials and started reading books on this subject. In order to better understand, and because I believe that the best way to learn is to reasonably try and learn from the problems that have arisen, I decided to start introducing a kernel for a fully connected perceptron.

For those who do not know what it is, I will explain the basic idea. This is a neural network in which each neuron of a layer is connected to each neuron of the next layer. Each neuron has only one action to perform: performing the sum of all neurons of the previous layer, weighted by a different value for each neuron.

It seemed simple enough to implement, and after reading the article “Parallel training of a neural network using OpenCL”, I implemented it as follows.

  • Each level depends on the previous one, they are launched sequentially by the host

  • To calculate the level, I launch my kernel with the global work size of the number of neurons inside the layer (which can be quite huge, for example, tens of thousands). This makes all neurons carry out their sum independently of each other.

  • Each neuron (identified by its global_work_id) performs a weighted sum with all neurons of the previous layer.

Here is my fully functional OpenCL kernel:

/**
* @brief Computes one layer of the perceptron given the previous one and the
* weights
* The kernel is run once for each layer.
* The work items are each tasked with computing the output of a single neuron
* of the out layer.
*
* @param out_layer_size
*   Size of the output layer (number of elements in the output array that will
*   contain the result for each neuron).
* @param in_layer_size
*   Number of elements of the input layer
* @param in_value
*   Values of the neuron in the previous layer
* @param in_weights
*   Array containing the weights for each input neuron. It is organised as a
*   two dimensional matrix, written by concatenating each line in the array
*   [ w11, w12, w13, ...
*     w21, w22, w23, ...
*     ..., ..., ..., ...
*   ]
*   Where wij is the weight linking the neuron i of the input layer to the
*   neuron j of the output layer
* @param out_values
*   Computed values for the current layer
*/
void kernel perceptron(global const int* in_layer_size, global const int* out_layer_size, global const float *in_value, global const float* in_weights, global float* out_values)
{
    // One work item per output neuron: the global id selects the neuron
    // whose weighted sum over the whole input layer is computed here.
    const int global_id = get_global_id(0);
    const int out_layer_s = *out_layer_size;
    const int in_layer_s = *in_layer_size;

    // Weights are stored row-major by input neuron:
    //   w(i, j) = in_weights[i*out_layer_s + j]
    // so at a given i, consecutive work items read consecutive addresses
    // (coalesced global-memory access).
    // Use a float literal: a bare `0.` is a double, which strictly requires
    // fp64 support in OpenCL C.
    float sum = 0.0f;
    for(int i=0; i < in_layer_s; i++) {
        sum += in_weights[i*out_layer_s+global_id] * in_value[i];
    }
    // Activation is intentionally left out for now; plug it in here:
    //out_values[global_id] = sigma(sum);
    out_values[global_id] = sum;
}

And this is how I call it:

queue.enqueueNDRangeKernel(kernel, cl::NullRange,cl::NDRange(number of neurons within layer),cl::NullRange);

It works fine, but I would like to know whether this design is sound, and how its performance can be improved.

I benchmarked it against a single-threaded CPU implementation on networks of various sizes.

On my laptop GPU (an Nvidia GTX 660M), the results vary a lot with the layer sizes. Two examples:

  • Layers of 2500, 10 000 and 2500 neurons: one pass takes 0.018s, i.e. ~60FPS — about 4-5 times faster than the CPU version (Intel Core i7 at 2.40 GHz)

  • Layers of 100 000, 100 000 and 500 neurons: a pass takes on the order of 140s → the driver eventually kills the kernel, presumably because every work item has to loop over all 100 000 input neurons. Something clearly does not scale.

+4
3

The core problem is that every work item (WI) reads the whole input layer directly from global memory, so the same values are fetched from global memory over and over again. That is what hurts performance most on a GPU.

Instead, let the work items of a work group (WG) cooperate: stage a block of the input in local memory once per group, synchronize, have every WI process the cached block, then move on to the next block until the whole input has been consumed. Roughly like this:

// NOTE(review): illustrative sketch only — the per-block processing step is
// elided, and the code does not compile as written (scalar kernel arguments
// must not carry the __global qualifier). It demonstrates the local-memory
// buffering pattern: each work group cooperatively stages SOME_SIZE input
// values in local memory, synchronizes, processes them, then advances to the
// next chunk.
void kernel Kernel(
__global const int  in_layer_size, 
__global const int  out_layer_size, 
__global const float    *in_value, 
__global const float    *in_weights, 
__global float      *out_values){

// Shared staging buffer for one chunk of the input layer (one per work group).
__local float buffer[SOME_SIZE];
__global const float* p_in  = in_value;
__global float* p_out = out_values;

const int 
    global_id   = get_global_id(0),
    local_id    = get_local_id(0),
    num_buffers = in_layer_size / SOME_SIZE,  // full chunks only; the tail is left to the reader
    offset      = out_layer_size * global_id;

float sum = 0.0f;
for(int i=0; i < num_buffers; i++){
    // Each work item copies one element of the current chunk into local
    // memory, then waits until the whole chunk is staged before reading it.
    buffer[local_id] = p_in[local_id];
    barrier(CLK_LOCAL_MEM_FENCE);

    //Process all data inside buffer by every WI in WG
    //...

    // Advance to the next chunk of the input.
    p_in += SOME_SIZE;
    out_values += SOME_SIZE;  // NOTE(review): advancing out_values here looks wrong — confirm intent
    }

//...
return;

}

This is only a sketch: the per-block processing is left out, and the tail of the input (when its size is not a multiple of SOME_SIZE) still has to be handled. But it shows the idea.

+1

Thanks, that was the missing piece. Following this advice (and passing the layer sizes by value rather than through buffers), I rewrote the kernel as shown below.

For reference, the host-side change for scalar arguments:

The sizes no longer need to live in global memory. OpenCL allows scalar kernel arguments to be passed by value, which saves a global read per work item: declare the parameter as a plain `const int`, then on the host do `cl_int size = the_size; clSetKernelArg(kernel, arg_index, sizeof(cl_int), &size);`.

#define IN_LOCAL_SIZE 4096 //Because 16KB/4B (for each float)

/**
 * @brief Computes one perceptron layer, staging the input layer through local
 * memory in chunks of IN_LOCAL_SIZE floats, so the input values are read from
 * global memory once per work group instead of once per work item.
 *
 * The layer sizes are passed by value (set on the host with clSetKernelArg
 * and a cl_int) — note the missing address-space qualifier: scalar kernel
 * arguments must not be declared `global`.
 *
 * @param in_layer_size   Number of neurons in the input layer
 * @param out_layer_size  Number of neurons in the output layer
 * @param in_value        Values of the input-layer neurons
 * @param in_weights      Row-major weight matrix: w(i,j) = in_weights[i*out_layer_size + j]
 * @param out_values      Output: one computed value per output neuron
 */
void kernel perceptron(const int in_layer_size, const int out_layer_size, global const float *in_value, global const float* in_weights, global float* out_values)
{
    const int global_id = get_global_id(0);
    __local float in_buffer[IN_LOCAL_SIZE];

    float sum = 0.0f;
    event_t ev;
    int j;

    // Process every full IN_LOCAL_SIZE chunk of the input layer.
    // (The original loop ran to count-1 and incremented the wrong variable,
    // which skipped one full chunk and never advanced the loop counter.)
    const int num_full = in_layer_size / IN_LOCAL_SIZE;
    for(j=0; j < num_full; j++) {
        // All work items of the group issue the same copy; the last argument
        // must be 0 (no event to chain on), not an uninitialized event_t.
        ev = async_work_group_copy(in_buffer, in_value+j*IN_LOCAL_SIZE, IN_LOCAL_SIZE, 0);
        wait_group_events(1,&ev);
        for(int i=0; i < IN_LOCAL_SIZE; i++) {
            sum += in_weights[(i+j*IN_LOCAL_SIZE)*out_layer_size+global_id] * in_buffer[i];
        }
        // Every work item must be done reading in_buffer before the next
        // iteration's copy overwrites it.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // Remaining partial chunk, when the input size is not a multiple of
    // IN_LOCAL_SIZE.
    const int rest = in_layer_size % IN_LOCAL_SIZE;
    if(rest > 0) {
        ev = async_work_group_copy(in_buffer, in_value+j*IN_LOCAL_SIZE, rest, 0);
        wait_group_events(1,&ev);
        for(int i=0; i < rest; i++) {
            sum += in_weights[(i+j*IN_LOCAL_SIZE)*out_layer_size+global_id] * in_buffer[i];
        }
    }
    out_values[global_id] = sum;
}

With this version and layer sizes of (100k, 250k, 500), a full pass now takes on the order of 500 ms, and the driver no longer aborts the kernel. This is a huge improvement.

There is certainly still room for optimization, but this already makes the network usable at the sizes I need.

Posting the result here in case it helps someone else.

+1

The reason the naive kernel collapses on large layers is that in_values is re-read from global memory by every single work item — each input value is fetched once per output neuron.

The kernel below caches blocks of in_values in local memory, makes each work group responsible for a stripe of out_values, and reduces the per-item partial sums inside the group, so every input value is read from global memory only once per work group.

In pseudocode:

  output elements assumed to be set to 0 already
  for each block of input values:
    cache the input block
    for each target output value:
      reset local sum to 0
      for each element this work item is responsible for:
        read the weight, multiply, and add to sum
      reduce sums to a single value, ADD value to output element

I have not yet had the opportunity to run this through a profiler or debugger, but I will try when I get back to my home computer (no OpenCL tools on my office workstation). Make sure you enqueue the kernel with a local work size equal to the GROUP_SIZE constant, and create only one work group per compute unit of your device.

The code:

//experiment with GROUP_SIZE to discover the optimal value for your device
//this needs to be equal to local_work_size passed into clEnqueueNDRangeKernel
//use a multiple of CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
//max. for most devices is 256
//NOTE: object-like macro — no '=' and no ';', otherwise array sizes below break
#define GROUP_SIZE 64

// IN_VALUE_CACHE_SIZE is the number of floats from in_value to copy to local memory at a time
//assuming GROUP_SIZE can be up to 256, sizeof(float)=4,  and local memory size is 32kb, full saturation can be achieved with the following:
//(32768 - (256 * 4)) /4 = 7936
//try another multiple of 1024 (6144, 4096... )if there is trouble with this value
#define IN_VALUE_CACHE_SIZE 7936

/**
 * @brief Computes one perceptron layer. Blocks of in_value are staged in
 * local memory once per work group; work groups stripe over the output
 * neurons, and the work items of a group accumulate partial sums that are
 * then reduced inside the group.
 *
 * Precondition: out_values must be zero-initialized by the host, because each
 * input block contributes with `+=`.
 * Enqueue with local_work_size == GROUP_SIZE and one work group per compute
 * unit.
 *
 * @param in_layer_size   Number of neurons in the input layer (single int in a buffer)
 * @param out_layer_size  Number of neurons in the output layer (single int in a buffer)
 * @param in_value        Values of the input-layer neurons
 * @param in_weights      Row-major weight matrix: w(i,j) = in_weights[i*out_layer_size + j]
 * @param out_values      Accumulated output, one value per output neuron
 */
void kernel perceptron(global const int* in_layer_size, global const int* out_layer_size, global const float *in_value, global const float* in_weights, global float* out_values)
{
    const int out_layer_s = *out_layer_size;
    const int in_layer_s = *in_layer_size;

    const int item_id = get_local_id(0);
    const int group_id = get_group_id(0);
    const int group_count = get_num_groups(0);

    local float result_buffer[GROUP_SIZE];
    local float in_value_cache[IN_VALUE_CACHE_SIZE];

    float sum;
    event_t e;
    int i, j, k;

    // i walks the input layer in blocks of IN_VALUE_CACHE_SIZE *elements*
    // (it is already an element index — do not scale it again below).
    for(i = 0; i < in_layer_s; i += IN_VALUE_CACHE_SIZE){
        //cap the number of values to copy to local memory if loop is near the end of the input data
        int copy_total = IN_VALUE_CACHE_SIZE;
        if(i + IN_VALUE_CACHE_SIZE > in_layer_s){
            copy_total = in_layer_s - i;
        }
        //copy the next block of values (element offset i — the original
        //`in_value + i * 4` mixed up byte and element offsets)
        e = async_work_group_copy(in_value_cache, in_value + i, copy_total, 0);
        wait_group_events(1, &e);
        //no zero-fill of the cache is needed: the k-loop below never reads
        //past copy_total

        //work groups stripe over the output neurons
        for(j = group_id; j < out_layer_s; j += group_count){
            //each work item accumulates a partial sum over its slice of the
            //cached block; items with item_id >= copy_total contribute 0
            sum = 0.0f;
            for(k = item_id; k < copy_total; k += GROUP_SIZE){
                //row-major weights, consistent with the layout documented in
                //the question: w(in, out) = in_weights[in*out_layer_s + out]
                sum += in_value_cache[k] * in_weights[(k + i) * out_layer_s + j];
            }
            result_buffer[item_id] = sum;
            //all partial sums must be visible before item 0 reduces them
            barrier(CLK_LOCAL_MEM_FENCE);

            //simple O(n) reduction can be optimized further
            if(item_id == 0){
                for(k = 1; k < GROUP_SIZE; k++){
                    sum += result_buffer[k];
                }
                out_values[j] += sum;
            }
            //keep result_buffer stable until the reduction has finished
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        //everyone must be done reading in_value_cache before the next copy
        //overwrites it
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

This will handle input of any size, so you can try it with as many elements as you have global memory.

+1
source

Source: https://habr.com/ru/post/1525968/


All Articles