I am learning how to optimize code for GPUs. I read about the importance of memory locality, and I have also seen some tutorials and examples of GPU convolution. Based on this, I wrote and tested several kernels of my own. Surprisingly, I found that the simplest naive kernel is the fastest!? And it is less than 10x faster than the CPU. (Yes, I amortized the upload/download time by running the kernel 64x.)
What am I doing wrong? I would expect convolution to be exactly the kind of operation GPUs are optimized for. If I can get ~100x speedup on matrix multiplication, why is convolution so slow?
Performance [CPU ticks / pixel] (lower is better):
- CPU-naive: 9.5
- GPU-naive: 1.64
- GPU-local: 2.56
- GPU-local_async: 15.10
- GPU-scanline-private: 7.35
- GPU-scanline_async: 15.37
EDIT: GPU-scanline_async I added later, after reading tips about async_work_group_copy.
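For reference, these are the builtins involved (specialized here for float); the copy must be encountered by all work-items of a group with identical arguments, and completion is awaited per group:

event_t async_work_group_copy( __local float *dst,        // destination in local memory
                               const __global float *src, // source in global memory
                               size_t num_elements,
                               event_t event );           // 0, or an event to chain onto
void    wait_group_events( int num_events, event_t *event_list );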
I am interested in these questions:
- Is the kernel speed limited by memory bandwidth or by processing power? From what I read, I would expect memory, but the test results show the opposite (see the back-of-envelope estimate below).
- The GPU-local kernel is slower than GPU-naive, even though it does far fewer global memory reads. Why?
- Why are the kernels using async_work_group_copy more than 2x slower than the others?
- Is it possible to get something like the 100x speedup over the CPU here that the GPU gives for matrix multiplication? How?
- Why is GPU-scanline-private slower than GPU-naive? (It does only 3 global reads per 9-point stencil.) Is it the ifs/branching?
Hardware: Intel Core i7-6700HQ (Skylake) CPU and an nVidia GeForce 960M GPU, running the kernels 64x/frame on a 256x256 float image. The full code of the kernels is below.
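A rough back-of-envelope check of the bandwidth question (my estimate, assuming the tick counts correspond to a ~2.6 GHz clock, and taking ~80 GB/s peak memory bandwidth and ~1.4 TFLOP/s peak FP32 for the 960M): GPU-naive at 1.64 ticks/pixel is about 0.63 ns/pixel, i.e. roughly 41 µs per 256x256 frame. If all 9 reads plus the write went to DRAM, that would be 65536 * 10 * 4 B / 41 µs ≈ 64 GB/s, close to the 80 GB/s peak; with perfect caching (1 read + 1 write per pixel) it is only ~13 GB/s. The arithmetic is ~9 flops/pixel ≈ 14 GFLOP/s, nowhere near the compute peak.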
=========== Kernels ===========
kernel GPU-naive, 2D: global = (256,256), local = (16,16)
__kernel void blur2D_naive(
    __global float* I,
    __global float* O
){
    const int ix = get_global_id (0)+1;    // +1: skip the 1-pixel border
    const int iy = get_global_id (1)+1;
    const int nx = get_global_size(0)+2;   // row stride of the padded image
    int i = iy * nx + ix;
    O[i] = ( I[i-nx-1] + I[i-nx] + I[i-nx+1] +
             I[i   -1] + I[i   ] + I[i   +1] +
             I[i+nx-1] + I[i+nx] + I[i+nx+1] ) * 0.11111111111f;  // * 1/9
}
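For reference, the dispatch geometry above corresponds to a host-side call roughly like this (a sketch; queue, kernel, bufI, bufO stand for objects created earlier, error checking omitted):

#include <CL/cl.h>
// enqueue blur2D_naive 64x with global=(256,256), local=(16,16)
size_t global[2] = {256, 256};   // one work-item per interior pixel
size_t local [2] = { 16,  16};   // 16x16 work-groups
clSetKernelArg( kernel, 0, sizeof(cl_mem), &bufI );
clSetKernelArg( kernel, 1, sizeof(cl_mem), &bufO );
for( int i = 0; i < 64; i++ )    // 64x to amortize transfer overhead
    clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global, local, 0, NULL, NULL );
clFinish( queue );               // wait before stopping the timer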
kernel GPU-local, 2D: global = (256,256), local = (16,16)
#define NBx 18  // tile width  including the 1-pixel halo
#define NBy 18  // tile height including the 1-pixel halo
__kernel void blur2D_local(
    __global float* I,
    __global float* O
){
    __local float L[NBx*NBy];
    const int2 iG = (int2)(get_global_id  (0)+1 , get_global_id  (1)+1 );
    const int2 nG = (int2)(get_global_size(0)+2 , get_global_size(1)+2 );
    const int2 iL = (int2)(get_local_id   (0)+1 , get_local_id   (1)+1 );
    const int2 nL = (int2)(get_local_size (0)+2 , get_local_size (1)+2 );
    // rows 0..4 of the work-group cooperatively load the halo
    switch( get_local_id(1) ){
        case 4:
            // the four corners of the tile
            switch( get_local_id(0) ){
                case 0: L[ 0           ] = I[ nG.x* get_group_id(1)*get_local_size(1)          + get_group_id(0)*get_local_size(0)           ]; break;
                case 1: L[ NBx-1       ] = I[ nG.x* get_group_id(1)*get_local_size(1)          + get_group_id(0)*get_local_size(0)+(NBx-1)   ]; break;
                case 2: L[ (NBy-1)*NBx ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+(NBy-1)) + get_group_id(0)*get_local_size(0)           ]; break;
                case 3: L[ NBy* NBx-1  ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+(NBy-1)) + get_group_id(0)*get_local_size(0)+(NBx-1)   ]; break;
            }
            break;  // without this break, case 4 falls through into case 0
        case 0: L[ iL.x             ] = I[ nG.x* get_group_id(1)*get_local_size(1)                  + iG.x                                      ]; break; // top edge
        case 1: L[ NBx*(NBy-1)+iL.x ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+(NBy-1))         + iG.x                                      ]; break; // bottom edge
        case 2: L[ NBx*iL.x         ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+get_local_id(0)) + get_group_id(0)*get_local_size(0)         ]; break; // left edge
        case 3: L[ NBx*iL.x+(NBx-1) ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+get_local_id(0)) + get_group_id(0)*get_local_size(0)+(NBx-1) ]; break; // right edge
    }
    int ig = iG.y*nG.x + iG.x;
    int il = iL.y*nL.x + iL.x;
    L[il] = I[ig];                   // every work-item loads its own interior pixel
    barrier(CLK_LOCAL_MEM_FENCE);    // wait until the whole tile is in local memory
    const float renorm = 1.0f/9.0f;
    O[ig] = ( L[il-NBx-1] + L[il-NBx] + L[il-NBx+1] +
              L[il    -1] + L[il    ] + L[il    +1] +
              L[il+NBx-1] + L[il+NBx] + L[il+NBx+1] ) * renorm;
}
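As an aside, the halo can also be filled without any branching: each work-item copies a strided share of the whole 18x18 tile in a small loop. A minimal sketch of that variant (blur2D_local_loop is my naming, untested; same dispatch and NBx/NBy as blur2D_local):

__kernel void blur2D_local_loop(
    __global const float* I,
    __global float* O
){
    __local float L[NBy*NBx];                            // NBx = NBy = 18, as above
    const int nxg = get_global_size(0)+2;                // padded row stride
    const int x0  = get_group_id(0)*get_local_size(0);   // tile origin (padded coords)
    const int y0  = get_group_id(1)*get_local_size(1);
    const int nWI = get_local_size(0)*get_local_size(1); // 256 work-items per group
    // every work-item loads a strided share of the tile, halo included
    for( int i = get_local_id(1)*get_local_size(0)+get_local_id(0); i < NBx*NBy; i += nWI ){
        L[i] = I[ (y0 + i/NBx)*nxg + x0 + i%NBx ];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    const int il = (get_local_id (1)+1)*NBx + get_local_id (0)+1;
    const int ig = (get_global_id(1)+1)*nxg + get_global_id(0)+1;
    O[ig] = ( L[il-NBx-1] + L[il-NBx] + L[il-NBx+1] +
              L[il    -1] + L[il    ] + L[il    +1] +
              L[il+NBx-1] + L[il+NBx] + L[il+NBx+1] ) * (1.0f/9.0f);
}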
kernel GPU-local_async, 2D: global = (256,16), local = (16,16)
#define nTiles 16
#define NBx 18
#define NBy 18
// async-copy one NBx*NBy tile, row by row, from global I to local L
#define copy_tile(event,ig0,I,L) { int ig_=ig0; int il_=0; for(int i=0; i<NBy; i++){ event = async_work_group_copy( L+il_, I+ig_, NBx, event ); ig_+=nx; il_+=NBx; } }
__kernel void blur2D_local_async(
    __global float* I,
    __global float* O
){
    const int nx = get_global_size(0)+2;   // row stride of the padded image
    __local float LI[NBx*NBy*2];           // two tiles for double buffering
    int iL0 = 0;                           // offset of the tile being processed
    int iL1 = NBx*NBy;                     // offset of the tile being prefetched
    event_t event = 0;
    int ig0 = get_group_id(0)*get_local_size(0);
    copy_tile(event,ig0,I,LI);             // prefetch the first tile
    for( int it=0; it<nTiles; it++ ){      // each work-group walks down a column of tiles
        int ig = ig0 + (get_local_id(1)+1)*nx + get_local_id(0)+1;
        int il = (get_local_id(1)+1)*NBx + get_local_id(0) + iL0;
        ig0 += get_local_size(1)*nx;
        event_t event_ = 0;
        copy_tile(event_,ig0,I,LI+iL1);    // start prefetching the next tile
        wait_group_events(1, &event);      // wait for the current tile
        O[ig] = ( LI[il-NBx] + LI[il-NBx+1] + LI[il-NBx+2] +
                  LI[il    ] + LI[il    +1] + LI[il    +2] +
                  LI[il+NBx] + LI[il+NBx+1] + LI[il+NBx+2] ) * 0.11111111111f;
        int iLtmp=iL0; iL0=iL1; iL1=iLtmp; // swap the two tile buffers
        event = event_;
    }
}
kernel GPU-scanline_private, 1D: global = (256), local = (32)
__kernel void blur2D_scanline_priv(
    int nx, int ny,
    __global float* I,
    __global float* O
){
    int ig = get_global_id(0)+1;                                // column in the padded image
    float3 Lm = (float3)( I[ig-1], I[ig], I[ig+1] );  ig += nx; // row above
    float3 L0 = (float3)( I[ig-1], I[ig], I[ig+1] );            // current row
    for(int iy=1; iy<(ny-1); iy++ ){
        ig += nx;
        float3 Lp = (float3)( I[ig-1], I[ig], I[ig+1] );        // row below
        O[ig-nx] =
            ( Lm.x + Lm.y + Lm.z +
              L0.x + L0.y + L0.z +
              Lp.x + Lp.y + Lp.z ) * 0.11111111111f;
        Lm=L0; L0=Lp;   // rotate the three-row window held in registers
    }
}
kernel GPU-scanline_async, 1D: global = (256), local = (32)
#define NB 34
__kernel void blur2D_scanline_async(
    int nx, int ny,
    __global float* I,
    __global float* O
){
    __local float L[NB*4];   // four scanline buffers, rotated as a ring
    int i0=0;
    int i1=NB;
    int i2=NB*2;
    int i3=NB*3;
    event_t event = 0;
    int ig0 = get_group_id(0)*get_local_size(0);
    // prefetch the first three scanlines
    event = async_work_group_copy( L     , I+ig0, NB, event ); ig0 += nx;
    event = async_work_group_copy( L+NB  , I+ig0, NB, event ); ig0 += nx;
    event = async_work_group_copy( L+NB*2, I+ig0, NB, event ); ig0 += nx;
    const int il = get_local_id(0);
    int ig = get_global_id(0)+1;
    for(int iy=1; iy<(ny-2); iy++ ){
        wait_group_events(1, &event);
        event = async_work_group_copy( L+i3, I+ig0, NB, event ); ig0 += nx; // prefetch the next line
        ig += nx;
        O[ig] =
            ( L[i0+il] + L[i0+il+1] + L[i0+il+2] +
              L[i1+il] + L[i1+il+1] + L[i1+il+2] +
              L[i2+il] + L[i2+il+1] + L[i2+il+2] ) * 0.11111111111f;
        int itmp=i0; i0=i1; i1=i2; i2=i3; i3=itmp;   // rotate the ring of buffers
    }
}
kernel CPU-naive (plain C reference):
void blur(int nx, int ny, float * I, float * O ){
float renorm = 1.0/9.0;
for(int iy=1;iy<ny-1;iy++){ for(int ix=1;ix<nx-1;ix++){
int i = iy*nx+ix;
O[i] =( I[i-nx-1] + I[i-nx] + I[i-nx+1] +
I[i -1] + I[i ] + I[i +1] +
I[i+nx-1] + I[i+nx] + I[i+nx+1] ) * renorm;
} }
}
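For completeness, a minimal sketch of how a "CPU ticks / pixel" number like the ones above could be measured (a hypothetical harness, assuming ticks are read with __rdtsc and the 256x256 image carries a 1-pixel border):

#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>                            // __rdtsc()

void blur(int nx, int ny, float * I, float * O ); // the CPU kernel above

int main(void){
    const int nx = 258, ny = 258, reps = 64;      // 256x256 interior + border
    float *I = (float*)calloc( (size_t)nx*ny, sizeof(float) );
    float *O = (float*)calloc( (size_t)nx*ny, sizeof(float) );
    unsigned long long t0 = __rdtsc();
    for(int r=0; r<reps; r++) blur( nx, ny, I, O );
    unsigned long long t1 = __rdtsc();
    printf( "%.2f ticks/pixel\n", (double)(t1-t0)/((double)reps*256*256) );
    free(I); free(O);
    return 0;
}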