OpenCL double precision differs from CPU double precision

I program in OpenCL using a GeForce GT 610 on Linux. The double-precision results from the CPU and from the GPU do not match. I can post some of the code here, but first I would like to know whether anyone else has come across this. The difference between the double-precision results of the GPU and of the CPU becomes apparent when I run loops with many iterations. There is nothing special about the code, but I can post it here if anyone is interested. Thank you very much. Here is my code; sorry for the poor formatting, as I am new here. As you can see, I have two loops, and my CPU code is essentially identical.

#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double precision floating point not supported by OpenCL implementation."
#endif

__kernel void simpar(__global double* fp, __global double* fp1, __global double* fp3,
                     __global double* fp5, __global double* fp6, __global double* fp7,
                     __global double* fp8, __global double* fp8Plus,
                     __global double* x, __global double* v, __global double* acc,
                     __global double* keBuf, __global double* peBuf,
                     unsigned int prntstps, unsigned int nprntstps, double dt)
{
    unsigned int m, i, j, k, l, t;
    unsigned int chainlngth = 100;
    double dxi, twodxi, dxipl1, dximn1, fac, fac1, fac2, fac13, fac23;
    double ke, pe, tke, tpe, te, dx;
    double hdt, hdt2;
    double alpha = 0.16;
    double beta = 0.7;
    double cmass;
    double peTemp;
    nprntstps = 1001;
    dt = 0.01;
    prntstps = 100;
    double alphaby4 = beta / 4.0;
    hdt = 0.5 * dt;
    hdt2 = dt * 0.5 * dt;
    double Xlocal, Vlocal, Acclocal;
    unsigned int global_id = get_global_id(0);

    if (global_id < chainlngth) {
        Xlocal = x[global_id];
        Vlocal = v[global_id];
        Acclocal = acc[global_id];

        for (m = 0; m < nprntstps; m++) {
            for (l = 0; l < prntstps; l++) {
                Xlocal = Xlocal + dt * Vlocal + hdt2 * Acclocal;
                x[global_id] = Xlocal;
                barrier(CLK_LOCAL_MEM_FENCE);

                Vlocal = Vlocal + hdt * Acclocal;
                barrier(CLK_LOCAL_MEM_FENCE);

                j = global_id - 1;
                k = global_id + 1;
                if (j == -1) {
                    dximn1 = 0.0;
                } else {
                    dximn1 = x[j];
                }
                if (k == chainlngth) {
                    dxipl1 = 0.0;
                } else {
                    dxipl1 = x[k];
                }
                dxi = Xlocal;
                twodxi = 2.0 * dxi;
                fac = dxipl1 + dximn1 - twodxi;
                fac1 = dxipl1 - dxi;
                fac2 = dxi - dximn1;
                fac13 = fac1 * fac1 * fac1;
                fac23 = fac2 * fac2 * fac2;
                Acclocal = alpha * fac + beta * (fac13 - fac23);
                barrier(CLK_GLOBAL_MEM_FENCE);

                Vlocal += hdt * Acclocal;
                v[global_id] = Vlocal;
                acc[global_id] = Acclocal;
                barrier(CLK_GLOBAL_MEM_FENCE);
            }
            barrier(CLK_GLOBAL_MEM_FENCE);

            tke = tpe = te = dx = 0.0;
            ke = 0.5 * Vlocal * Vlocal; //Vlocal*Vlocal;
            barrier(CLK_GLOBAL_MEM_FENCE);
            fp6[(m * 100) + global_id] = ke;
            keBuf[global_id] = ke;
            ke = 0.0;
            barrier(CLK_GLOBAL_MEM_FENCE);

            if (global_id == 0) {
                for (t = 0; t < 100; t++)
                    tke += keBuf[t];
            }
            barrier(CLK_GLOBAL_MEM_FENCE);

            k = global_id - 1;
            if (k == -1) {
                dx = Xlocal;
            } else {
                dx = Xlocal - x[k];
            }
            fac = dx * dx;
            peTemp = alpha * 0.5 * fac + alphaby4 * fac * fac;
            fp8[global_id * m] = peTemp;
            if (global_id == 0)
                tpe += peTemp;
            barrier(CLK_GLOBAL_MEM_FENCE);

            cmass = 0.0;
            dx = -x[100 - 1];
            fac = dx * dx;
            pe = alpha * 0.5 * fac + alphaby4 * fac * fac;
            if (global_id == 0) {
                fp8Plus[m] = pe;
                tpe += peBuf[0];
                fp5[m * 2] = i;
                fp5[m * 2 + 1] = cmass;
                te = tke + tpe;
                fp[m * 2] = m;
                fp[m * 2 + 1] = te;
            }
            barrier(CLK_GLOBAL_MEM_FENCE);

            //cmass /= 100;
            fp1[(m * chainlngth) + global_id] = Xlocal - cmass;
            // barrier(CLK_GLOBAL_MEM_FENCE);
            fp3[(m * chainlngth) + global_id] = Vlocal;
            // barrier(CLK_GLOBAL_MEM_FENCE);
            fp7[(m * chainlngth) + global_id] = Acclocal;
            barrier(CLK_GLOBAL_MEM_FENCE);
        }
    }
}
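For reference, the CPU version does the serial equivalent of one inner time step, roughly like the sketch below (simplified, not the exact host code; names and structure follow the kernel):

/* Simplified serial sketch of one inner time step (not the exact host code). */
#define CHAINLNGTH 100

static void step_chain(double *x, double *v, double *acc,
                       double dt, double alpha, double beta)
{
    double hdt  = 0.5 * dt;
    double hdt2 = dt * 0.5 * dt;

    for (int i = 0; i < CHAINLNGTH; i++) {
        x[i] = x[i] + dt * v[i] + hdt2 * acc[i];   /* position update */
        v[i] = v[i] + hdt * acc[i];                /* first velocity half-step */
    }
    for (int i = 0; i < CHAINLNGTH; i++) {
        double xm = (i == 0) ? 0.0 : x[i - 1];               /* fixed left end */
        double xp = (i == CHAINLNGTH - 1) ? 0.0 : x[i + 1];  /* fixed right end */
        double f1 = xp - x[i];
        double f2 = x[i] - xm;
        acc[i] = alpha * (xp + xm - 2.0 * x[i])
               + beta * (f1 * f1 * f1 - f2 * f2 * f2);
        v[i] += hdt * acc[i];                      /* second velocity half-step */
    }
}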

1 answer

This is actually somewhat expected behavior.

On older x86 processors, floating-point arithmetic is carried out in 80-bit x87 registers (Intel's "long double") and results are rounded to 64 bits only when they are stored to memory. When SIMD units and instructions (SSE/SSE2) for floating-point arithmetic arrived on x86 processors, double precision became a true 64-bit computation by default; however, 80-bit intermediates are still possible, depending on your compiler settings. There is more to read about this: Wikipedia: floating point.
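If you want to see this effect in isolation on the host, compare an accumulator that the compiler may keep in an 80-bit x87 register with one that is rounded to 64 bits after every step. A minimal sketch, assuming GCC on x86 (the difference only shows up when the x87 unit is actually used, e.g. a 32-bit build without -mfpmath=sse):

#include <stdio.h>

int main(void)
{
    double a = 0.0;
    volatile double b = 0.0;   /* volatile forces a store, i.e. rounding to 64 bits */

    for (int i = 0; i < 10000000; i++) {
        a += 1e-7;             /* may be kept in an 80-bit x87 register */
        b += 1e-7;             /* rounded to a 64-bit double every iteration */
    }
    printf("kept in register: %.17g\n", a);
    printf("stored each step: %.17g\n", b);
    /* On an x87 build the two sums can differ in the last digits;
       with SSE2 (-mfpmath=sse) they should match. */
    return 0;
}

On the host, gcc options such as -mfpmath=sse (or simply a 64-bit build, which uses SSE2 by default) give you true 64-bit double arithmetic; -ffloat-store can also help by forcing variables to be rounded on every store.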

Check the compiler settings for the OpenCL code and for the host code with regard to floating-point optimization tricks to bring your results closer together. Then compute the absolute and relative error of your values and check whether that error margin is acceptable for your application.
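For example, something along these lines, where cpuRes and gpuRes are placeholders for your two result buffers:

#include <math.h>
#include <stdio.h>

/* Report the worst absolute and relative deviation between two result buffers. */
static void compare_results(const double *cpuRes, const double *gpuRes, size_t n)
{
    double max_abs = 0.0, max_rel = 0.0;
    for (size_t i = 0; i < n; i++) {
        double abs_err = fabs(cpuRes[i] - gpuRes[i]);
        double denom   = fmax(fabs(cpuRes[i]), fabs(gpuRes[i]));
        double rel_err = (denom > 0.0) ? abs_err / denom : 0.0;
        if (abs_err > max_abs) max_abs = abs_err;
        if (rel_err > max_rel) max_rel = rel_err;
    }
    printf("max abs error = %g, max rel error = %g\n", max_abs, max_rel);
}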


Source: https://habr.com/ru/post/951033/

