I program in OpenCL using a GeForce GT 610 on Linux. The double-precision results I get from the CPU and the GPU do not match. I can post some of the code here, but first I would like to know whether anyone else has come across this. The difference between the double-precision results of the GPU and the CPU becomes apparent when I run loops with a lot of iterations. There is nothing special about the code, but I can post it here if anyone is interested. Thank you very much. Here is my code. Sorry for the `__` and the poor formatting, as I'm new here. As you can see, I have two loops, and my CPU code is essentially almost identical.
/*
 * Enable double-precision support for the code that follows.
 * Prefer the Khronos extension, fall back to the AMD one, and abort
 * compilation if neither macro is defined.
 *
 * Each preprocessor directive must be on its own line: a directive extends
 * to the end of its line, so chaining them on one line turns everything
 * after the first directive into stray tokens.
 */
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double precision floating point not supported by OpenCL implementation."
#endif
__kernel void simpar(__global double* fp, __global double* fp1, __global double* fp3, __global double* fp5, __global double* fp6, __global double* fp7, __global double* fp8, __global double* fp8Plus, __global double* x, __global double* v, __global double* acc, __global double* keBuf, __global double* peBuf, unsigned int prntstps, unsigned int nprntstps, double dt ) { unsigned int m,i,j,k,l,t; unsigned int chainlngth=100; double dxi, twodxi, dxipl1, dximn1, fac, fac1, fac2, fac13, fac23; double ke,pe,tke,tpe,te,dx; double hdt, hdt2; double alpha=0.16; double beta=0.7; double cmass; double peTemp; nprntstps=1001; dt=0.01; prntstps=100; double alphaby4=beta/4.0; hdt=0.5*dt; hdt2=dt*0.5*dt; double Xlocal,Vlocal,Acclocal; unsigned int global_id=get_global_id(0); if (global_id<chainlngth){ Xlocal=x[global_id]; Vlocal=v[global_id]; Acclocal=acc[global_id]; for (m=0;m<nprntstps;m++){ for(l=0;l<prntstps;l++){ Xlocal =Xlocal+dt *Vlocal+hdt2*Acclocal; x[global_id]=Xlocal; barrier(CLK_LOCAL_MEM_FENCE); Vlocal =Vlocal+ hdt * Acclocal; barrier(CLK_LOCAL_MEM_FENCE); j = global_id - 1; k = global_id + 1; if (j == -1) { dximn1 = 0.0; } else { dximn1 = x[j]; } if (k == chainlngth) { dxipl1 = 0.0; } else { dxipl1 = x[k]; } dxi = Xlocal; twodxi = 2.0 * dxi; fac = dxipl1 + dximn1 - twodxi; fac1 = dxipl1 - dxi; fac2 = dxi - dximn1; fac13 = fac1 * fac1 * fac1; fac23 = fac2 * fac2 * fac2; Acclocal = alpha * fac + beta * (fac13 - fac23); barrier(CLK_GLOBAL_MEM_FENCE); Vlocal += hdt * Acclocal; v[global_id]=Vlocal; acc[global_id]=Acclocal; barrier(CLK_GLOBAL_MEM_FENCE); } barrier(CLK_GLOBAL_MEM_FENCE); tke = tpe = te = dx = 0.0; ke=0.5*Vlocal*Vlocal;
}