I cannot reproduce this using CUDA 3.2 and Qt4 on a 64-bit Ubuntu 10.04 LTS system. I used this main():
#include <QtCore/QCoreApplication>

// Defined in the separately compiled CUDA translation unit.
extern float cudamain();

// Minimal Qt console program: constructs the core application object,
// runs the CUDA workload once, and exits successfully.
int main(int argc, char *argv[])
{
    QCoreApplication app(argc, argv);

    // The throughput figure is retrieved but deliberately not reported.
    float gflops = cudamain();

    return 0;
}
and a cudamain() containing this:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define blocksize 16
#define HM (4096)
#define WM (4096)
#define WN (4096)
#define HN WM
#define WP WN
#define HP HM
#define PTH WM
#define PTW HM

// Tiled matrix product P = M * N on row-major float matrices.
//
//   M   : left operand, row stride uWM (uWM columns)
//   N   : right operand, row stride uWN (uWM rows, uWN columns)
//   P   : result, row stride uWN
//   uWM : width of M / height of N
//   uWN : width of N and of P
//
// Launch requirements: blockDim must be (blocksize, blocksize), gridDim.x must
// be uWN / blocksize and gridDim.y the row count of M / blocksize. There is no
// bounds checking, so every dimension has to be a multiple of blocksize.
// Uses 2 * blocksize * blocksize floats of static shared memory per block.
__global__ void nonsquare(float*M, float*N, float*P, int uWM,int uWN)
{
    __shared__ float MS[blocksize][blocksize];
    __shared__ float NS[blocksize][blocksize];

    int tx=threadIdx.x, ty=threadIdx.y, bx=blockIdx.x, by=blockIdx.y;
    int rowM=ty+by*blocksize;
    int colN=tx+bx*blocksize;
    float Pvalue=0;

    for(int m=0; m<uWM; m+=blocksize){
        // Each thread stages one element of the current M tile and one of the
        // current N tile (the original read both tiles from M).
        MS[ty][tx]=M[rowM*uWM+(m+tx)];
        NS[ty][tx]=N[colN + uWN*(m+ty)];
        __syncthreads();   // tiles fully written before anyone reads them

        for(int k=0;k<blocksize;k++)
            Pvalue+=MS[ty][k]*NS[k][tx];
        __syncthreads();   // everyone done reading before the next overwrite
    }

    // Row stride of P is the width of N, taken from the argument rather than
    // from the WP macro so the kernel follows its own parameters.
    P[rowM*uWN+colN]=Pvalue;
}

// Aborts with a readable message when a CUDA runtime call fails. Unlike a bare
// assert(), the check stays active when the file is built with -DNDEBUG.
inline void gpuerrorchk(cudaError_t state)
{
    if (state != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(state));
        abort();
    }
}

// Runs one HM x WM by HN x WN matrix product on the GPU and returns the
// measured kernel throughput in GFLOP/s. Any host allocation failure or CUDA
// runtime error terminates the process.
float cudamain(){
    cudaEvent_t evstart, evstop;
    gpuerrorchk( cudaEventCreate(&evstart) );
    gpuerrorchk( cudaEventCreate(&evstop) );

    float*M=(float*)malloc(sizeof(float)*HM*WM);
    float*N=(float*)malloc(sizeof(float)*HN*WN);
    float*P=(float*)malloc(sizeof(float)*HP*WP);
    if (M == NULL || N == NULL || P == NULL) {
        fprintf(stderr, "host allocation failed\n");
        abort();
    }

    for(int i=0;i<WM*HM;i++)
        M[i]=(float)i;
    for(int i=0;i<WN*HN;i++)
        N[i]=(float)i;

    float *Md,*Nd,*Pd;
    gpuerrorchk( cudaMalloc((void**)&Md,HM*WM*sizeof(float)) );
    gpuerrorchk( cudaMalloc((void**)&Nd,HN*WN*sizeof(float)) );
    gpuerrorchk( cudaMalloc((void**)&Pd,HP*WP*sizeof(float)) );

    gpuerrorchk( cudaMemcpy(Md,M,HM*WM*sizeof(float),cudaMemcpyHostToDevice) );
    gpuerrorchk( cudaMemcpy(Nd,N,HN*WN*sizeof(float),cudaMemcpyHostToDevice) );

    dim3 dimBlock(blocksize,blocksize);          // (tile_width, tile_width)
    dim3 dimGrid(WN/dimBlock.x,HM/dimBlock.y);   // (width/tile_width, height/tile_width)

    gpuerrorchk( cudaEventRecord(evstart,0) );
    nonsquare<<<dimGrid,dimBlock>>>(Md,Nd,Pd,WM, WN);
    gpuerrorchk( cudaPeekAtLastError() );        // launch-configuration errors
    gpuerrorchk( cudaEventRecord(evstop,0) );
    gpuerrorchk( cudaEventSynchronize(evstop) ); // wait for the kernel to finish

    float elapsedMs;
    gpuerrorchk( cudaEventElapsedTime(&elapsedMs,evstart,evstop) );

    gpuerrorchk( cudaMemcpy(P,Pd,WP*HP*sizeof(float),cudaMemcpyDeviceToHost) );

    // Release every device and host resource acquired above.
    gpuerrorchk( cudaFree(Md) );
    gpuerrorchk( cudaFree(Nd) );
    gpuerrorchk( cudaFree(Pd) );
    gpuerrorchk( cudaEventDestroy(evstart) );
    gpuerrorchk( cudaEventDestroy(evstop) );
    free(M);
    free(N);
    free(P);

    // 2*W^3 floating-point operations; elapsed time is in milliseconds, hence
    // the 1e-6 factor to obtain GFLOP/s.
    float gflops=(2.e-6*WM*WM*WM)/(elapsedMs);

    // Tears down the context so profiler output is flushed. Kept as
    // cudaThreadExit() because the target toolkit is CUDA 3.2; later toolkits
    // supersede it with cudaDeviceReset().
    gpuerrorchk( cudaThreadExit() );
    return gflops;
}
(Do not pay attention to what the code actually computes; all that matters is that it performs memory transfers and launches a kernel. It is nonsense otherwise.)
Compiling the code as follows:
cuda:~$ nvcc -arch=sm_20 -c -o cudamain.o cudamain.cu
cuda:~$ g++ -o qtprob -I/usr/include/qt4 qtprob.cc cudamain.o -L $CUDA_INSTALL_PATH/lib64 -lQtCore -lcuda -lcudart
cuda:~$ ldd qtprob
    linux-vdso.so.1 => (0x00007fff242c8000)
    libQtCore.so.4 => /opt/cuda-3.2/computeprof/bin/libQtCore.so.4 (0x00007fbe62344000)
    libcuda.so.1 => /usr/lib/libcuda.so.1 (0x00007fbe61a3d000)
    libcudart.so.3 => /opt/cuda-3.2/lib64/libcudart.so.3 (0x00007fbe617ef000)
    libstdc++.so.6 => /usr/lib/libstdc++.so.6 (0x00007fbe614db000)
    libm.so.6 => /lib/libm.so.6 (0x00007fbe61258000)
    libgcc_s.so.1 => /lib/libgcc_s.so.1 (0x00007fbe61040000)
    libc.so.6 => /lib/libc.so.6 (0x00007fbe60cbd000)
    libz.so.1 => /lib/libz.so.1 (0x00007fbe60aa6000)
    libgthread-2.0.so.0 => /usr/lib/libgthread-2.0.so.0 (0x00007fbe608a0000)
    libglib-2.0.so.0 => /lib/libglib-2.0.so.0 (0x00007fbe605c2000)
    librt.so.1 => /lib/librt.so.1 (0x00007fbe603ba000)
    libpthread.so.0 => /lib/libpthread.so.0 (0x00007fbe6019c000)
    libdl.so.2 => /lib/libdl.so.2 (0x00007fbe5ff98000)
    /lib64/ld-linux-x86-64.so.2 (0x00007fbe626c0000)
    libpcre.so.3 => /lib/libpcre.so.3 (0x00007fbe5fd69000)
produces an executable that runs without errors as many times as I care to run it with the CUDA 3.2 profiler.
All I can advise is to try my repro case and see whether it works for you. If it fails, you probably have either a broken CUDA installation or a broken Qt installation. If it does work (and I suspect that it will), then the problem lies either in how you are building the Qt project or in the actual CUDA code you are using yourself.