I wrote some code to try changing the quadrants of a 2D matrix for FFT purposes, which are stored in a flat array.
int leftover = W-dcW; T *temp; T *topHalf; cudaMalloc((void **)&temp, dcW * sizeof(T)); //swap every row, left and right for(int i = 0; i < H; i++) { cudaMemcpy(temp, &data[i*W], dcW*sizeof(T),cudaMemcpyDeviceToDevice); cudaMemcpy(&data[i*W],&data[i*W+dcW], leftover*sizeof(T), cudaMemcpyDeviceToDevice); cudaMemcpy(&data[i*W+leftover], temp, dcW*sizeof(T), cudaMemcpyDeviceToDevice); } cudaMalloc((void **)&topHalf, dcH*W* sizeof(T)); leftover = H-dcH; cudaMemcpy(topHalf, data, dcH*W*sizeof(T), cudaMemcpyDeviceToDevice); cudaMemcpy(data, &data[dcH*W], leftover*W*sizeof(T), cudaMemcpyDeviceToDevice); cudaMemcpy(&data[leftover*W], topHalf, dcH*W*sizeof(T), cudaMemcpyDeviceToDevice);
Please note that this code contains pointers to devices and carries DeviceToDevice.
Why does it seem slow? Could this be somehow optimized? I calculated this compared to the same operation on the host using regular memcpy, and it was about 2 times slower.
Any ideas?
Derek source share