I think the short answer to your question (the ability to use a single cufftPlanMany to execute 1D FFT columns of a 3D matrix) is NO.
Indeed, conversions made according to cufftPlanMany , which you name
cufftPlanMany(&handle, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2C, batch);
must obey Advanced Data Composition . In particular, 1D FFTs are designed according to the following layout
input[b * idist + x * istride]
where b addresses the istride signal, and istride is the distance between two consecutive elements in the same signal. If the 3D matrix has dimensions M * N * Q , and if you want to perform one-dimensional transformations along the columns, the distance between two consecutive elements will be M , and the distance between two consecutive signals will be 1 . In addition, the number of batch versions must be set to M With these parameters, you can cover only one fragment of the 3D matrix. Indeed, if you try to increase M , then cuFFT will start trying to calculate the new columnar FFTs starting from the second line. The only solution to this problem is to iteratively call cufftExecC2C to cover all Q fragments.
For writing, the following code is a fully processed example of how to perform 1D FFT of 3D matrix columns.
#include <thrust/device_vector.h> #include <cufft.h> /********************/ /* CUDA ERROR CHECK */ /********************/ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } int main() { const int M = 3; const int N = 4; const int Q = 2; thrust::host_vector<float2> h_matrix(M * N * Q); for (int k=0; k<Q; k++) for (int j=0; j<N; j++) for (int i=0; i<M; i++) { float2 temp; temp.x = (float)(j + k * M); //temp.x = 1.f; temp.y = 0.f; h_matrix[k*M*N+j*M+i] = temp; printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); } printf("\n"); thrust::device_vector<float2> d_matrix(h_matrix); thrust::device_vector<float2> d_matrix_out(M * N * Q); // --- Advanced data layout // input[b * idist + x * istride] // output[b * odist + x * ostride] // b = signal number // x = element of the b-th signal cufftHandle handle; int rank = 1; // --- 1D FFTs int n[] = { N }; // --- Size of the Fourier transform int istride = M, ostride = M; // --- Distance between two successive input/output elements int idist = 1, odist = 1; // --- Distance between batches int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms) int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms) int batch = M; // --- Number of batched executions cufftPlanMany(&handle, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2C, batch); for (int k=0; k<Q; k++) cufftExecC2C(handle, (cufftComplex*)(thrust::raw_pointer_cast(d_matrix.data()) + k * M * N), (cufftComplex*)(thrust::raw_pointer_cast(d_matrix_out.data()) + k * M * N), CUFFT_FORWARD); cufftDestroy(handle); for (int k=0; k<Q; k++) for (int j=0; j<N; j++) for (int i=0; i<M; i++) { float2 temp = d_matrix_out[k*M*N+j*M+i]; printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); } }
The situation is different for the case when you want to perform 1D string conversions. In this case, the distance between two consecutive elements is 1 , and the distance between two consecutive signals M This allows you to install multiple N * Q conversions, and then call cufftExecC2C only once. For the record, the code below provides a complete example of 1D transformations of the rows of a 3D matrix.
#include <thrust/device_vector.h> #include <cufft.h> /********************/ /* CUDA ERROR CHECK */ /********************/ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } int main() { const int M = 3; const int N = 4; const int Q = 2; thrust::host_vector<float2> h_matrix(M * N * Q); for (int k=0; k<Q; k++) for (int j=0; j<N; j++) for (int i=0; i<M; i++) { float2 temp; temp.x = (float)(j + k * M); //temp.x = 1.f; temp.y = 0.f; h_matrix[k*M*N+j*M+i] = temp; printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); } printf("\n"); thrust::device_vector<float2> d_matrix(h_matrix); thrust::device_vector<float2> d_matrix_out(M * N * Q); // --- Advanced data layout // input[b * idist + x * istride] // output[b * odist + x * ostride] // b = signal number // x = element of the b-th signal cufftHandle handle; int rank = 1; // --- 1D FFTs int n[] = { M }; // --- Size of the Fourier transform int istride = 1, ostride = 1; // --- Distance between two successive input/output elements int idist = M, odist = M; // --- Distance between batches int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms) int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms) int batch = N * Q; // --- Number of batched executions cufftPlanMany(&handle, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2C, batch); cufftExecC2C(handle, (cufftComplex*)(thrust::raw_pointer_cast(d_matrix.data())), (cufftComplex*)(thrust::raw_pointer_cast(d_matrix_out.data())), CUFFT_FORWARD); cufftDestroy(handle); for (int k=0; k<Q; k++) for (int j=0; j<N; j++) for (int i=0; i<M; i++) { float2 temp = d_matrix_out[k*M*N+j*M+i]; printf("%i %i %i %f %f\n", i, j, k, temp.x, temp.y); } }