Sending a 3D array to a CUDA kernel

I took the code given in the answer to How can I add up two 2d (pitched) arrays using nested for loops?, tried to use it for 3D instead of 2D, and slightly changed the other parts. It now looks like this:

    __global__ void doSmth(int*** a) {
      for(int i=0; i<2; i++)
       for(int j=0; j<2; j++)
        for(int k=0; k<2; k++)
         a[i][j][k]=i+j+k;
    }

    int main() {
      int*** h_c = (int***) malloc(2*sizeof(int**));
      for(int i=0; i<2; i++) {
        h_c[i] = (int**) malloc(2*sizeof(int*));
        for(int j=0; j<2; j++)
          GPUerrchk(cudaMalloc((void**)&h_c[i][j],2*sizeof(int)));
      }
      int*** d_c;
      GPUerrchk(cudaMalloc((void****)&d_c,2*sizeof(int**)));
      GPUerrchk(cudaMemcpy(d_c,h_c,2*sizeof(int**),cudaMemcpyHostToDevice));
      doSmth<<<1,1>>>(d_c);
      GPUerrchk(cudaPeekAtLastError());
      int res[2][2][2];
      for(int i=0; i<2; i++)
        for(int j=0; j<2; j++)
          GPUerrchk(cudaMemcpy(&res[i][j][0], h_c[i][j],2*sizeof(int),cudaMemcpyDeviceToHost));
      for(int i=0; i<2; i++)
        for(int j=0; j<2; j++)
          for(int k=0; k<2; k++)
            printf("[%d][%d][%d]=%d\n",i,j,k,res[i][j][k]);
    }

In the code above I use 2 as the size of every dimension of h_c; in the actual implementation these sizes will be very large, and different for each part of the "int***" sub-arrays. My problem is with the part after the kernel call, where I try to copy the results back into the res array. Can you help me solve it? Please show the solution in the same form as I wrote it above. Thanks!

1 answer

First of all, I think talonmies, when he posted the answer to the previous question you mention, was not intending it to be representative of good coding practice. So figuring out how to extend it to 3D may not be the best use of your time. For example, why would we want to write programs that use only one thread? While there might be legitimate uses for such a kernel, this is not one of them. Your kernel has the opportunity to do a bunch of independent work in parallel, but instead you force it all into a single thread and serialize it. The definition of the parallel work is:

 a[i][j][k]=i+j+k; 

Let's figure out how to handle that in parallel on the GPU.

Another introductory comment I would make is that, since we are dealing with a problem whose dimensions are known in advance, let's use C to tackle it, with whatever benefit we can get from the language. Nested loops of cudaMalloc calls may be needed in some cases, but I don't think this is one of them.
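To make that concrete, here is a minimal sketch of the idea (the names NY, NX, and row_t are mine, for illustration only): when the inner dimensions are compile-time constants, a typedef lets the compiler do the multidimensional index arithmetic, and a single cudaMalloc replaces the whole nested allocation loop:

    #define NY 100
    #define NX 100
    typedef int row_t[NX];        // one row of NX ints

    row_t *d_a;                   // device pointer, indexable as d_a[y][x]
    cudaMalloc((void **)&d_a, NY*sizeof(row_t));  // one contiguous allocation
    // ... launch kernels that take 'int a[][NX]' and write a[y][x] ...
    cudaFree(d_a);

The full program below uses exactly this typedef trick (as nRarray).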

Here is the code that does the parallel work:

    #include <stdio.h>
    #include <stdlib.h>
    // set a 3D volume
    // To compile it with nvcc execute: nvcc -O2 -o set3d set3d.cu
    //define the data set size (cubic volume)
    #define DATAXSIZE 100
    #define DATAYSIZE 100
    #define DATAZSIZE 20
    //define the chunk sizes that each threadblock will work on
    #define BLKXSIZE 32
    #define BLKYSIZE 4
    #define BLKZSIZE 4
    // for cuda error checking
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                return 1; \
            } \
        } while (0)
    // device function to set the 3D volume
    __global__ void set(int a[][DATAYSIZE][DATAXSIZE])
    {
        unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
        unsigned idy = blockIdx.y*blockDim.y + threadIdx.y;
        unsigned idz = blockIdx.z*blockDim.z + threadIdx.z;
        if ((idx < (DATAXSIZE)) && (idy < (DATAYSIZE)) && (idz < (DATAZSIZE))){
          a[idz][idy][idx] = idz+idy+idx;
          }
    }
    int main(int argc, char *argv[])
    {
        typedef int nRarray[DATAYSIZE][DATAXSIZE];
        const dim3 blockSize(BLKXSIZE, BLKYSIZE, BLKZSIZE);
        const dim3 gridSize(((DATAXSIZE+BLKXSIZE-1)/BLKXSIZE),
                            ((DATAYSIZE+BLKYSIZE-1)/BLKYSIZE),
                            ((DATAZSIZE+BLKZSIZE-1)/BLKZSIZE));
    // overall data set sizes
        const int nx = DATAXSIZE;
        const int ny = DATAYSIZE;
        const int nz = DATAZSIZE;
    // pointers for data set storage via malloc
        nRarray *c;    // storage for result stored on host
        nRarray *d_c;  // storage for result computed on device
    // allocate storage for data set
        if ((c = (nRarray *)malloc((nx*ny*nz)*sizeof(int))) == 0)
          {fprintf(stderr,"malloc1 Fail \n"); return 1;}
    // allocate GPU device buffers
        cudaMalloc((void **) &d_c, (nx*ny*nz)*sizeof(int));
        cudaCheckErrors("Failed to allocate device buffer");
    // compute result
        set<<<gridSize,blockSize>>>(d_c);
        cudaCheckErrors("Kernel launch failure");
    // copy output data back to host
        cudaMemcpy(c, d_c, ((nx*ny*nz)*sizeof(int)), cudaMemcpyDeviceToHost);
        cudaCheckErrors("CUDA memcpy failure");
    // and check for accuracy
        for (unsigned i=0; i<nz; i++)
          for (unsigned j=0; j<ny; j++)
            for (unsigned k=0; k<nx; k++)
              if (c[i][j][k] != (i+j+k)) {
                printf("Mismatch at x= %d, y= %d, z= %d  Host= %d, Device = %d\n",
                       i, j, k, (i+j+k), c[i][j][k]);
                return 1;
                }
        printf("Results check!\n");
        free(c);
        cudaFree(d_c);
        cudaCheckErrors("cudaFree fail");
        return 0;
    }

Since you asked for it in the comments, below is the minimal set of changes I could make to your code to get it working. Let me also remind you of a couple of talonmies' comments from the previous question you referenced:

"For code complexity and performance, you really don't want to do this; using pointer arrays in CUDA code is more complex and slower than the linear memory alternative."

"it's such a bad idea compared to using linear memory."

Nevertheless, here is the pointer-of-pointers version. I had to draw it out on paper to make sure I got all the pointer copying right.

    #include <cstdio>

    inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
    {
        if (code != 0) {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
            if (Abort) exit(code);
        }
    }
    #define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }

    __global__ void doSmth(int*** a) {
      for(int i=0; i<2; i++)
       for(int j=0; j<2; j++)
        for(int k=0; k<2; k++)
         a[i][j][k]=i+j+k;
    }

    int main() {
      int*** h_c = (int***) malloc(2*sizeof(int**));
      for(int i=0; i<2; i++) {
        h_c[i] = (int**) malloc(2*sizeof(int*));
        for(int j=0; j<2; j++)
          GPUerrchk(cudaMalloc((void**)&h_c[i][j],2*sizeof(int)));
      }
      // the new intermediate step: a host array of *device* pointers,
      // each entry pointing at a device-resident row of int* pointers
      int ***h_c1 = (int ***) malloc(2*sizeof(int **));
      for (int i=0; i<2; i++){
        GPUerrchk(cudaMalloc((void***)&(h_c1[i]), 2*sizeof(int*)));
        // copy the host row of device pointers onto the device
        GPUerrchk(cudaMemcpy(h_c1[i], h_c[i], 2*sizeof(int*), cudaMemcpyHostToDevice));
      }
      int*** d_c;
      GPUerrchk(cudaMalloc((void****)&d_c,2*sizeof(int**)));
      // d_c is now filled with device pointers (h_c1), not host pointers (h_c)
      GPUerrchk(cudaMemcpy(d_c,h_c1,2*sizeof(int**),cudaMemcpyHostToDevice));
      doSmth<<<1,1>>>(d_c);
      GPUerrchk(cudaPeekAtLastError());
      int res[2][2][2];
      for(int i=0; i<2; i++)
        for(int j=0; j<2; j++)
          GPUerrchk(cudaMemcpy(&res[i][j][0], h_c[i][j],2*sizeof(int),cudaMemcpyDeviceToHost));
      for(int i=0; i<2; i++)
        for(int j=0; j<2; j++)
          for(int k=0; k<2; k++)
            printf("[%d][%d][%d]=%d\n",i,j,k,res[i][j][k]);
    }

In a nutshell, the sequence we have to go through is (a sketch of one level of it follows the list):

  • malloc a multidimensional array of pointers (on the host), one dimension smaller than the problem size, with the last dimension being a set of pointers to regions cudaMalloc'ed onto the device, not the host.
  • create another multidimensional array of pointers, of the same class as the one created in the previous step, but one dimension smaller than the one created in the previous step. This array must also have its final ranks cudaMalloc'ed on the device.
  • copy the last set of host pointers from the second-previous step into the area cudaMalloc'ed on the device in the previous step.
  • repeat steps 2-3 until we end up with a single (host) pointer pointing to the multidimensional array of pointers, all of which are now resident on the device.
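As promised above, here is a hedged sketch of one level of that sequence for a simple int** case (the helper name buildDeviceTable and its out-parameter are my own invention, not code from the answer; it reuses the GPUerrchk macro defined earlier):

    // Build, on the device, an array of 'rows' pointers, each pointing at a
    // cudaMalloc'ed row of 'cols' ints. Returns the device pointer table;
    // *h_rows_out keeps the host copies of the row pointers so results can
    // be cudaMemcpy'ed back later.
    int **buildDeviceTable(int rows, int cols, int ***h_rows_out) {
        // step 1: host array whose entries are device row pointers
        int **h_rows = (int **)malloc(rows * sizeof(int *));
        for (int i = 0; i < rows; i++)
            GPUerrchk(cudaMalloc((void **)&h_rows[i], cols * sizeof(int)));
        // step 2: device array that will hold those pointers
        int **d_table;
        GPUerrchk(cudaMalloc((void ***)&d_table, rows * sizeof(int *)));
        // step 3: copy the host set of pointers into the device array
        GPUerrchk(cudaMemcpy(d_table, h_rows, rows * sizeof(int *),
                             cudaMemcpyHostToDevice));
        *h_rows_out = h_rows;
        return d_table;   // usable as a[i][j] inside a kernel
    }

For deeper nesting you repeat the same construction one level up, treating the d_table pointers the same way h_rows treats the row pointers.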

Source: https://habr.com/ru/post/1440157/

