CUDA: understanding concurrent kernel execution

I am trying to understand how concurrent kernel execution works, so I wrote a simple program to experiment with it. The kernel populates a 2D array using 2 streams. I get correct results with 1 stream (no concurrency), but when I use 2 streams to try for concurrency I get wrong results. I suspect the problem is either in how I transfer the memory, since I'm not entirely sure I have that right, or in how I configured the kernel launch. The programming guide does not explain this well enough for me. For my purposes, the kernel needs to be callable from Matlab.

As I understand it, the main program should do the following (a sketch of this pipeline follows the list):

  • allocate pinned memory on the host
  • allocate device memory sized for one stream's data (with 2 streams, half the total host memory)
  • create the streams
  • loop over the streams:
    • copy one stream's data from host to device with cudaMemcpyAsync()
    • launch the kernel in that stream
    • copy that stream's data back to the host with cudaMemcpyAsync()
      • I believe I am addressing the correct memory for each stream by using an offset computed from the size of each stream's data and the stream index.
  • destroy the streams
  • free the memory
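To make that concrete, here is a stripped-down, self-contained sketch of the pipeline I have in mind (a toy standalone program, not my actual Matlab code: `fillChunk` is just a placeholder kernel, the sizes are arbitrary, and error checking is omitted):

    #include <cuda_runtime.h>

    __global__ void fillChunk(double *chunk, int chunkElems)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < chunkElems)
            chunk[idx] = (double)idx;   // placeholder work
    }

    int main(void)
    {
        const int nStreams = 2;
        const int nElems   = 512 * 512;
        const int chunk    = nElems / nStreams;       // elements per stream

        double *host;                                 // pinned host buffer
        cudaMallocHost(&host, nElems * sizeof(double));

        double *dev[nStreams];                        // one device buffer per stream
        cudaStream_t stream[nStreams];
        for (int i = 0; i < nStreams; i++) {
            cudaMalloc(&dev[i], chunk * sizeof(double));
            cudaStreamCreate(&stream[i]);
        }

        for (int i = 0; i < nStreams; i++) {
            int offset = i * chunk;                   // element offset, not bytes
            cudaMemcpyAsync(dev[i], host + offset, chunk * sizeof(double),
                            cudaMemcpyHostToDevice, stream[i]);
            fillChunk<<<(chunk + 255) / 256, 256, 0, stream[i]>>>(dev[i], chunk);
            cudaMemcpyAsync(host + offset, dev[i], chunk * sizeof(double),
                            cudaMemcpyDeviceToHost, stream[i]);
        }
        cudaDeviceSynchronize();                      // wait before touching `host`

        for (int i = 0; i < nStreams; i++) {
            cudaStreamDestroy(stream[i]);
            cudaFree(dev[i]);
        }
        cudaFreeHost(host);
        return 0;
    }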

Here is the code I am trying to use.

concurrentKernel.cpp

    __global__ void concurrentKernel(int const width,
                                     int const streamIdx,
                                     double *array)
    {
        int thread = (blockIdx.x * blockDim.x) + threadIdx.x;
        for (int i = 0; i < width; i++)
        {
            array[thread*width+i] = thread+i*width+1;
            // array[thread*width+i+streamIdx] = thread+i*width+streamIdx*width/2;
        }
    }

concurrentMexFunction.cu

    #include <stdio.h>
    #include <math.h>
    #include "mex.h"

    /* Kernel function */
    #include "concurrentKernel.cpp"

    void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray *prhs[])
    {
        int const numberOfStreams = 2; // set number of streams to use here.
        cudaError_t cudaError;
        int offset;

        int width, height, fullSize, streamSize;
        width = 512;
        height = 512;
        fullSize = height*width;
        streamSize = (int)(fullSize/numberOfStreams);
        mexPrintf("fullSize: %d, streamSize: %d\n",fullSize, streamSize);

        /* Return the populated array */
        double *returnedArray;
        plhs[0] = mxCreateDoubleMatrix(height, width, mxREAL);
        returnedArray = mxGetPr(plhs[0]);

        cudaStream_t stream[numberOfStreams];
        for (int i = 0; i < numberOfStreams; i++)
        {
            cudaStreamCreate(&stream[i]);
        }

        /* host memory */
        double *hostArray;
        cudaError = cudaMallocHost(&hostArray,sizeof(double)*fullSize); // full size of array.
        if (cudaError != cudaSuccess)
        {
            mexPrintf("hostArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError));
            return;
        }
        for (int i = 0; i < height; i++)
        {
            for (int j = 0; j < width; j++)
            {
                hostArray[i*width+j] = -1.0;
            }
        }

        /* device memory */
        double *deviceArray;
        cudaError = cudaMalloc( (void **)&deviceArray,sizeof(double)*streamSize); // size of array for each stream.
        if (cudaError != cudaSuccess)
        {
            mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError));
            return;
        }

        for (int i = 0; i < numberOfStreams; i++)
        {
            offset = i;//*streamSize;
            mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

            cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize,
                            cudaMemcpyHostToDevice, stream[i]);
            if (cudaError != cudaSuccess)
            {
                mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError));
                return;
            }

            concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);

            cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize,
                            cudaMemcpyDeviceToHost, stream[i]);
            if (cudaError != cudaSuccess)
            {
                mexPrintf("returnedArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError));
                return;
            }

            mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
        }

        for (int i = 0; i < numberOfStreams; i++)
        {
            cudaStreamDestroy(stream[i]);
        }
        cudaFree(hostArray);
        cudaFree(deviceArray);
    }

With 2 streams, the result is an array of zeros, which makes me think I am doing something wrong with the memory. Can someone explain what I'm doing wrong? If anyone needs the commands to compile and run this from Matlab, I can provide them.

Update:

    for (int i = 0; i < numberOfStreams; i++)
    {
        offset = i*streamSize;
        mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);
        cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize,
                        cudaMemcpyHostToDevice, stream[i]);
        if (cudaError != cudaSuccess)
        {
            mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError));
            return;
        }
        concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);
    }

    cudaDeviceSynchronize();

    for (int i = 0; i < numberOfStreams; i++)
    {
        offset = i*streamSize;
        mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);
        cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize,
                        cudaMemcpyDeviceToHost, stream[i]);
        if (cudaError != cudaSuccess)
        {
            mexPrintf("returnedArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError));
            return;
        }
        mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
        cudaStreamDestroy(stream[i]);
    }
2 answers

You need to keep in mind that the stream APIs you are using are completely asynchronous, so control returns to the calling host thread immediately. If you do not insert some kind of synchronization point between the asynchronous operations running on the GPU and the host, there is no guarantee that the operations you enqueued in the streams have actually completed. In your example, that means something like the following is required:

    for (int i = 0; i < numberOfStreams; i++)
    {
        offset = i;//*streamSize;
        mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);
        cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize,
                        cudaMemcpyHostToDevice, stream[i]);
        concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);
        cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize,
                        cudaMemcpyDeviceToHost, stream[i]);
    }

    // Host thread waits here until both kernels and copies are finished
    cudaDeviceSynchronize();

    for (int i = 0; i < numberOfStreams; i++)
    {
        offset = i;//*streamSize;
        mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
        cudaStreamDestroy(stream[i]);
    }

The key point is that you need to make sure both memory transfers have completed before you try to inspect the results in host memory. Neither your original code nor your update does this.
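As an aside, if you only need the results of one stream at a time, cudaStreamSynchronize() on that stream is a finer-grained alternative to cudaDeviceSynchronize(). A minimal sketch, assuming the same variables as above:

    for (int i = 0; i < numberOfStreams; i++)
    {
        // Block the host only until stream[i]'s copies and kernel have finished;
        // the other stream can keep executing in the meantime.
        cudaStreamSynchronize(stream[i]);
        mexPrintf("first element of stream %d chunk: %g\n", i, returnedArray[i*streamSize]);
    }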


Also, it looks like you are reusing the same deviceArray pointer for the different concurrent streams. Most likely, if the current code works at all, it is because of the false dependencies @Tom mentions, which cause the hardware to run the streams sequentially. You really should have a separate deviceArray per stream:

    /* device memory */
    double *deviceArray[numberOfStreams];
    for (int i = 0; i < numberOfStreams; i++)
    {
        cudaError = cudaMalloc( (void **)&deviceArray[i],sizeof(double)*streamSize); // size of array for each stream.
        if (cudaError != cudaSuccess)
        {
            mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError));
            return;
        }
    }

    for (int i = 0; i < numberOfStreams; i++)
    {
        offset = i;//*streamSize;
        mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);
        cudaMemcpyAsync(deviceArray[i], hostArray+offset, sizeof(double)*streamSize,
                        cudaMemcpyHostToDevice, stream[i]);
        if (cudaError != cudaSuccess)
        {
            mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError));
            return;
        }
        concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray[i]);
        cudaMemcpyAsync(returnedArray+offset, deviceArray[i], sizeof(double)*streamSize,
                        cudaMemcpyDeviceToHost, stream[i]);
    }
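The snippet above stops at the copies. A matching teardown (my sketch, under the same assumptions) would synchronize first and then release the per-stream resources; note that memory from cudaMallocHost() is released with cudaFreeHost(), not cudaFree():

    cudaDeviceSynchronize();           // make sure every stream is idle before freeing

    for (int i = 0; i < numberOfStreams; i++)
    {
        cudaStreamDestroy(stream[i]);
        cudaFree(deviceArray[i]);      // one device buffer per stream
    }
    cudaFreeHost(hostArray);           // pinned host memory needs cudaFreeHost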
