I am trying to understand how parallel kernel execution works. I wrote a simple program to try to understand it. The kernel will populate a 2D array using 2 streams. I get the correct results when there is 1 thread, no concurrency. when i try to use 2 threads try concurrency i get wrong results. I believe that this is either related to the transfer of memory, since I'm not entirely sure that I have it correctly or how I configured the kernel. The programming guide does not explain this well enough for me. For my purposes, I need Matlab to call the kernel.
As I understand it, the main program will be:
- allocate fixed memory on the host
- allocate memory on the GPU needed for one thread (2 threads = half the total host memory)
- create threads
- loop through threads
- copy memory for one thread from the host to the device using cudaMemcpyAsync ()
- execute kernel for thread
- copy the memory for the stream back to the host, cudaMemcpyAsync ()
- I believe that I am doing it right by referring to the memory from the place that I need for each stream, using an offset based on the size of the data for each stream and stream number.
- destroy threads
- free memory
here is the code i am trying to use.
concurrentKernel.cpp
__global__ void concurrentKernel(int const width, int const streamIdx, double *array) { int thread = (blockIdx.x * blockDim.x) + threadIdx.x;; for (int i = 0; i < width; i ++) { array[thread*width+i] = thread+i*width+1;
concurrentMexFunction.cu
#include <stdio.h> #include <math.h> #include "mex.h" /* Kernel function */ #include "concurrentKernel.cpp" void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray *prhs[]) { int const numberOfStreams = 2; // set number of streams to use here. cudaError_t cudaError; int offset; int width, height, fullSize, streamSize; width = 512; height = 512; fullSize = height*width; streamSize = (int)(fullSize/numberOfStreams); mexPrintf("fullSize: %d, streamSize: %d\n",fullSize, streamSize); /* Return the populated array */ double *returnedArray; plhs[0] = mxCreateDoubleMatrix(height, width, mxREAL); returnedArray = mxGetPr(plhs[0]); cudaStream_t stream[numberOfStreams]; for (int i = 0; i < numberOfStreams; i++) { cudaStreamCreate(&stream[i]); } /* host memory */ double *hostArray; cudaError = cudaMallocHost(&hostArray,sizeof(double)*fullSize); // full size of array. if (cudaError != cudaSuccess) {mexPrintf("hostArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; } for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { hostArray[i*width+j] = -1.0; } } /* device memory */ double *deviceArray; cudaError = cudaMalloc( (void **)&deviceArray,sizeof(double)*streamSize); // size of array for each stream. if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; } for (int i = 0; i < numberOfStreams; i++) { offset = i;//*streamSize; mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset); cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]); if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; } concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray); cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]); if (cudaError != cudaSuccess) {mexPrintf("returnedArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; } mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]); } for (int i = 0; i < numberOfStreams; i++) { cudaStreamDestroy(stream[i]); } cudaFree(hostArray); cudaFree(deviceArray); }
When there are 2 threads, the result is an array of zeros, which makes me think that I am doing something wrong with memory. Can someone explain what I'm doing wrong? If someone needs help compiling and running them from Matlab, I can provide commands for this.
Update:
for (int i = 0; i < numberOfStreams; i++) { offset = i*streamSize; mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset); cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]); if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; } concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray); } cudaDeviceSynchronize(); for (int i = 0; i < numberOfStreams; i++) { offset = i*streamSize; mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset); cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]); if (cudaError != cudaSuccess) {mexPrintf("returnedArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; } mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]); cudaStreamDestroy(stream[i]); }