Creating several streams, for example N of them:
/* Array of N stream handles; it must be a pointer because it is
   heap-allocated and indexed as streams[i] below. */
cudaStream_t *streams;
/* The explicit cast is required because nvcc compiles host code as C++,
   where void* does not convert implicitly. */
streams = (cudaStream_t *)malloc(N * sizeof(cudaStream_t));
/* Create one stream per array slot. */
for(i=0; i<N; i++)
{
cudaStreamCreate(&streams[i]);
}
Asynchronous copy in the i-th stream using cudaMemcpyAsync:
/* Enqueue a copy of `count` bytes from src to dst on streams[i].
   Argument order is (dst, src, count, kind, stream): the byte count comes
   before the cudaMemcpyKind direction. */
cudaMemcpyAsync(dst, src, count, kind, streams[i]);
Launching kernels in the i-th stream (sharedMemory may be 0 if no dynamic shared memory is needed):
/* Launch configuration is <<< grid size, block size, dynamic shared memory
   in bytes, stream >>>. Both kernels are submitted to the same stream,
   streams[i], with kernel_1 issued before kernel_2. */
kernel_1 <<< nBlocks, nThreads, sharedMemory, streams[i] >>> ( args );
kernel_2 <<< nBlocks, nThreads, sharedMemory, streams[i] >>> ( args );
Finally, when the streams are no longer needed, they must be destroyed:
/* Destroy each of the N streams that were created. */
for(i=0; i<N; i++)
{
cudaStreamDestroy(streams[i]);
}
/* Release the host array that held the stream handles. */
free(streams);