Get rid of busy waiting while running asynchronous CUDA streams

I am looking for a way to get rid of busy waiting in the host thread in the following code (do not copy this code, it only illustrates my problem; it has many basic errors):

cudaStream_t streams[S_N];
for (int i = 0; i < S_N; i++) {
    cudaStreamCreate(&streams[i]);
}
int sid = 0;
for (int d = 0; d < DATA_SIZE; d += DATA_STEP) {
    while (true) {
        if (cudaStreamQuery(streams[sid]) == cudaSuccess) { // BUSY WAITING !!!!
            cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
            kernel<<<gridDim, blockDim, smSize, streams[sid]>>>(d_data, DATA_STEP);
            break;
        }
        sid = (sid + 1) % S_N;
    }
}

Is there a way to idle the host thread and somehow wait for some stream to finish, and then prepare and run another stream?

EDIT: I added while (true) to the code to emphasize the busy waiting. Now I execute work on all the streams and check which of them has finished in order to start another chunk. cudaStreamSynchronize waits for a particular stream to finish, but I want to wait for whichever of the streams finishes its job first.
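For reference, newer CUDA releases (5.0 and later, so after this question was written) provide cudaStreamAddCallback, which makes exactly this "wait for whichever stream finishes first" pattern possible without polling: the driver invokes a host callback when a stream drains, and the host thread sleeps on a condition variable. A minimal sketch, assuming the streams from the code above; the helper names onStreamDone and waitForAnyStream are mine:

#include <cuda_runtime.h>
#include <condition_variable>
#include <mutex>
#include <queue>

// Shared state: indices of streams that have finished their current chunk.
static std::mutex              g_mtx;
static std::condition_variable g_cv;
static std::queue<int>         g_doneStreams;

// Runs on a CUDA driver thread once all preceding work in the stream is done.
// (No CUDA API calls are allowed inside a stream callback.)
static void CUDART_CB onStreamDone(cudaStream_t, cudaError_t, void *userData) {
    {
        std::lock_guard<std::mutex> lock(g_mtx);
        g_doneStreams.push((int)(size_t)userData);  // mark this stream idle
    }
    g_cv.notify_one();                              // wake the host thread
}

// Sleeps (no busy waiting) until ANY stream reports completion; returns its id.
static int waitForAnyStream() {
    std::unique_lock<std::mutex> lock(g_mtx);
    g_cv.wait(lock, [] { return !g_doneStreams.empty(); });
    int sid = g_doneStreams.front();
    g_doneStreams.pop();
    return sid;
}

After each cudaMemcpyAsync/kernel pair you would enqueue cudaStreamAddCallback(streams[sid], onStreamDone, (void *)(size_t)sid, 0); and pick the next idle stream with sid = waitForAnyStream(); instead of spinning on cudaStreamQuery.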

EDIT2: I got rid of the busy waiting in the following way:

cudaStream_t streams[S_N];
for (int i = 0; i < S_N; i++) {
    cudaStreamCreate(&streams[i]);
}
int sid = 0;
for (int d = 0; d < DATA_SIZE; d += DATA_STEP) {
    cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
    kernel<<<gridDim, blockDim, smSize, streams[sid]>>>(d_data, DATA_STEP);
    sid = (sid + 1) % S_N;
}
for (int i = 0; i < S_N; i++) {
    cudaStreamSynchronize(streams[i]);
    cudaStreamDestroy(streams[i]);
}

However, it appears to be a little slower than the version with busy waiting on the host thread. I think that is because I now distribute the jobs to the streams statically, so when one stream finishes its work it sits idle until every other stream finishes too. The previous version dynamically handed the work to whichever stream was idle first, which was more efficient, but the host thread was busy waiting.

+3
5 answers

One idea is to dedicate one host thread to each stream. That host thread would call cudaStreamSynchronize and sleep until its stream's commands are complete. Unfortunately, this is not possible in CUDA 3.2, because it allows only one host thread to work with one CUDA context, which means one host thread per CUDA-enabled GPU.

Hopefully it will become possible in CUDA 4.0: CUDA 4.0 RC news

EDIT: I have tested it in the CUDA 4.0 RC, using OpenMP with one host thread per CUDA stream. It works.
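As a rough illustration of that approach (a sketch, not the poster's actual code): one OpenMP thread per stream, each thread statically taking every S_N-th chunk and sleeping in cudaStreamSynchronize between chunks. The per-stream device buffers d_data[i], the kernel, and the launch parameters grid/block/smSize are assumed to be defined elsewhere:

#include <cuda_runtime.h>
#include <omp.h>

void processOneThreadPerStream(float *h_data, float *d_data[S_N],
                               dim3 grid, dim3 block, size_t smSize) {
    cudaStream_t streams[S_N];
    for (int i = 0; i < S_N; i++)
        cudaStreamCreate(&streams[i]);

    #pragma omp parallel num_threads(S_N)
    {
        const int sid = omp_get_thread_num();       // this thread's stream
        for (int d = sid * DATA_STEP; d < DATA_SIZE; d += S_N * DATA_STEP) {
            cudaMemcpyAsync(d_data[sid], h_data + d, DATA_STEP * sizeof(float),
                            cudaMemcpyHostToDevice, streams[sid]);
            kernel<<<grid, block, smSize, streams[sid]>>>(d_data[sid], DATA_STEP);
            cudaStreamSynchronize(streams[sid]);    // thread sleeps, no spinning
        }
    }

    for (int i = 0; i < S_N; i++)
        cudaStreamDestroy(streams[i]);
}

Sharing one context between host threads like this requires CUDA 4.0 or later, which is exactly the limitation discussed above.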

+3

Use cudaThreadSynchronize to wait for all previous launches to complete, cudaStreamSynchronize to wait for the launches in a particular stream to complete, and cudaEventSynchronize to wait until a particular event in a particular stream has been recorded.

However, you need to understand how streams and synchronization work before you will be able to use them in your code.


What happens if you do not use streams at all? Consider the following code:

kernel <<< gridDim, blockDim >>> (d_data, DATA_STEP);
host_func1();
cudaThreadSynchronize();
host_func2();

The kernel is launched, and the host moves on to execute host_func1 concurrently with the kernel. Then the host and the device are synchronized, i.e. the host waits for the kernel to finish before moving on to host_func2().

Now, what if you launch two different kernels?

kernel1 <<<gridDim, blockDim >>> (d_data + d1, DATA_STEP);
kernel2 <<<gridDim, blockDim >>> (d_data + d2, DATA_STEP);

kernel1 is launched asynchronously! The host moves on, and kernel2 is launched before kernel1 finishes! Yet kernel2 will not execute until kernel1 finishes, because both were launched on stream 0 (the default stream). Consider the following alternative:

kernel1 <<<gridDim, blockDim>>> (d_data + d1, DATA_STEP);
cudaThreadSynchronize();
kernel2 <<<gridDim, blockDim>>> (d_data + d2, DATA_STEP);

There is absolutely no need for the explicit synchronization here, because the device already serializes kernel launches issued to the same stream.

So the real power of streams (besides running multiple kernels at once on devices that support concurrent kernel execution) is that commands in different streams can overlap, most importantly memory copies with kernel execution. Knowing that, consider what happens when you issue everything to the default stream:

for (int d = 0; d < DATA_SIZE; d+=DATA_STEP) {
    cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, 0);
    kernel<<<gridDim, blockDim, smSize, 0>>>(d_data, DATA_STEP);
}

The host does not wait: all the copies and launches are enqueued and it moves on immediately. But the device executes them strictly one after another.

Look at the resulting schedule. Each memcpy has to wait for the previous kernel, and each kernel has to wait for the previous memcpy. The kernel for chunk 1 and the memcpy for chunk 2 could perfectly well run at the same time, but on a single stream they never do! This is where streams come in: if each memcpy/kernel pair goes to its own stream, the copy of the next chunk can overlap the kernel of the previous one. (Note that in your code every stream copies into the same d_data buffer, so for correct results you would also need one device buffer per stream.)

Finally, remember that commands within one stream execute strictly in order, so there is no need to check whether a stream is idle before handing it more work: the new commands simply queue up behind the old ones and start the moment the stream gets to them, while the host runs ahead. So drop the busy waiting entirely, enqueue all the work round-robin over the streams, and let the device do the scheduling. Something like this...

cudaStream_t streams[S_N];
for (int i = 0; i < S_N; i++) {
    cudaStreamCreate(&streams[i]);
}

int sid = 0;
for (int d = 0; d < DATA_SIZE; d += DATA_STEP) {
    cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
    kernel<<<gridDim, blockDim, smSize, streams[sid]>>>(d_data, DATA_STEP);
    sid = (sid + 1) % S_N;
}

This way there is no waiting on the host at all.
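Two practical details matter before this actually overlaps anything (both go beyond the snippet above and are my additions): cudaMemcpyAsync is only asynchronous when the host buffer is page-locked, i.e. allocated with cudaMallocHost, and every stream needs its own device buffer, because in the snippet all streams overwrite the same d_data. A minimal self-contained sketch:

#include <cuda_runtime.h>

#define S_N       4
#define DATA_SIZE (1 << 20)
#define DATA_STEP (DATA_SIZE / 16)

__global__ void kernel(float *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;                         // placeholder work
}

int main() {
    float *h_data;
    cudaMallocHost(&h_data, DATA_SIZE * sizeof(float)); // pinned host memory
    for (int i = 0; i < DATA_SIZE; i++) h_data[i] = (float)i;

    cudaStream_t streams[S_N];
    float *d_data[S_N];                                 // one buffer per stream
    for (int i = 0; i < S_N; i++) {
        cudaStreamCreate(&streams[i]);
        cudaMalloc(&d_data[i], DATA_STEP * sizeof(float));
    }

    int sid = 0;
    for (int d = 0; d < DATA_SIZE; d += DATA_STEP) {
        cudaMemcpyAsync(d_data[sid], h_data + d, DATA_STEP * sizeof(float),
                        cudaMemcpyHostToDevice, streams[sid]);
        kernel<<<(DATA_STEP + 255) / 256, 256, 0, streams[sid]>>>(d_data[sid], DATA_STEP);
        sid = (sid + 1) % S_N;
    }

    for (int i = 0; i < S_N; i++) {
        cudaStreamSynchronize(streams[i]);              // blocking waits, only at the end
        cudaStreamDestroy(streams[i]);
        cudaFree(d_data[i]);
    }
    cudaFreeHost(h_data);
    return 0;
}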

+4

Try cudaEventRecord(event, stream) followed by cudaEventSynchronize(event). See the CUDA Toolkit Reference Manual: http://developer.download.nvidia.com/compute/cuda/3_2/toolkit/docs/CUDA_Toolkit_Reference_Manual.pdf
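A minimal sketch of that suggestion, reusing the streams and loop from the question (including its shared d_data buffer, so it inherits that bug); creating the events with cudaEventBlockingSync makes the wait a real sleep instead of a spin:

cudaEvent_t done[S_N];
for (int i = 0; i < S_N; i++)
    cudaEventCreateWithFlags(&done[i], cudaEventBlockingSync);

int sid = 0;
for (int d = 0; d < DATA_SIZE; d += DATA_STEP) {
    cudaEventSynchronize(done[sid]);   // idle until this stream's previous chunk
                                       // is done; returns at once on the first lap
    cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
    kernel<<<gridDim, blockDim, smSize, streams[sid]>>>(d_data, DATA_STEP);
    cudaEventRecord(done[sid], streams[sid]);  // mark the end of this chunk
    sid = (sid + 1) % S_N;
}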

EDIT: BTW, streams are handy precisely for overlapping kernel execution with memory transfers. Why do you want to serialize the execution by making the host wait until a stream finishes?

+3

Instead of cudaStreamQuery, you want cudaStreamSynchronize:

int sid = 0;
for (int d = 0; d < DATA_SIZE; d += DATA_STEP) {
    cudaStreamSynchronize(streams[sid]);
    cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
    kernel<<<gridDim, blockDim, smSize, streams[sid]>>>(d_data, DATA_STEP);
    sid = (sid + 1) % S_N;
}

(You can also use cudaThreadSynchronize to wait for all launches across every stream, and events with cudaEventSynchronize for more advanced host/device synchronization.)

You can further control the kind of waiting these synchronization functions perform. See the reference manual for the cudaDeviceBlockingSync flag and others. The default, though, is probably what you want.
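For example (a sketch; cudaDeviceScheduleBlockingSync is the current spelling of the flag the older manuals list as cudaDeviceBlockingSync):

#include <cuda_runtime.h>

int main() {
    // Must run before the first call that creates the CUDA context; afterwards
    // cudaStreamSynchronize, cudaEventSynchronize, etc. sleep on an OS
    // primitive instead of spinning on a CPU core.
    cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
    cudaSetDevice(0);
    // ... create streams, enqueue work, synchronize as shown above ...
    return 0;
}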

+1

Copy each chunk of data and execute the kernel on those chunks in two separate loops. That will be more efficient.

Like this:

size = N * sizeof(float) / nStreams;

for (i = 0; i < nStreams; i++) {
    offset = i * N / nStreams;
    cudaMemcpyAsync(a_d + offset, a_h + offset, size, cudaMemcpyHostToDevice, stream[i]);
}

for (i = 0; i < nStreams; i++) {
    offset = i * N / nStreams;
    kernel<<<N / (nThreads * nStreams), nThreads, 0, stream[i]>>>(a_d + offset);
}

This way a memory copy does not have to wait for the kernel of the previous stream to finish, and vice versa.

+1
