cudaThreadSynchronize , cudaStreamSynchronize, cudaEventSynchronize, .
, , .
, ? :
kernel <<< gridDim, blockDim >>> (d_data, DATA_STEP);
host_func1();
cudaThreadSynchronize();
host_func2();
, host_func1 . , , host_func2().
, , ?
kernel1 <<<gridDim, blockDim >>> (d_data + d1, DATA_STEP);
kernel2 <<<gridDim, blockDim >>> (d_data + d2, DATA_STEP);
kernel1 ! , kernel2 1! kernel2 1, 0 ( ). :
kernel1 <<<gridDim, blockDim>>> (d_data + d1, DATA_STEP);
cudaThreadSynchronize();
kernel2 <<<gridDim, blockDim>>> (d_data + d2, DATA_STEP);
, , .
, , , , ... , ( ). , , . :
for (int d = 0; d < DATA_SIZE; d+=DATA_STEP) {
cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, 0);
kernel<<<gridDim, blockDim, smSize, 0>>>(d_data, DATA_STEP);
}
, . .
. memcpy . 1 2, ! . , memcpy/kernel, . , , , .
, , , 1, , . , . , , , . - ...
cudaStream_t steams[S_N];
for (int i = 0; i < S_N; i++) {
cudaStreamCreate(streams[i]);
}
int sid = 0;
for (int d = 0; d < DATA_SIZE; d+=DATA_STEP) {
cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
kernel<<<gridDim, blockDim, smSize streams[sid]>>>(d_data, DATA_STEP);
sid = ++sid % S_N;
}
.