The CUBLAS documentation mentions that we need synchronization before reading the scalar result:
"In addition, some functions that return a scalar result, such as amax (), amin, asum (), rotg (), rotmg (), dot () and nrm2 (), return the result value by reference to the host or device. Please attention that even if these functions are returned immediately, like the results of a matrix and a vector, the scalar result is ready only when the procedure on the GPU is completed. This requires proper synchronization in order to read the result from the host. "
Does this mean that we should always synchronize before reading the scalar result on the host, even if we use only one stream? I searched the NVIDIA CUDA documentation for an example, but could not find one.
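As I read the quoted passage, the required pattern would look something like the following (a minimal sketch of my understanding, assuming the default CUBLAS_POINTER_MODE_HOST; handle, n, d_x and d_y stand for an initialized CUBLAS handle and device vectors that are not shown here):

float result = 0.0f;
// With a host result pointer, the scalar is written back to 'result',
// but it is only guaranteed to be valid once the GPU work completes.
cublasSdot(handle, n, d_x, 1, d_y, 1, &result);
cudaDeviceSynchronize();              // wait for the GPU to finish
printf("dot = %f\n", result);         // only now is 'result' safe to read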
But in the conjugate gradient example provided by NVIDIA, there is the following code:
while (r1 > tol*tol && k <= max_iter)
{
    if (k > 1)
    {
        b = r1 / r0;
        cublasStatus = cublasSscal(cublasHandle, N, &b, d_p, 1);
        cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1);
    }
    else
    {
        cublasStatus = cublasScopy(cublasHandle, N, d_r, 1, d_p, 1);
    }
    cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax);
    cublasStatus = cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, &dot);
    a = r1 / dot;    // the first dot result is read on the host here, with no explicit sync
    cublasStatus = cublasSaxpy(cublasHandle, N, &a, d_p, 1, d_x, 1);
    na = -a;
    cublasStatus = cublasSaxpy(cublasHandle, N, &na, d_Ax, 1, d_r, 1);
    r0 = r1;
    cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);
    cudaThreadSynchronize();    // explicit synchronization after the second dot only
    printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
    k++;
}
There is a call to cudaThreadSynchronize() near the end of the while loop. Is this synchronization there for the cublasSdot calls? But there are two calls to cublasSdot in the loop: why does cudaThreadSynchronize() appear after the second cublasSdot but not after the first?
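If cublasSdot really returned before the result was ready, I would have expected the first dot product to need an explicit synchronization as well, something like this (my sketch, not NVIDIA's code):

cublasStatus = cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, &dot);
cudaDeviceSynchronize();   // make sure 'dot' has been written back to the host
a = r1 / dot;              // only now would it be safe to read 'dot'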
EDIT: To find out what is happening, I used the following code to compare the dot-product result with and without synchronization.
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(int argc, char **argv)
{
    int N = 1024 * 1024 * 512;
    double *x_cpu = (double *)malloc(sizeof(double) * N);
    for (int i = 0; i < N; i++)
    {
        x_cpu[i] = double(rand()) / RAND_MAX;
    }
    double *x_gpu;
    cudaMalloc((void **)&x_gpu, N * sizeof(double));
    cudaMemcpy(x_gpu, x_cpu, N * sizeof(double), cudaMemcpyHostToDevice);
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    // Default pointer mode (CUBLAS_POINTER_MODE_HOST): results are written to host memory.
    int M = 1000;
    std::vector<double> x_dot_vec(M, 0.0);
    double *x_dot_ptr = &(x_dot_vec[0]);
    std::cout << "Begin Launching CUBLAS........" << std::endl;
    for (int j = 0; j < M; j++)
    {
        cublasDdot(cublasHandle, N, x_gpu, 1, x_gpu, 1, x_dot_ptr + j);
    }
    std::cout << "End Launching CUBLAS........." << std::endl;
    // Read the last result before and after an explicit synchronization.
    double old_value = x_dot_vec.back();
    cudaDeviceSynchronize();
    double new_value = x_dot_vec.back();
    std::cout << "Old Value: " << old_value << ", New Value: " << new_value << std::endl;
    cublasDestroy(cublasHandle);
    free(x_cpu);
    cudaFree(x_gpu);
    return 0;
}
In this first test, cublasDdot writes each result directly into host memory (the default pointer mode). I read the last value once before and once after the explicit synchronization, and the two printed values were always identical. So cublasDdot does not appear to return before its result is ready in this case; apparently the synchronization happens inside CUBLAS.
To check whether this is specific to the host pointer mode, I then stored the results in device memory instead. The code is as follows.
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(int argc, char **argv)
{
    int N = 1024 * 1024 * 512;
    double *x_cpu = (double *)malloc(sizeof(double) * N);
    for (int i = 0; i < N; i++)
    {
        x_cpu[i] = double(rand()) / RAND_MAX;
    }
    double *x_gpu;
    cudaMalloc((void **)&x_gpu, N * sizeof(double));
    cudaMemcpy(x_gpu, x_cpu, N * sizeof(double), cudaMemcpyHostToDevice);
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    // Device pointer mode: cublasDdot now writes its result to device memory.
    cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
    int M = 1000;
    std::vector<double> x_dot_vec(M, 0.0);
    double *x_dot_ptr = &(x_dot_vec[0]);
    double *dot_gpu;
    cudaMalloc((void **)&dot_gpu, sizeof(double) * M);
    cudaMemcpy(dot_gpu, x_dot_ptr, M * sizeof(double), cudaMemcpyHostToDevice);
    double old_value, new_value;
    std::cout << "Begin Launching CUBLAS........" << std::endl;
    for (int j = 0; j < M; j++)
    {
        cublasDdot(cublasHandle, N, x_gpu, 1, x_gpu, 1, dot_gpu + j);
    }
    std::cout << "End Launching CUBLAS........." << std::endl;
    // Copy the last result back before and after an explicit synchronization.
    cudaMemcpy(&old_value, dot_gpu + M - 1, sizeof(double), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaMemcpy(&new_value, dot_gpu + M - 1, sizeof(double), cudaMemcpyDeviceToHost);
    std::cout << "Old Value: " << old_value << ", New Value: " << new_value << std::endl;
    cublasDestroy(cublasHandle);
    free(x_cpu);
    cudaFree(x_gpu);
    cudaFree(dot_gpu);
    return 0;
}
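One thing I am not sure about in this second test: cudaMemcpy issued in the default stream is, as far as I understand, ordered after all previously launched default-stream work, so the first copy into old_value may already wait for every cublasDdot to finish, and old_value and new_value would then match for that reason alone. To observe a genuinely unsynchronized read, I would presumably need pinned host memory and an asynchronous copy, roughly like this (my own sketch; h_result, stale and valid are names I made up):

double *h_result;
cudaMallocHost((void **)&h_result, sizeof(double));  // pinned, so the copy can be truly asynchronous
*h_result = 0.0;
// Stream-ordered after the cublasDdot launches, but returns to the host immediately.
cudaMemcpyAsync(h_result, dot_gpu + M - 1, sizeof(double), cudaMemcpyDeviceToHost, 0);
double stale = *h_result;   // may still be 0.0: the copy has not necessarily executed yet
cudaDeviceSynchronize();
double valid = *h_result;   // now guaranteed to hold the last dot product
std::cout << "stale: " << stale << ", valid: " << valid << std::endl;
cudaFreeHost(h_result);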