As stated in the comments, I don't think this is possible with thrust::copy. However, we can use cudaMemcpyAsync in a thrust application to achieve the goal of asynchronous copies and to overlap the copies with computation.
Here is an example:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>

// DSIZE determines duration of H2D and D2H transfers
#define DSIZE (1048576*8)
// SSIZE,LSIZE determine duration of kernel launched by thrust
#define SSIZE (1024*512)
#define LSIZE 1
// KSIZE determines size of thrust kernels (number of threads per block)
#define KSIZE 64
#define TV1 1
#define TV2 2

typedef int mytype;
// host vectors use a pinned allocator so cudaMemcpyAsync can be truly asynchronous
typedef thrust::host_vector<mytype, thrust::cuda::experimental::pinned_allocator<mytype> > pinnedVector;

struct sum_functor
{
  mytype *dptr;
  sum_functor(mytype* _dptr) : dptr(_dptr) {};
  __host__ __device__ void operator()(mytype &data) const
  {
    mytype result = data;
    for (int j = 0; j < LSIZE; j++)
      for (int i = 0; i < SSIZE; i++)
        result += dptr[i];
    data = result;
  }
};

int main(){

  pinnedVector hi1(DSIZE);
  pinnedVector hi2(DSIZE);
  pinnedVector ho1(DSIZE);
  pinnedVector ho2(DSIZE);
  thrust::device_vector<mytype> di1(DSIZE);
  thrust::device_vector<mytype> di2(DSIZE);
  thrust::device_vector<mytype> do1(DSIZE);
  thrust::device_vector<mytype> do2(DSIZE);
  thrust::device_vector<mytype> dc1(KSIZE);
  thrust::device_vector<mytype> dc2(KSIZE);

  thrust::fill(hi1.begin(), hi1.end(), TV1);
  thrust::fill(hi2.begin(), hi2.end(), TV2);
  thrust::sequence(do1.begin(), do1.end());
  thrust::sequence(do2.begin(), do2.end());

  cudaStream_t s1, s2;
  cudaStreamCreate(&s1); cudaStreamCreate(&s2);

  // issue H2D copies, one per stream
  cudaMemcpyAsync(thrust::raw_pointer_cast(di1.data()), thrust::raw_pointer_cast(hi1.data()), di1.size()*sizeof(mytype), cudaMemcpyHostToDevice, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(di2.data()), thrust::raw_pointer_cast(hi2.data()), di2.size()*sizeof(mytype), cudaMemcpyHostToDevice, s2);

  // launch the thrust kernels, one per stream
  thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
  thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));

  // issue D2H copies, one per stream, then wait for everything to finish
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho1.data()), thrust::raw_pointer_cast(do1.data()), do1.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho2.data()), thrust::raw_pointer_cast(do2.data()), do2.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s2);
  cudaDeviceSynchronize();

  // validate results
  for (int i = 0; i < KSIZE; i++){
    if (ho1[i] != ((LSIZE*SSIZE*TV1) + i)) { std::cout << "mismatch on stream 1 at " << i << " was: " << ho1[i] << " should be: " << ((LSIZE*SSIZE*TV1)+i) << std::endl; return 1;}
    if (ho2[i] != ((LSIZE*SSIZE*TV2) + i)) { std::cout << "mismatch on stream 2 at " << i << " was: " << ho2[i] << " should be: " << ((LSIZE*SSIZE*TV2)+i) << std::endl; return 1;}
  }
  std::cout << "Success!" << std::endl;
  return 0;
}
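For brevity the example does not check the return codes of the CUDA runtime calls. In real code you would normally check them; a minimal sketch of one way to do that is below (the checkCuda helper is my own naming, not something provided by CUDA or thrust):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// abort with a message if a CUDA runtime call did not succeed
inline void checkCuda(cudaError_t err, const char *what)
{
  if (err != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// usage around the asynchronous calls in the example, e.g.:
// checkCuda(cudaMemcpyAsync(..., cudaMemcpyHostToDevice, s1), "H2D copy on s1");
// checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");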
For my test case I used RHEL 5.5, a Quadro 5000, and CUDA 6.5RC. This example is designed to create very small kernels (only a single threadblock, as long as KSIZE is kept small, say 32 or 64), so that the kernels thrust creates from thrust::for_each are able to run concurrently.
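If the profiler is not available, a rough way to check the kernel overlap from inside the program is to bracket each thrust::for_each with CUDA events recorded on its own stream. This is my own addition, not part of the original example, and is meant to be spliced into main() around the two thrust::for_each calls:

cudaEvent_t start1, stop1, start2, stop2;
cudaEventCreate(&start1); cudaEventCreate(&stop1);
cudaEventCreate(&start2); cudaEventCreate(&stop2);

// bracket each thrust kernel with events recorded on its own stream
cudaEventRecord(start1, s1);
thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
cudaEventRecord(stop1, s1);

cudaEventRecord(start2, s2);
thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));
cudaEventRecord(stop2, s2);

// ... issue the D2H copies and call cudaDeviceSynchronize() as before, then:

float k1 = 0, k2 = 0, span = 0;
cudaEventElapsedTime(&k1, start1, stop1);    // kernel time on s1
cudaEventElapsedTime(&k2, start2, stop2);    // kernel time on s2
cudaEventElapsedTime(&span, start1, stop2);  // first kernel start to last kernel finish
// with concurrent kernels, span should be noticeably less than k1 + k2
std::cout << "k1: " << k1 << " ms, k2: " << k2 << " ms, span: " << span << " ms" << std::endl;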
When I profile this code, I see:

This indicates that we are achieving proper overlap, both between the two thrust kernels and between the copy operations and the thrust kernels, as well as asynchronous copying of the data back to the host at the completion of the kernels. Note that the cudaDeviceSynchronize() operation "fills" the timeline, indicating that all of the asynchronous operations (data copies, thrust functions) were issued asynchronously and control returned to the host thread before any of those operations were underway. All of this is expected, correct behavior for full concurrency between the host, the GPU, and the data copy operations.
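As a side note, the "control returns to the host before the work is done" behavior can also be observed without a profiler by timing the issue call separately from the synchronization. The small standalone program below is my own illustration, not part of the example above (it uses std::chrono, so it needs C++11 support); the same wrapping can be put around the asynchronous section of the example:

#include <cuda_runtime.h>
#include <chrono>
#include <iostream>

int main()
{
  const size_t N = 1048576 * 8;
  int *h = 0, *d = 0;
  cudaHostAlloc((void**)&h, N * sizeof(int), cudaHostAllocDefault);  // pinned host buffer
  cudaMalloc((void**)&d, N * sizeof(int));
  cudaStream_t s;
  cudaStreamCreate(&s);

  auto t0 = std::chrono::high_resolution_clock::now();
  cudaMemcpyAsync(d, h, N * sizeof(int), cudaMemcpyHostToDevice, s);  // returns immediately
  auto t1 = std::chrono::high_resolution_clock::now();
  cudaDeviceSynchronize();                                            // blocks until the copy finishes
  auto t2 = std::chrono::high_resolution_clock::now();

  std::cout << "issue: " << std::chrono::duration<double, std::milli>(t1 - t0).count() << " ms, "
            << "wait: "  << std::chrono::duration<double, std::milli>(t2 - t1).count() << " ms" << std::endl;

  cudaStreamDestroy(s);
  cudaFree(d);
  cudaFreeHost(h);
  return 0;
}

The issue time should be very small compared to the wait time, which covers essentially all of the copy work.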