As stated in the comments, I don't think this is possible with thrust::copy. However, we can use cudaMemcpyAsync in a thrust application to achieve the goal of asynchronous copies and to overlap the copies with computation.
Here is an example:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>

// DSIZE determines duration of H2D and D2H transfers
#define DSIZE (1048576*8)
// SSIZE,LSIZE determine duration of kernel launched by thrust
#define SSIZE (1024*512)
#define LSIZE 1
// KSIZE determines size of thrust kernels (number of threads per block)
#define KSIZE 64
#define TV1 1
#define TV2 2

typedef int mytype;
// host vectors use a pinned allocator so cudaMemcpyAsync can be truly asynchronous
typedef thrust::host_vector<mytype, thrust::cuda::experimental::pinned_allocator<mytype> > pinnedVector;

struct sum_functor
{
  mytype *dptr;
  sum_functor(mytype* _dptr) : dptr(_dptr) {};
  __host__ __device__ void operator()(mytype &data) const
  {
    mytype result = data;
    for (int j = 0; j < LSIZE; j++)
      for (int i = 0; i < SSIZE; i++)
        result += dptr[i];
    data = result;
  }
};

int main(){

  pinnedVector hi1(DSIZE);
  pinnedVector hi2(DSIZE);
  pinnedVector ho1(DSIZE);
  pinnedVector ho2(DSIZE);
  thrust::device_vector<mytype> di1(DSIZE);
  thrust::device_vector<mytype> di2(DSIZE);
  thrust::device_vector<mytype> do1(DSIZE);
  thrust::device_vector<mytype> do2(DSIZE);
  thrust::device_vector<mytype> dc1(KSIZE);
  thrust::device_vector<mytype> dc2(KSIZE);

  thrust::fill(hi1.begin(), hi1.end(), TV1);
  thrust::fill(hi2.begin(), hi2.end(), TV2);
  thrust::sequence(do1.begin(), do1.end());
  thrust::sequence(do2.begin(), do2.end());

  cudaStream_t s1, s2;
  cudaStreamCreate(&s1); cudaStreamCreate(&s2);

  // issue H2D copies, one per stream
  cudaMemcpyAsync(thrust::raw_pointer_cast(di1.data()), thrust::raw_pointer_cast(hi1.data()), di1.size()*sizeof(mytype), cudaMemcpyHostToDevice, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(di2.data()), thrust::raw_pointer_cast(hi2.data()), di2.size()*sizeof(mytype), cudaMemcpyHostToDevice, s2);

  // launch the thrust kernels, one per stream
  thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
  thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));

  // issue D2H copies, one per stream, then wait for everything to finish
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho1.data()), thrust::raw_pointer_cast(do1.data()), do1.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho2.data()), thrust::raw_pointer_cast(do2.data()), do2.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s2);
  cudaDeviceSynchronize();

  // validate results
  for (int i = 0; i < KSIZE; i++){
    if (ho1[i] != ((LSIZE*SSIZE*TV1) + i)) { std::cout << "mismatch on stream 1 at " << i << " was: " << ho1[i] << " should be: " << ((LSIZE*SSIZE*TV1)+i) << std::endl; return 1;}
    if (ho2[i] != ((LSIZE*SSIZE*TV2) + i)) { std::cout << "mismatch on stream 2 at " << i << " was: " << ho2[i] << " should be: " << ((LSIZE*SSIZE*TV2)+i) << std::endl; return 1;}
  }
  std::cout << "Success!" << std::endl;
  return 0;
}
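For brevity the example does not check the return codes of the CUDA runtime calls. In real code you would normally check them; a minimal sketch of one way to do that is below (the checkCuda helper is my own naming, not something provided by CUDA or thrust):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// abort with a message if a CUDA runtime call did not succeed
inline void checkCuda(cudaError_t err, const char *what)
{
  if (err != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// usage around the asynchronous calls in the example, e.g.:
// checkCuda(cudaMemcpyAsync(..., cudaMemcpyHostToDevice, s1), "H2D copy on s1");
// checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");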
For my test case I used RHEL 5.5, a Quadro 5000, and CUDA 6.5RC. This example is designed to create very small kernels (only a single threadblock, as long as KSIZE is kept small, say 32 or 64), so that the kernels thrust creates from thrust::for_each are able to run concurrently.
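If the profiler is not available, a rough way to check the kernel overlap from inside the program is to bracket each thrust::for_each with CUDA events recorded on its own stream. This is my own addition, not part of the original example, and is meant to be spliced into main() around the two thrust::for_each calls:

cudaEvent_t start1, stop1, start2, stop2;
cudaEventCreate(&start1); cudaEventCreate(&stop1);
cudaEventCreate(&start2); cudaEventCreate(&stop2);

// bracket each thrust kernel with events recorded on its own stream
cudaEventRecord(start1, s1);
thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
cudaEventRecord(stop1, s1);

cudaEventRecord(start2, s2);
thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));
cudaEventRecord(stop2, s2);

// ... issue the D2H copies and call cudaDeviceSynchronize() as before, then:

float k1 = 0, k2 = 0, span = 0;
cudaEventElapsedTime(&k1, start1, stop1);    // kernel time on s1
cudaEventElapsedTime(&k2, start2, stop2);    // kernel time on s2
cudaEventElapsedTime(&span, start1, stop2);  // first kernel start to last kernel finish
// with concurrent kernels, span should be noticeably less than k1 + k2
std::cout << "k1: " << k1 << " ms, k2: " << k2 << " ms, span: " << span << " ms" << std::endl;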
When I profile this code, I see:

This indicates that we are achieving proper overlap, both between the two thrust kernels and between the copy operations and the thrust kernels, as well as asynchronous copying of the data back to the host at the completion of the kernels. Note that the cudaDeviceSynchronize() operation "fills" the timeline, indicating that all of the asynchronous operations (data copies, thrust functions) were issued asynchronously and control returned to the host thread before any of those operations were underway. All of this is expected, correct behavior for full concurrency between the host, the GPU, and the data copy operations.
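As a side note, the "control returns to the host before the work is done" behavior can also be observed without a profiler by timing the issue call separately from the synchronization. The small standalone program below is my own illustration, not part of the example above (it uses std::chrono, so it needs C++11 support); the same wrapping can be put around the asynchronous section of the example:

#include <cuda_runtime.h>
#include <chrono>
#include <iostream>

int main()
{
  const size_t N = 1048576 * 8;
  int *h = 0, *d = 0;
  cudaHostAlloc((void**)&h, N * sizeof(int), cudaHostAllocDefault);  // pinned host buffer
  cudaMalloc((void**)&d, N * sizeof(int));
  cudaStream_t s;
  cudaStreamCreate(&s);

  auto t0 = std::chrono::high_resolution_clock::now();
  cudaMemcpyAsync(d, h, N * sizeof(int), cudaMemcpyHostToDevice, s);  // returns immediately
  auto t1 = std::chrono::high_resolution_clock::now();
  cudaDeviceSynchronize();                                            // blocks until the copy finishes
  auto t2 = std::chrono::high_resolution_clock::now();

  std::cout << "issue: " << std::chrono::duration<double, std::milli>(t1 - t0).count() << " ms, "
            << "wait: "  << std::chrono::duration<double, std::milli>(t2 - t1).count() << " ms" << std::endl;

  cudaStreamDestroy(s);
  cudaFree(d);
  cudaFreeHost(h);
  return 0;
}

The issue time should be very small compared to the wait time, which covers essentially all of the copy work.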