Following @talonmies' comment on the CUDA post "Reducing traction using double2 arrays", I compared vector norm calculation using CUDA Thrust when switching between float and float1. I tested an array of N=1000000 elements on a GT210 card (cc 1.2). Calculating the norm takes exactly the same time in both cases, namely about 3.4s, so there is no performance improvement. As you can see from the code below, float is arguably a little more convenient to use than float1.
Finally, note that the advantage of float4 stems from its alignment, guaranteed by the __builtin_align__ annotation, rather than from the __device_builtin__ qualifier.
#include <thrust/device_vector.h>   // forward slashes: backslash include paths are non-portable
#include <thrust/host_vector.h>     // required for thrust::host_vector<float1> below
#include <thrust/transform_reduce.h>
#include <cstdio>
#include <cmath>

// Unary functor: squares a plain float element.
struct square {
    __host__ __device__ float operator()(float x) const { return x * x; }
};

// Unary functor: squares the single .x component of a float1 element.
// (The original `xx * xx` did not compile — the member access is `x.x`.)
struct square1 {
    __host__ __device__ float operator()(float1 x) const { return x.x * x.x; }
};

// Benchmarks the Euclidean-norm computation (sqrt of sum of squares) of an
// N-element vector via thrust::transform_reduce, once with float elements and
// once with float1 elements, timing each with CUDA events.
int main() {  // `void main()` is non-standard C++; main must return int
    const int N = 1000000;
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // --- Case 1: device_vector<float>, all elements initialized to 3.f ---
    thrust::device_vector<float> d_vec(N, 3.f);

    cudaEventRecord(start, 0);
    // sqrtf: keep the computation in single precision (sqrt would promote to double)
    float reduction = sqrtf(thrust::transform_reduce(d_vec.begin(), d_vec.end(),
                                                     square(), 0.0f,
                                                     thrust::plus<float>()));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time reduction: %3.1f ms \n", time);
    printf("Result of reduction = %f\n", reduction);

    // --- Case 2: device_vector<float1>, filled on the host then copied over ---
    thrust::host_vector<float1> h_vec1(N);
    for (int i = 0; i < N; i++) h_vec1[i].x = 3.f;
    thrust::device_vector<float1> d_vec1 = h_vec1;

    cudaEventRecord(start, 0);
    float reduction1 = sqrtf(thrust::transform_reduce(d_vec1.begin(), d_vec1.end(),
                                                      square1(), 0.0f,
                                                      thrust::plus<float>()));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time reduction1: %3.1f ms \n", time);
    printf("Result of reduction1 = %f\n", reduction1);

    // Release the timing events (the original leaked them).
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    getchar();
    return 0;
}
source share