Float1 vs float in CUDA

I noticed that CUDA has a float1 structure. Is there any performance advantage over a plain float, for example when using a float1 array instead of a float array?

 // Definition as quoted in the question: float1 wraps exactly one float, accessed as .x
 struct __device_builtin__ float1 { float x; }; 

float4 does have an advantage in some cases, since its alignment is 4 x 4 bytes = 16 bytes. Is float1 only meant for special use, e.g. in __device__ functions that take float1 parameters?

Thanks in advance.

+6
source share
1 answer

Following @talonmies' comment on the post "CUDA Thrust reduction with double2 arrays", I compared the calculation of a vector norm using CUDA Thrust, switching between float and float1. I used an array of N=1000000 elements on a GT210 card (cc 1.2). Calculating the norm takes exactly the same time in both cases, namely about 3.4s, so there is no performance improvement. As the code below shows, float is, if anything, a little more convenient to use than float1.

Finally, note that the advantage of float4 stems from its alignment specifier, __builtin_align__(16), rather than from __device_builtin__.

 #include <cmath>
 #include <cstdio>
 #include <cstdlib>

 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 #include <thrust/host_vector.h>
 #include <thrust/transform_reduce.h>

 // Abort with file/line context when a CUDA runtime call reports an error,
 // so a failed event call cannot silently produce a bogus timing.
 #define CUDA_CHECK(call)                                                    \
     do {                                                                    \
         cudaError_t err_ = (call);                                          \
         if (err_ != cudaSuccess) {                                          \
             fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                     cudaGetErrorString(err_));                              \
             std::abort();                                                   \
         }                                                                   \
     } while (0)

 // Unary functor returning the square of a plain float.
 struct square
 {
     __host__ __device__ float operator()(float x) const { return x * x; }
 };

 // Unary functor returning the square of the single component of a float1.
 struct square1
 {
     // The value lives in the .x member; float1 has no arithmetic operators.
     __host__ __device__ float operator()(float1 x) const { return x.x * x.x; }
 };

 // Times the same sum-of-squares reduction (followed by a host-side sqrt)
 // over a float array and over a float1 array of equal length, printing the
 // elapsed time and the result for each.
 int main()
 {
     const int N = 1000000;

     float time = 0.0f;
     cudaEvent_t start, stop;
     CUDA_CHECK(cudaEventCreate(&start));
     CUDA_CHECK(cudaEventCreate(&stop));

     // --- float version ---
     thrust::device_vector<float> d_vec(N, 3.f);

     CUDA_CHECK(cudaEventRecord(start, 0));
     float reduction = std::sqrt(thrust::transform_reduce(
         d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
     CUDA_CHECK(cudaEventRecord(stop, 0));
     CUDA_CHECK(cudaEventSynchronize(stop));
     CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));
     printf("Elapsed time reduction: %3.1f ms \n", time);
     printf("Result of reduction = %f\n", reduction);

     // --- float1 version ---
     // Filled on the host element by element (through the .x member) and
     // then copied to the device before timing starts.
     thrust::host_vector<float1> h_vec1(N);
     for (int i = 0; i < N; i++) h_vec1[i].x = 3.f;
     thrust::device_vector<float1> d_vec1 = h_vec1;

     CUDA_CHECK(cudaEventRecord(start, 0));
     float reduction1 = std::sqrt(thrust::transform_reduce(
         d_vec1.begin(), d_vec1.end(), square1(), 0.0f, thrust::plus<float>()));
     CUDA_CHECK(cudaEventRecord(stop, 0));
     CUDA_CHECK(cudaEventSynchronize(stop));
     CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));
     printf("Elapsed time reduction1: %3.1f ms \n", time);
     printf("Result of reduction1 = %f\n", reduction1);

     CUDA_CHECK(cudaEventDestroy(start));
     CUDA_CHECK(cudaEventDestroy(stop));

     // Wait for a key press before exiting, as in the original program.
     getchar();
     return 0;
 }
+1
source

Source: https://habr.com/ru/post/970752/


All Articles