I want to perform an array reduction (for example, a sum) using OpenMP and SIMD. I read that a reduction in OpenMP is equivalent to:
/// Sums the first N elements of `a` using a hand-written OpenMP reduction.
///
/// Every thread of the parallel region accumulates its share of the array in
/// a private partial sum, then adds that partial sum to the shared total with
/// an atomic update. This mirrors what `reduction(+:sum)` does.
///
/// @param a  Pointer to at least N floats.
/// @param N  Number of elements to sum; 0 yields 0.0f.
/// @return   The sum of a[0..N). With several threads the order in which the
///           partial sums are combined is unspecified, so the result may
///           differ from a sequential sum by floating-point rounding.
inline float sum_scalar_openmp2(const float a[], const size_t N) {
    float sum = 0.0f;
    // Signed loop bound: older OpenMP implementations only accept a signed
    // loop index, and this avoids comparing a signed index with size_t.
    const long long count = static_cast<long long>(N);

    #pragma omp parallel
    {
        float sum_private = 0.0f;

        // Work-sharing `for`, NOT `parallel for`: a `parallel for` here would
        // open a nested parallel region, so every thread of the enclosing
        // region would sum the whole array and the total would be multiplied
        // by the thread count. `nowait` is valid on `for` (not on
        // `parallel for`) and is safe because each thread only touches its
        // own sum_private afterwards.
        #pragma omp for nowait
        for (long long i = 0; i < count; i++) {
            sum_private += a[i];
        }

        // One atomic update per thread to merge the partial sums.
        #pragma omp atomic
        sum += sum_private;
    }
    return sum;
}
I got this idea from the following link: http://bisqwit.iki.fi/story/howto/openmp/#ReductionClause But atomic also does not support complex operators. What I did was replace atomic with critical and implement the reduction using OpenMP and SSE as follows:
#define ROUND_DOWN(x, s) ((x) & ~((s)-1)) inline float sum_vector4_openmp(const float a[], const size_t N) { __m128 sum4 = _mm_set1_ps(0.0f);
However, this function does not perform as well as I hoped. I am using Visual Studio 2012 Express. I know that I can improve performance a bit by unrolling the SSE load/add several times, but this is still less than what I expect.
I get much better performance by splitting the array into as many slices as there are threads and summing each slice in its own thread:
/// Sums the first N floats of `a` using 4-wide SSE additions.
///
/// The largest multiple-of-4 prefix is accumulated four lanes at a time; the
/// remaining (at most three) elements are added with scalar code. Defined
/// ahead of sum_slice so that it is declared before it is called.
///
/// @param a  Pointer to at least N floats. No particular alignment is
///           required because the data is read with unaligned loads.
/// @param N  Number of elements to sum; 0 yields 0.0f.
/// @return   The sum of a[0..N), combined as (lane0 + lane1) + (lane2 + lane3)
///           followed by the scalar tail.
inline float sum_vector4(const float a[], const size_t N) {
    // Largest multiple of 4 that is <= N (4 floats per SSE register).
    const size_t simd_end = N & ~static_cast<size_t>(3);

    __m128 sum4 = _mm_setzero_ps();
    size_t i = 0;
    for (; i < simd_end; i += 4) {
        // Unaligned load: costs nothing extra on 16-byte aligned data on
        // current CPUs and does not fault when `a` is not aligned.
        sum4 = _mm_add_ps(sum4, _mm_loadu_ps(a + i));
    }

    // Horizontal sum with SSE1 shuffles only (no SSE3 hadd needed), keeping
    // the (s0 + s1) + (s2 + s3) order of two back-to-back horizontal adds.
    const __m128 swapped = _mm_shuffle_ps(sum4, sum4, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 pairs   = _mm_add_ps(sum4, swapped);        // [s0+s1, s1+s0, s2+s3, s3+s2]
    const __m128 upper   = _mm_movehl_ps(pairs, pairs);      // [s2+s3, ...]
    float sum = _mm_cvtss_f32(_mm_add_ss(pairs, upper));

    // Scalar tail for the last N % 4 elements.
    for (; i < N; ++i) {
        sum += a[i];
    }
    return sum;
}

/// Sums the first N floats of `a` by splitting the array into one contiguous
/// slice per thread and summing each slice with sum_vector4.
///
/// The array is cut into 4 equally sized slices whose length is a multiple of
/// the SIMD width, one per OpenMP thread. The per-slice results are then
/// combined sequentially, and the elements that did not fit into a slice are
/// added last.
///
/// @param a  Pointer to at least N floats.
/// @param N  Number of elements to sum; 0 yields 0.0f.
/// @return   The sum of a[0..N).
inline float sum_slice(const float a[], const size_t N) {
    const int nthreads = 4;

    // Slice length: N / nthreads rounded down to a multiple of the SIMD width
    // (4 floats), so every slice starts a whole number of SSE vectors after
    // the start of `a`. size_t keeps the arithmetic safe for large arrays.
    const size_t offset = (N / nthreads) & ~static_cast<size_t>(3);

    float suma[nthreads] = {0};

    // The loop index stays a signed int because older OpenMP implementations
    // only accept a signed loop index; each iteration writes its own slot.
    #pragma omp parallel for num_threads(nthreads)
    for (int i = 0; i < nthreads; i++) {
        suma[i] = sum_vector4(a + static_cast<size_t>(i) * offset, offset);
    }

    // Combine the partial sums in a fixed order so the result is deterministic.
    float sum = 0.0f;
    for (int i = 0; i < nthreads; i++) {
        sum += suma[i];
    }

    // Elements past the last full slice.
    for (size_t i = static_cast<size_t>(nthreads) * offset; i < N; i++) {
        sum += a[i];
    }
    return sum;
}
Does anyone know if there is a better way to do reductions with more complex operators in OpenMP?
user2088790
source share