I am new to OpenMP and am trying to use OpenMP together with SIMD intrinsics to speed up my program, but the result is far from what I expected.
To simplify things without losing essential information, I wrote a simpler toy example:
#include <omp.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <sys/time.h>
#include "immintrin.h"
// Benchmark driver: fills a large int array with pseudo-random values, applies
// a fixed chain of AVX2 integer operations to it inside an OpenMP parallel
// loop, and prints the wall-clock time of that loop in milliseconds followed
// by every computed element.
int main() {
    // Element count. The vector loop consumes 8 ints per iteration and has no
    // scalar tail, so this value must remain a multiple of 8.
    const int64_t size = 160000000;

    std::vector<int> src(size);
    // 64-bit index so the counter has the same type as the bound it is compared to.
    for (int64_t i = 0; i < size; ++i)
        src[i] = (rand() / (float)RAND_MAX) * size;

    std::vector<int> dst(size);
    const int* src_ptr = src.data();
    int* dst_ptr = dst.data();

    // Loop-invariant addend (2 in every 32-bit lane). It is only read inside
    // the parallel region, so it is left shared: an OpenMP `private` clause
    // would hand each thread a fresh, uninitialized copy instead of this value.
    const __m256i vec_op = _mm256_set1_epi32(2);

    omp_set_num_threads(4);

    struct timeval one, two;
    gettimeofday(&one, NULL);
    #pragma omp parallel for
    for (int64_t i = 0; i < size; i += 8) {
        // Declared inside the loop body, so every thread has its own copies
        // without needing any data-sharing clause.
        const __m256i vec_src =
            _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src_ptr + i));
        __m256i vec_dst = _mm256_add_epi32(vec_src, vec_op);
        vec_dst = _mm256_mullo_epi32(vec_dst, vec_src);
        vec_dst = _mm256_slli_epi32(vec_dst, 1);
        vec_dst = _mm256_add_epi32(vec_dst, vec_src);
        vec_dst = _mm256_sub_epi32(vec_dst, vec_src);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst_ptr + i), vec_dst);
    }
    gettimeofday(&two, NULL);

    // Convert both timestamps to seconds, then report the difference in ms.
    const double oneD = one.tv_sec + (double)one.tv_usec * .000001;
    const double twoD = two.tv_sec + (double)two.tv_usec * .000001;
    const double get_time = 1000 * (twoD - oneD);
    std::cout << "took time: " << get_time << std::endl;

    // Dump every result. '\n' is used instead of std::endl so the stream is
    // not flushed once per element; the bytes written are the same.
    for (int64_t i = 0; i < size; ++i)
        std::cout << i << ": " << dst[i] << '\n';
    return 0;
}
It is compiled with icpc -g -std=c++11 -march=core-avx2 -O3 -qopenmp test.cpp -o test, and the elapsed time of the parallel part is measured in milliseconds. The results are as follows (each value is the average of 5 runs):
1 thread: 92.519
2 threads: 89.045
4 threads: 90.361
The computation seems embarrassingly parallel: different threads can simultaneously load the data they need at different indices, and the same holds for writing the results. So why is there no speedup?
Additional Information:
.
EDIT-1:
Thanks to @JerryCoffin for the suggestion; I ran the analysis in VTune. The results:
1-thread: Memory Bound: 6.5%, L1 Bound: 0.134, L3 Latency: 0.039
2-threads: Memory Bound: 18.0%, L1 Bound: 0.115, L3 Latency: 0.015
4-threads: Memory Bound: 21.6%, L1 Bound: 0.213, L3 Latency: 0.003
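The Intel 4770 has a maximum memory bandwidth of 25.6 GB/s (23 GB/s as measured by VTune). The Memory Bound figure does increase with the thread count, but I am not sure whether that is the cause. Any ideas?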
EDIT-2 ( , , ):
@PaulR @bazza. 3 . , 4 8 . :
(1) dst, : 1 thread: 91.922; 2 threads: 93.170; 4 threads: 93.868 --- ;
(2) (1), 100 100 : 1 thread: 9109.49; 2 threads: 4951.20; 4 threads: 2511.01; 8 threads: 2861.75 --- , 8 ;
(3), (2), 100 100 : 1 thread: 9078.02; 2 threads: 4956.66; 4 threads: 2516.93; 8 threads: 2088.88 --- (2), 8 .
, openmp + SIMD, / , , -, , src dst , .
?
EDIT-3:
In (2) and (3) the OpenMP pragma was placed on the outer loop:
// Illustrative fragment ("......" stands for the elided SIMD loop body).
// The pragma is attached to the outer k loop, so it is the 100 repetitions
// that are divided among the threads; each thread runs the complete inner
// i loop for the k values assigned to it.
// NOTE(review): variables listed in `private` start uninitialized in each
// thread, so vec_op does not carry a value assigned before the pragma.
#pragma omp parallel for private(vec_src, vec_op, vec_dst)
for (int k = 0; k < 100; ++k) {
for (int64_t i = 0; i < size; i += 8) {
......
}
}
. , . , (2) (3) .
With the OpenMP pragma moved to the inner loop:
// Illustrative fragment ("......" stands for the elided SIMD loop body).
// The outer k loop runs sequentially; the pragma is attached to the inner
// i loop, so a parallel loop over the array is started once per k iteration
// and the i range is what gets divided among the threads.
for (int k = 0; k < 100; ++k) {
// NOTE(review): variables listed in `private` start uninitialized in each
// thread, so vec_op does not carry a value assigned before the pragma.
#pragma omp parallel for private(vec_src, vec_op, vec_dst)
for (int64_t i = 0; i < size; i += 8) {
......
}
}
: 1 thread: 9074.18; 2 threads: 8809.36; 4 threads: 8936.89.93; 8 threads: 9098.83.
.: (
EDIT-4:
( , ):
// Scalar counterpart of the SIMD kernel: one element per iteration, with the
// same add / multiply / shift / add / subtract sequence applied to src[i] and
// the result stored in dst[i].
for (int64_t i = 0; i < size; i++) { // not i += 8
int query = src[i];
int res = src[i] + 2;
// NOTE(review): for large src values this product does not fit in int;
// signed overflow is undefined behavior in scalar C++ — confirm acceptable.
res = res * query;
res = res << 1;
// Adding and then subtracting query leaves res at its shifted value.
res = res + query;
res = res - query;
dst[i] = res;
}
1 thread: 92.065; 2 threads: 89.432; 4 threads: 88.864. , ( /)? , /?