No speedup from using OpenMP + SIMD

I am new to Openmp and am now trying to use the Openmp + SIMD features to speed up my program, but the result is far from expected.

To simplify things without losing essential information, I wrote an example of a simpler toy:

#include <omp.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <sys/time.h>

#include "immintrin.h" // for SIMD intrinsics

int main() {
    // 160M ints (~610 MiB per buffer): far larger than any cache level,
    // so the kernel below is limited by memory bandwidth, not computation.
    int64_t size = 160000000;
    std::vector<int> src(size);

    // generating random src data in [0, size)
    for (int64_t i = 0; i < size; ++i)
        src[i] = (rand() / (float)RAND_MAX) * size;

    // to store the final results, so size is the same as src
    std::vector<int> dst(size);

    // get pointers for vector load and store
    int * src_ptr = src.data();
    int * dst_ptr = dst.data();

    // Loop-invariant operand. It must be firstprivate (copied, initialized)
    // in the parallel region below; a plain `private` copy is UNINITIALIZED
    // per the OpenMP data-sharing rules, which was a bug in the original.
    __m256i vec_op = _mm256_set1_epi32(2);

    omp_set_num_threads(4); // you can change thread count here

    // only measure the parallel part
    struct timeval one, two;
    double get_time;
    gettimeofday (&one, NULL);

    // BUG FIX: original used private(vec_src, vec_op, vec_dst), which gave
    // every thread an uninitialized vec_op. The per-iteration temporaries
    // are now declared inside the loop body instead.
    #pragma omp parallel for firstprivate(vec_op)
    for (int64_t i = 0; i < size; i += 8) {
        // load needed data
        __m256i vec_src = _mm256_loadu_si256((__m256i const *)(src_ptr + i));

        // computation part: ((src + 2) * src) * 2 + src - src
        __m256i vec_dst = _mm256_add_epi32(vec_src, vec_op);
        vec_dst = _mm256_mullo_epi32(vec_dst, vec_src);
        vec_dst = _mm256_slli_epi32(vec_dst, 1);
        vec_dst = _mm256_add_epi32(vec_dst, vec_src);
        vec_dst = _mm256_sub_epi32(vec_dst, vec_src);

        // store results
        _mm256_storeu_si256((__m256i *)(dst_ptr + i), vec_dst);
    }

    gettimeofday(&two, NULL);
    double oneD = one.tv_sec + (double)one.tv_usec * .000001;
    double twoD = two.tv_sec + (double)two.tv_usec * .000001;
    get_time = 1000 * (twoD - oneD);          // elapsed milliseconds
    std::cout << "took time: " << get_time << std::endl;

    // output ONE random element so the computation is not optimized out.
    // BUG FIX: the original shadowed `i` with a second loop variable and
    // printed all 160M elements instead of the single sampled one.
    int64_t i = (int64_t)((rand() / (float)RAND_MAX) * size);
    std::cout << i << ": " << dst[i] << std::endl;

    return 0;
}

It compiles with icpc -g -std=c++11 -march=core-avx2 -O3 -qopenmp test.cpp -o test and measures the elapsed time of the parallel part. The result is as follows (the average value is selected from 5 runs each):

1 thread: 92.519

2 threads: 89.045

4 threads: 90.361

The calculations seem awkwardly parallel, since different threads can simultaneously load their necessary data, taking into account different indices, and the case is similar for writing results, but why are there no accelerations?

Additional Information:

  • I checked the generated code with icpc -g -std=c++11 -march=core-avx2 -O3 -qopenmp -S test.cpp and found the generated vectorized instructions;

  • I also tried other data sizes and repeated each measurement (about 60 runs); the times stayed roughly the same when going from 1 -> 2 -> 4 threads.

.

EDIT-1:

As suggested by @JerryCoffin, I profiled the program with VTune. The results are:

1-thread: Memory Bound: 6.5%, L1 Bound: 0.134, L3 Latency: 0.039

2-threads: Memory Bound: 18.0%, L1 Bound: 0.115, L3 Latency: 0.015

4-threads: Memory Bound: 21.6%, L1 Bound: 0.213, L3 Latency: 0.003

The Intel i7-4770 has a peak memory bandwidth of 25.6 GB/s (VTune measured about 23 GB/s). So it does look memory bound, but the Memory Bound fraction reported above does not seem that high. This confuses me. Why is there still no speedup?

EDIT-2 (sorry for the long post, but I think the details are necessary):

Thanks to @PaulR and @bazza for their suggestions. I ran 3 more experiments; this time the numbers for 4 and 8 threads are also listed. The results:

(1) with the store to dst removed, i.e. only loads and computation: 1 thread: 91.922; 2 threads: 93.170; 4 threads: 93.868 --- still no speedup;

(2) same as (1), but with the whole loop repeated 100 times: 1 thread: 9109.49; 2 threads: 4951.20; 4 threads: 2511.01; 8 threads: 2861.75 --- good scaling up to 4 threads, but not at 8;

(3) same as (2), but with the stores back in, repeated 100 times: 1 thread: 9078.02; 2 threads: 4956.66; 4 threads: 2516.93; 8 threads: 2088.88 --- scales like (2), and even at 8 threads.

So it looks like, with OpenMP + SIMD, the load/store part is what fails to scale — but I do not understand why, since each thread reads and writes its own disjoint chunks of src and dst. Perhaps it is because the memory bus is shared.

Any ideas?

EDIT-3:

Note: in experiments (2) and (3) the OpenMP directive was placed outside the repetition loop:

// (Incorrect) placement used in experiments (2) and (3): the `parallel for`
// binds to the OUTER repetition loop, so the 100 values of k are divided
// among the threads and each thread executes the whole inner i-loop itself.
#pragma omp parallel for private(vec_src, vec_op, vec_dst)
for (int k = 0; k < 100; ++k) {
    for (int64_t i = 0; i < size; i += 8) {
        ......
    }
}

This is wrong: with the directive outside, the 100 repetitions are split among the threads and each thread runs the complete inner loop over the whole array, so the apparent scaling in (2) and (3) does not measure what I intended.

However, when I move the OpenMP directive inside the repetition loop:

// Corrected placement: the `parallel for` now binds to the inner loop, so
// every repetition k distributes the i-iterations across the threads
// (this also pays the fork/join overhead once per value of k).
for (int k = 0; k < 100; ++k) {
    #pragma omp parallel for private(vec_src, vec_op, vec_dst)
    for (int64_t i = 0; i < size; i += 8) {
        ......
    }
}

The results are: 1 thread: 9074.18; 2 threads: 8809.36; 4 threads: 8936.89; 8 threads: 9098.83.

So still no speedup. :(

EDIT-4:

I also tried a plain scalar version (letting the compiler auto-vectorize it):

// Plain scalar version of the same kernel, leaving vectorization to the
// compiler. NOTE: the last two statements (+ query, - query) cancel each
// other out, so an optimizing compiler may elide them.
#pragma omp parallel for
for (int64_t i = 0; i < size; i++) { // not i += 8
    int query = src[i];
    int res = src[i] + 2;
    res = res * query;
    res = res << 1;
    res = res + query;
    res = res - query;
    dst[i] = res;
}

The results: 1 thread: 92.065; 2 threads: 89.432; 4 threads: 88.864. Again no speedup. So, is the bottleneck really the memory (load/store) bandwidth? And if so, is there any way to make better use of it?

+4
1

, ( /)? , /?

Yes, the bottleneck here is memory bandwidth, not computation. The kernel does very little arithmetic per byte transferred, so once the memory bus is saturated, adding more threads cannot make it any faster.

First an important note: icpc (16.0.3 and 17.0.1) applies a surprising "optimization" when size is made constexpr. It eliminates the redundant pair of operations:

res = res + query;
res = res - query;

since they cancel each other out. Moreover, in that case the compiler emits vmovntdq instead of vmovdqu for the store — a non-temporal (streaming) store that bypasses the cache hierarchy. Normally the CPU performs a read-for-ownership of the destination cache line before writing it; the streaming store avoids that extra read traffic. So the loop is no longer quite the SIMD kernel you wrote: it is a load, a few cheap ALU operations, and a streaming store. On my i7-4770 this makes a measurable difference: with 2 threads, ~85.8 ms drops to 58.0 ms, about 1.5x. So be careful when interpreting the timings.

By the way, 58 ms for 2 * 160000000 * 4 bytes corresponds to about 22.07 GB/s (counting both reads and writes), which is consistent with the VTune measurement. (Without the non-temporal store, the 85.8 ms figure implies even more memory traffic because of the read-for-ownership on the destination.) The loop is clearly running at the memory bandwidth limit.

To put the compute side in perspective: the chip's peak throughput is on the order of 217.6 GFLOP/s (well, integer ops in this case), while this loop only needs roughly 3.2 G int-ops/s. The ALUs are essentially idle, waiting for data. That is why adding threads — which adds compute capacity, not memory bandwidth — does not help.

Now about experiments (2) and (3). With the pragma on the outer loop the iterations are not distributed the way you think, and repeated passes may partly be served from cache. To measure correctly, keep one parallel region open across the repetitions and use omp_get_wtime:

  // Timing harness from the answer: a single parallel region stays open
  // across all 100 repetitions, so thread start-up cost is paid only once,
  // and omp_get_wtime() is used instead of gettimeofday.
  double one, two;
#pragma omp parallel 
  {
    // Declared inside the parallel region, so each thread automatically
    // gets its own (properly initialized) copies — no private() clause.
    __m256i vec_src;
    __m256i vec_op = _mm256_set1_epi32(2);   
    __m256i vec_dst;

#pragma omp master
    one = omp_get_wtime();  // start timestamp, taken by one thread only
#pragma omp barrier
    // 100 passes; `omp for` splits each pass of i across the threads.
    for (int kk = 0; kk < 100; kk++)
#pragma omp for
    for (int64_t i = 0; i < size; i += 8) {
        ...
    }
#pragma omp master
    {
      // The implicit barrier at the end of the last `omp for` has already
      // synchronized the threads, so `two` covers all of the work.
      two = omp_get_wtime();
      std::cout << "took time: " << (two-one) * 1000 << std::endl;
    }
  }

Bottom line: your computation is memory bound. If you want it to run faster, focus on reducing memory traffic and improving data locality, not on adding threads or SIMD width.

Edit: a note on the VTune numbers — I would be careful about reading too much into the "Memory Bound" percentages. They classify pipeline slots, so the reported fractions do not translate directly into bandwidth utilization.

+3

Source: https://habr.com/ru/post/1672698/


All Articles