Increased overhead of OpenMP parallel regions when num_threads changes

I tried to use a different number of threads in different parts of my program to get the best speed-up. However, I found that switching the thread count with the num_threads clause incurs significant overhead. I am looking for an explanation, because in my understanding the thread pool should always hold a fixed number of threads, regardless of how many are actually requested for a given region. I am also looking for possible workarounds. Thanks.

Code example:

#include<cstdio>
#include<omp.h>

// Minimal parallel region used as the unit of work timed by main().
//
// Opens a parallel region requesting `ntd` threads; every thread in the team
// atomically adds its own thread number to a shared accumulator.
//
// ntd: thread count passed to the num_threads clause.
//
// The accumulator `s` is local and never read after the region, and the
// function returns nothing, so a call has no observable result.
void omp_sum(int ntd) {
    int s = 0;  // shared by all threads of the region below
    #pragma omp parallel num_threads(ntd)
    {
        int i = omp_get_thread_num();  // this thread's number within the team
        // atomic: several threads update the same `s` concurrently
        #pragma omp atomic
        s += i;
    }
}   

// Timing harness: runs five loops of N iterations each, with different
// combinations of thread counts passed to omp_sum(), and prints the elapsed
// wall-clock time of each loop in microseconds (omp_get_wtime() differences
// are in seconds, hence the 1e6 factor).
//
// Printed lines, in order:
//   1. one call per iteration, NT1 threads
//   2. one call per iteration, NT2 threads
//   3. two calls per iteration, both NT1 threads
//   4. two calls per iteration, both NT2 threads
//   5. two calls per iteration, alternating NT1 and NT2 threads
int main()
{
    int N = 100;            // iterations per timed loop
    int NT1 = 6, NT2 = 12;  // the two thread counts being compared
    double t;               // start timestamp of the current timed loop

    // Case 1: constant thread count NT1, one region per iteration.
    t = omp_get_wtime();
    for(int n=0;n<N;n++) {
        omp_sum(NT1);
    }
    printf("%lf\n", (omp_get_wtime() - t) * 1e6 );

    // Case 2: constant thread count NT2, one region per iteration.
    t = omp_get_wtime();
    for(int n=0;n<N;n++) {
        omp_sum(NT2);
    }
    printf("%lf\n", (omp_get_wtime() - t) * 1e6 );

    // Case 3: constant thread count NT1, two regions per iteration.
    t = omp_get_wtime();
    for(int n=0;n<N;n++) {
        omp_sum(NT1);
        omp_sum(NT1);
    }
    printf("%lf\n", (omp_get_wtime() - t) * 1e6 );

    // Case 4: constant thread count NT2, two regions per iteration.
    t = omp_get_wtime();
    for(int n=0;n<N;n++) {
        omp_sum(NT2);
        omp_sum(NT2);
    }
    printf("%lf\n", (omp_get_wtime() - t) * 1e6 );

    // Case 5: two regions per iteration, so the requested thread count
    // changes between every pair of consecutive regions (NT1 -> NT2 -> NT1 ...).
    t = omp_get_wtime();
    for(int n=0;n<N;n++) {
        omp_sum(NT1);
        omp_sum(NT2);
    }
    printf("%lf\n", (omp_get_wtime() - t) * 1e6 );
}

Example output (in us):

1034.069001
1058.620000
1034.572000
2210.681000
18234.355000

Test system: 2 Intel E5-2630L CPUs, with 12 cores and 24 hardware threads in total. Software: Fedora 19 with GCC 4.8.2.

+4
1

I can reproduce your results with GCC 4.8 (g++ -O3 -fopenmp foo.cpp) on a four-core / eight-hardware-thread system. I changed NT1 to 4 and NT2 to 8.

The assembly generated for the parallel body of omp_sum is simple:

# Appears to be the compiler-outlined body of the parallel region in omp_sum
# (the symbol _Z7omp_sumi._omp_fn.0 that the loop code hands to GOMP_parallel).
# Preserve %rbx, which is used below to keep the incoming argument across the call.
pushq   %rbx    
# Keep the first argument (%rdi) in %rbx; presumably the pointer to the
# shared-data block supplied by the caller -- TODO confirm.
movq    %rdi, %rbx
# Thread number is returned in %eax.
call    omp_get_thread_num
# Load the pointer stored at the start of the data block (presumably &s).
movq    (%rbx), %rdx
# The `#pragma omp atomic` update: locked add of the thread number to *%rdx.
lock addl   %eax, (%rdx)
popq    %rbx
ret

for(int n=0;n<N;n++) {
    omp_sum(NT1);
    omp_sum(NT2);
}

# Loop body with both omp_sum calls inlined: two back-to-back GOMP_parallel
# calls that differ only in the value loaded into %edx (4, then 8).
# Register roles below assume the libgomp entry point
# GOMP_parallel(fn, data, num_threads, flags) under the System V AMD64
# calling convention -- TODO confirm against the libgomp ABI docs.
.L10
# 2nd argument: address of the data block at 32(%rsp).
leaq    32(%rsp), %rsi
# 4th argument: 0.
xorl    %ecx, %ecx
# 3rd argument: requested thread count, 4 here.
movl    $4, %edx
# 1st argument: the outlined parallel body of omp_sum.
movl    $_Z7omp_sumi._omp_fn.0, %edi
# Presumably the `int s = 0;` initialisation of the inlined omp_sum.
movl    $0, 28(%rsp)
# Store %rbx into the data block; %rbx is set outside this excerpt,
# presumably to the address of s (28(%rsp)) -- cannot be confirmed from here.
movq    %rbx, 32(%rsp)
call    GOMP_parallel
# Second region: identical setup except for the thread count.
leaq    32(%rsp), %rsi
xorl    %ecx, %ecx
# 3rd argument: requested thread count, 8 here.
movl    $8, %edx
movl    $_Z7omp_sumi._omp_fn.0, %edi
movl    $0, 28(%rsp)
movq    %rbx, 32(%rsp)
call    GOMP_parallel
# Count-down loop counter in %ebp; repeat until it reaches zero.
subl    $1, %ebp
jne .L10

for(int n=0;n<N;n++) {
    omp_sum(NT2);
    omp_sum(NT2);
}

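The assembly for this loop is nearly identical to the listing above; the only change is movl $4, %edx instead of movl $8, %edx. So the generated code does not explain the difference, and it is hard to see from it what causes the problem. All the work happens inside GOMP_parallel. One would have to read the source of GOMP_parallel, but my guess is that GOMP_parallel checks how many threads the previous parallel region used, and if the new region requests a different number, it pays some overhead to switch. That overhead is much larger than the cost of your simple function.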

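However, I am not sure this would ever be a problem in practice. It does not make sense to use such short parallel sections (one would normally parallelize a loop, and N would be much larger), so the overhead should not matter.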

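Edit: Section 2.4.1 of the OpenMP 3.1 specification, "Determining the Number of Threads for a parallel Region", gives the algorithm for determining the number of threads. The source code of GOMP_parallel in GCC 4.8 shows that the first function it calls is gomp_resolve_num_threads.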

+2

Source: https://habr.com/ru/post/1546196/


All Articles