Negative acceleration when multithreading my program

On my laptop with an Intel Pentium T2370 dual-core processor (Acer Extensa), I ran a simple multithreading speedup test. I am using Linux. The code is pasted below. Although I expected a speedup of 2-3 times, I was surprised to see a slowdown by a factor of 2 instead. I tried the same with the gcc optimization levels -O0 ... -O3, but got the same result every time. I am using pthreads. I also tried with only two threads (instead of the 3 threads in the code), but the performance was similar.

What could be the reason? The faster version took a lot of time - about 20 seconds - so it seems that this is not a problem of launch overhead.

NOTE: This code is erroneous in several ways (indeed, it does not make much sense, since the outputs of the serial and parallel versions will differ). The goal was simply to get a speedup comparison for the same number of instructions.

#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <pthread.h>

// Minimal object-oriented wrapper around a single POSIX thread.
// Derive from it, override run(), then call start() followed by wait().
class Thread{
    private:
            pthread_t thread;

            // Start routine handed to pthread_create(). `d` is the Thread
            // object passed by start(); the call dispatches to the derived
            // class's run(). pthread start routines must return a void*, and
            // flowing off the end of a non-void function is undefined
            // behaviour, so an explicit NULL is returned.
            static void *thread_func(void *d)
            {
                    static_cast<Thread *>(d)->run();
                    return NULL;
            }
    public:
            Thread(){}
            virtual ~Thread(){}

            // Work executed on the new thread; the default does nothing.
            virtual void run(){}

            // Creates the thread with default attributes.
            // Returns the pthread_create() result (0 on success).
            int start(){return pthread_create(&thread, NULL, Thread::thread_func, (void*)this);}

            // Blocks until the thread finishes, discarding its return value.
            // Returns the pthread_join() result (0 on success).
            int wait(){return pthread_join(thread, NULL);}
};


#include <iostream>

// Number of int elements in the shared benchmark array.
const int ARR_SIZE = 100000000;
// Number of times each loop nest in main() sweeps over its index range.
const int N = 20;
// Array shared by the threaded and the serial run. It has static storage
// duration, so it starts out zero-initialized.
int arr[ARR_SIZE];

int main(void)
{

    class Thread_a:public Thread{
            public:
                    Thread_a(int* a): arr_(a) {}
                    void run()
                    {
                            for(int n = 0; n<N; n++)
                            for(int i=0; i<ARR_SIZE/3; i++){ arr_[i] += arr_[i-1];}
                    }
            private:
                    int* arr_;
    };
    class Thread_b:public Thread{
            public:
                    Thread_b(int* a): arr_(a) {}
                    void run()
                    {
                            for(int n = 0; n<N; n++)
                            for(int i=ARR_SIZE/3; i<2*ARR_SIZE/3; i++){ arr_[i] += arr_[i-1];}
                    }
            private:
                    int* arr_;
    };

    class Thread_c:public Thread{
            public:
                    Thread_c(int* a): arr_(a) {}
                    void run()
                    {
                            for(int n = 0; n<N; n++)
                            for(int i=2*ARR_SIZE/3; i<ARR_SIZE; i++){ arr_[i] += arr_[i-1];}
                    }
            private:
                    int* arr_;
    };

    {
            Thread *a=new Thread_a(arr);
            Thread *b=new Thread_b(arr);
            Thread *c=new Thread_c(arr);

            clock_t start = clock();

            if (a->start() != 0) {
                    return 1;
            }

            if (b->start() != 0) {
                    return 1;
            }
            if (c->start() != 0) {
                    return 1;
            }

            if (a->wait() != 0) {
                    return 1;
            }

            if (b->wait() != 0) {
                    return 1;
            }

            if (c->wait() != 0) {
                    return 1;
            }

            clock_t end = clock();
            double duration = (double)(end - start) / CLOCKS_PER_SEC;

            std::cout << duration << "seconds\n";
            delete a;
            delete b;

    }
    {
            clock_t start = clock();
            for(int n = 0; n<N; n++)
            for(int i=0; i<ARR_SIZE; i++){ arr[i] += arr[i-1];}
            clock_t end = clock();
            double duration = (double)(end - start) / CLOCKS_PER_SEC;

            std::cout << "serial: " << duration << "seconds\n";
    }

    return 0;
  }

See also: What can slow down a program when using more threads?

+3
source share
8 answers

The time you report is measured using the clock function:

The clock() function returns an approximation of the processor time used by the program.

$ time bin/amit_kumar_threads.cpp
6.62seconds
serial: 2.7seconds

real    0m5.247s
user    0m9.025s
sys 0m0.304s

Real time will be less for multiprocessor tasks, but CPU time will usually be longer.

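When you use multiple threads, the work can be done by more than one processor, but the amount of work stays the same, and on top of that there may be some overhead, such as contention for limited resources. clock() measures the total processor time, which is the work + any contention overhead. So it should never be less than the processor time needed to do the work in a single thread.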

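It is a little hard to tell from the question whether you knew this and were surprised that the value returned by clock() was twice that of the single-threaded run rather than only slightly more, or whether you expected it to be less.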

Using clock_gettime() instead (you will need the realtime library librt, i.e. g++ -lrt etc.) gives:

$ time bin/amit_kumar_threads.cpp
2.524 seconds
serial: 2.761 seconds

real    0m5.326s
user    0m9.057s
sys 0m0.344s

That is still less of a speedup than one might hope for, but at least the numbers make some sense.

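100000000 * 20 / 2.5 s = 800 million iterations per second, and the bus frequency is 1600 MHz, so I suspect that, with a read and a write for each iteration (assuming some caching), you are limited by memory bandwidth, as tstenner suggests, and the clock() value shows that most of the time some of your processors are waiting for data. (Does anyone know whether clock() time includes such stalls?)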

+16

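The only thing your threads do is add up some elements, so your application should be IO-bound. When you add an extra thread, you have 2 CPUs sharing the memory bus, so it will not go any faster; instead, you will get cache misses etc.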

+6

, - .

, , () . , , , , . , , ( ) ( ), . , - , , .

+4

As an aside, I think you have a bug. Consider this loop:

for(int i=0; i<ARR_SIZE; i++){ arr[i] += arr[i-1];}

When i is zero, it executes

arr[0] += arr[-1];
+1
+1

(TM), . , :

  • , .

. , , ...

.

, . , , , , .

0

, . , , , .

0

tstenner .

, , " " . 800 ; , . " " , .

( 800 , - , , - 800 * 7 ). , .

3 , . , L1 . (-, , L1, L1, , ). . , , , , , .

, , . .

0

Source: https://habr.com/ru/post/1705349/


All Articles