TBB acts weird in Matlab Mex file

Edit: My question is different from < Matlab restricts TBB, but not OpenMP >; it is not a duplicate, although the same code sample is used for illustration. In my case, I pass an explicit number of threads to the TBB initialization instead of using "deferred". I am also asking about the different behavior of TBB in a plain C++ program versus TBB in a MEX file. The answer to that question only demonstrates thread initialization when TBB runs in C++, not in MEX.


I am trying to parallelize a MATLAB MEX file to improve performance. The strange thing I run into when using TBB inside a MEX file is that the TBB initialization does not work as expected.

This C++ program uses 100% of the CPU and runs 15 TBB threads when executed:

main.cpp

#include "tbb/parallel_for_each.h"
#include "tbb/task_scheduler_init.h"
#include <iostream>
#include <vector>
#include "mex.h"

// A deliberately slow unit of work used to keep the TBB workers busy.
// Each task stores its index `_n` and prints it to std::cerr when finished.
struct mytask {
  // n: index of this task; echoed when the task completes.
  mytask(size_t n)
    :_n(n)
  {}

  // Spins through an empty counting loop, then writes "[<n>]" to std::cerr.
  void operator()() {
    // The counter has to be `long long`: on LLP64 targets such as 64-bit
    // Windows a `long` is only 32 bits wide, so a `long` counter could never
    // reach the 10^10 bound and incrementing it would eventually overflow
    // (undefined behaviour).
    for (long long i=0;i<10000000000LL;++i) {}  // Deliberately run slow
    std::cerr << "[" << _n << "]";
  }
  size_t _n;
};

// Adapter functor: receives an element by reference and calls it with no
// arguments, so a range of nullary callables can be handed to an algorithm
// that passes each element to a unary function object.
template <typename T>
struct invoker {
  void operator()(T& callable) const
  {
    callable();
  }
};

void mexFunction(/* int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[] */) {

  tbb::task_scheduler_init init(15);  // 15 threads

  std::vector<mytask> tasks;
  for (int i=0;i<10000;++i)
    tasks.push_back(mytask(i));

  tbb::parallel_for_each(tasks.begin(),tasks.end(),invoker<mytask>());

}

// Stand-alone driver: runs the workload once as an ordinary executable.
int main()
{
    mexFunction();
    return 0;
}

Then I changed the code slightly to build it as a MEX file for MATLAB:

BuildMEX.mexw64

#include "tbb/parallel_for_each.h"
#include "tbb/task_scheduler_init.h"
#include <iostream>
#include <vector>
#include "mex.h"

// A deliberately slow unit of work used to keep the TBB workers busy.
// Each task stores its index `_n` and prints it to std::cerr when finished.
struct mytask {
  // n: index of this task; echoed when the task completes.
  mytask(size_t n)
    :_n(n)
  {}

  // Spins through an empty counting loop, then writes "[<n>]" to std::cerr.
  void operator()() {
    // The counter has to be `long long`: this file is built as a .mexw64
    // (64-bit Windows), where `long` is only 32 bits wide, so a `long`
    // counter could never reach the 10^10 bound and incrementing it would
    // eventually overflow (undefined behaviour).
    for (long long i=0;i<10000000000LL;++i) {}  // Deliberately run slow
    std::cerr << "[" << _n << "]";
  }
  size_t _n;
};

// Adapter functor: receives an element by reference and calls it with no
// arguments, so a range of nullary callables can be handed to an algorithm
// that passes each element to a unary function object.
template <typename T>
struct invoker {
  void operator()(T& callable) const
  {
    callable();
  }
};


void mexFunction( int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[] ) {

  tbb::task_scheduler_init init(15);  // 15 threads

  std::vector<mytask> tasks;
  for (int i=0;i<10000;++i)
    tasks.push_back(mytask(i));

  tbb::parallel_for_each(tasks.begin(),tasks.end(),invoker<mytask>());

}

Finally, I call BuildMEX.mexw64 from MATLAB. I compiled the following code snippet with mcc into the binary “MEXtest.exe” and used VTune to profile it (it runs in the MCR). Inside that process TBB initialized only 4 threads, and the binary used only ~50% of the CPU. Why does running as a MEX file reduce TBB's thread count and the overall performance? How can I make the MEX file use more of the CPU?

MEXtest.exe

function MEXtest()
% MEXTEST  Run the BuildMEX MEX function once (no inputs, no outputs).
    BuildMEX;
end
+4
2

:

TBB . , concurrency . TBB, , .

initialize(), :

__ , task_scheduler_inits. task_scheduler_inits. , .

( , )

, MATLAB Intel TBB , , MEX- . , , MATLAB, , .

MATLAB ( ), , :

>> maxNumCompThreads
Warning: maxNumCompThreads will be removed in a future release [...]
ans =
     4

With OpenMP, on the other hand, the number of threads can be set from within the code, for example:

#include <omp.h>
.. 
omp_set_dynamic(1);
omp_set_num_threads(omp_get_num_procs());

or through an environment variable:

>> setenv('OMP_NUM_THREADS', '8')

, , :

test_tbb.cpp

#ifdef MATLAB_MEX_FILE
#include "mex.h"
#endif

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>

#define WIN32_LEAN_AND_MEAN
#include <windows.h>

#include "tbb/task_scheduler_init.h"
#include "tbb/parallel_for_each.h"
#include "tbb/spin_mutex.h"

#include "tbb_helpers.hxx"

#define NTASKS 100
#define NLOOPS 400000L

tbb::spin_mutex print_mutex;

// One unit of CPU-bound work for the thread-count experiment.
// While it runs it holds a ConcurrencyProfiler, so the driver can report how
// many tasks were executing at the same time.
struct mytask {
    // n: index of this task; only used to seed the computation below.
    mytask(size_t n) :_n(n) {}

    // Burns CPU for NLOOPS iterations and prints the final value to stderr.
    void operator()()
    {
        // track maximum number of parallel workers run
        ConcurrencyProfiler prof;

        // burn some CPU cycles!
        // Guard against _n == 0: 1.0 / 0 is +inf and sin(inf) is NaN, which
        // would turn the whole computation for that task into NaN. Every
        // other task keeps the seed 1.0 / _n.
        double x = (_n != 0) ? 1.0 / _n : 1.0;
        for (long i=0; i<NLOOPS; ++i) {
            x = std::sin(x) * 10.0;
            // extra busy-wait: spin until rand() lands in the top 10%
            while(static_cast<double>(rand()) / RAND_MAX < 0.9);
        }
        {
            // hold the lock while printing so output lines are not interleaved
            tbb::spin_mutex::scoped_lock s(print_mutex);
            fprintf(stderr, "%f\n", x);
        }
    }
    size_t _n;
};

// Adapter functor: receives an element by reference and calls it with no
// arguments, so a range of nullary callables can be handed to an algorithm
// that passes each element to a unary function object.
template <typename T>
struct invoker {
    void operator()(T& callable) const
    {
        callable();
    }
};

void run()
{
    // use all 8 logical cores
    SetProcessAffinityMask(GetCurrentProcess(), 0xFF);

    printf("numTasks = %d\n", NTASKS);
    for (int t = tbb::task_scheduler_init::automatic;
         t <= 512; t = (t>0) ? t*2 : 1)
    {
        tbb::task_scheduler_init init(t);

        std::vector<mytask> tasks;
        for (int i=0; i<NTASKS; ++i) {
            tasks.push_back(mytask(i));
        }

        ConcurrencyProfiler::Reset();
        tbb::parallel_for_each(tasks.begin(), tasks.end(), invoker<mytask>());

        printf("pool_init(%d) -> %d worker threads\n", t,
            ConcurrencyProfiler::GetMaxNumThreads());
    }
}

// Entry point: an ordinary main() for the stand-alone build, or the MEX
// gateway when MATLAB_MEX_FILE is defined. Either way it just calls run().
#ifndef MATLAB_MEX_FILE
int main()
{
    run();
    return 0;
}
#else
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[])
{
    // none of the MEX arguments are used
    run();
}
#endif

, concurrency, , . Intel VTune :

tbb_helpers.hxx

#ifndef HELPERS_H
#define HELPERS_H

#include "tbb/atomic.h"

// Scope guard that tracks how many instances are alive at the same time.
// Constructing an instance increments a shared counter and updates the
// recorded peak; destroying it decrements the counter again.
class ConcurrencyProfiler
{
public:
    ConcurrencyProfiler();
    ~ConcurrencyProfiler();
    // Zeroes both the live count and the recorded peak.
    static void Reset();
    // Returns the peak recorded since the last Reset().
    static size_t GetMaxNumThreads();
private:
    // Not copyable: a copy would be destroyed (decrementing cur_count)
    // without ever having been counted by the constructor, corrupting the
    // unsigned counter. Declared but intentionally left undefined.
    ConcurrencyProfiler(const ConcurrencyProfiler&);
    ConcurrencyProfiler& operator=(const ConcurrencyProfiler&);

    // Raises max_count to cur_count if the latter is larger.
    static void RecordMax();
    static tbb::atomic<size_t> cur_count;  // instances currently alive
    static tbb::atomic<size_t> max_count;  // highest cur_count observed
};

#endif

tbb_helpers.cxx

#include "tbb_helpers.hxx"

tbb::atomic<size_t> ConcurrencyProfiler::cur_count;
tbb::atomic<size_t> ConcurrencyProfiler::max_count;

// Counts this instance as live, then folds the new count into the peak.
ConcurrencyProfiler::ConcurrencyProfiler()
{
    cur_count += 1;
    RecordMax();
}

// Removes this instance from the live count; the peak is left untouched.
ConcurrencyProfiler::~ConcurrencyProfiler()
{
    cur_count -= 1;
}

// Clears the recorded peak and the live count back to zero.
void ConcurrencyProfiler::Reset()
{
    max_count = 0;
    cur_count = 0;
}

// Returns the current value of the recorded peak as a plain size_t.
size_t ConcurrencyProfiler::GetMaxNumThreads()
{
    const size_t peak = max_count;
    return peak;
}

// Performs: max_count = max(max_count,cur_count)
// http://www.threadingbuildingblocks.org/
//    docs/help/tbb_userguide/Design_Patterns/Compare_and_Swap_Loop.htm
//
// cur_count is read exactly once per attempt into a local snapshot, and that
// same snapshot is used for both the comparison and the swap. Reading the
// atomic twice would let another thread decrement it in between, so the swap
// could store a value smaller than the maximum already recorded.
void ConcurrencyProfiler::RecordMax()
{
    size_t o;
    size_t c;
    do {
        o = max_count;
        c = cur_count;
        if (o >= c) break;      // recorded peak is already high enough
    } while(max_count.compare_and_swap(c,o) != o);  // retry if max_count moved
}

Here is how I compiled the program (using Intel C++ Composer XE 2013 SP1 together with VS2012 Update 4):

C:\> vcvarsall.bat amd64
C:\> iclvars.bat intel64 vs2012
C:\> icl /MD test_tbb.cpp tbb_helpers.cxx tbb.lib

(Windows 8.1). 100% , :

C:\> test_tbb.exe 2> nul
numTasks = 100
pool_init(-1) -> 8 worker threads          // task_scheduler_init::automatic
pool_init(1) -> 1 worker threads
pool_init(2) -> 2 worker threads
pool_init(4) -> 4 worker threads
pool_init(8) -> 8 worker threads
pool_init(16) -> 16 worker threads
pool_init(32) -> 32 worker threads
pool_init(64) -> 64 worker threads
pool_init(128) -> 98 worker threads
pool_init(256) -> 100 worker threads
pool_init(512) -> 98 worker threads

, , , ( 512 100 !).

The MEX file is built with:

>> mex -I"C:\Program Files (x86)\Intel\Composer XE\tbb\include" ...
   -largeArrayDims test_tbb.cpp tbb_helpers.cxx ...
   -L"C:\Program Files (x86)\Intel\Composer XE\tbb\lib\intel64\vc11" tbb.lib

And here is the output when the MEX function is run inside MATLAB:

>> test_tbb()
numTasks = 100
pool_init(-1) -> 4 worker threads
pool_init(1) -> 4 worker threads
pool_init(2) -> 4 worker threads
pool_init(4) -> 4 worker threads
pool_init(8) -> 4 worker threads
pool_init(16) -> 4 worker threads
pool_init(32) -> 4 worker threads
pool_init(64) -> 4 worker threads
pool_init(128) -> 4 worker threads
pool_init(256) -> 4 worker threads
pool_init(512) -> 4 worker threads

, , , 4 (4 - ). , .

, 8 , 4 , 50%.

, , :)

+1

, 4 , MATLAB, , . , MATLAB, , MATLAB, MATLAB Compiler. , MEX MATLAB. reset , TBB, , TBB , .

TBB 3.0 4, , :

, , TBB , , , , , , , voilà, TBB , ! , TBB 3.0 Update 4. TBBs tbb::task_scheduler_init::default_num_threads() tbb::tbb_thread::hardware_concurrency() , , , .

, tbb::default_num_threads :

TBB 3.0 U4 CPU . Windows, Linux FreeBSD , , .

tbb::task_scheduler_init::initialize , .

, affinity, Windows.NET:

numCoresInSystem = 16;
proc = System.Diagnostics.Process.GetCurrentProcess();
dec2bin(proc.ProcessorAffinity.ToInt32,numCoresInSystem)

, ( ) .

MATLAB C, Q & A, MATLAB (Windows 7). MATLAB:

proc = System.Diagnostics.Process.GetCurrentProcess();
proc.ProcessorAffinity = System.IntPtr(int32(2^numCoresInSystem-1));
proc.Refresh()

Alternatively, use the Windows API from within mexFunction, before creating the task_scheduler_init:

SetProcessAffinityMask(GetCurrentProcess(),(1 << N) - 1)

On *nix, use taskset:

system(sprintf('taskset -p %d %d',2^N - 1,feature('getpid')))
+1

Source: https://habr.com/ru/post/1544936/


All Articles