Effective handling of tuples as fixed size vectors

In Chapel, homogeneous tuples can be used as if they were small "vectors" (for example, a = b + c * 3.0 + 5.0;).

However, since various mathematical functions are not provided for tuples, I tried writing a norm() function in several ways and compared their performance. My code looks something like this:

// Euclidean (L2) norm of a 3-component real tuple, with the three squared
// terms written out explicitly instead of being accumulated in a loop.
proc norm_3tuple( x: 3*real ): real
{
    const (c1, c2, c3) = x;                  // unpack the three components
    const sumOfSquares = c1**2 + c2**2 + c3**2;
    return sqrt( sumOfSquares );
}

// Euclidean (L2) norm of a tuple, summing the squared components in a loop
// whose index is an ordinary run-time variable (1-based, up to x.size).
proc norm_loop( x ): real
{
    var sumSq = 0.0;
    for idx in 1 .. x.size
    {
        const component = x[idx];
        sumSq += component**2;
    }
    return sqrt( sumSq );
}

// Euclidean (L2) norm of a tuple, summing the squared components in a loop
// whose index is declared `param`, i.e. a compile-time constant in each
// iteration.
proc norm_loop_param( x ): real
{
    var sumSq = 0.0;
    for param idx in 1 .. x.size
    {
        sumSq += x(idx)**2;
    }
    return sqrt( sumSq );
}

// Euclidean (L2) norm of a tuple: square every component with the
// element-wise tuple operator, add the squares with a `+ reduce`
// expression, and take the square root.
proc norm_reduce( x ): real
{
    const squares = x**2;
    const sumSq   = + reduce squares;
    return sqrt( sumSq );
}

//.........................................................

// Sample vector; its first component is overwritten on every benchmark pass.
var a = ( 1.0, 2.0, 3.0 );

// Sanity check: each variant prints its norm of the same tuple, one per line.
writeln( norm_3tuple( a ) );
writeln( norm_loop( a ) );
writeln( norm_loop_param( a ) );
writeln( norm_reduce( a ) );

// Number of benchmark passes; a `config const`, so it can be overridden
// when the program is launched.
config const nloops = 100000000;  // 1E+8

// Running total of the norms, printed at the end.
var res = 0.0;
for rep in 1 .. nloops
{
    // Vary the first component so the input differs between passes.
    const lead = (rep % 5): real;
    a( 1 ) = lead;

    // Exactly one variant is active per build; swap the comment markers
    // to benchmark another one.
    res += norm_3tuple( a );
 // res += norm_loop( a );
 // res += norm_loop_param( a );
 // res += norm_reduce( a );
}

writeln( "result = ", res );

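Compiled with chpl --fast test.chpl (Chapel v1.16 on OSX 10.11, 4 cores), norm_3tuple(), norm_loop() and norm_loop_param() ran at almost the same speed (0.45 s), while norm_reduce() was much slower (about 30 s). According to top, norm_reduce() was using all 4 cores, whereas the other functions used only 1. So my questions are...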

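  • Is norm_reduce() slow because reduce works in parallel, and the overhead of parallel execution is much greater than the actual computational cost for such a small tuple?
  • Given that we want to avoid reduce for 3-tuples, is there another way to handle them efficiently? For example, are for-loops automatically unrolled for 3-tuples (e.g., with --fast)?
  • In norm_loop_param() I attached the param keyword to the loop variable, but this gave little or no speedup. Is it generally better to attach param when looping over fixed-size tuples (to get more performance)?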

, / . !

+4
2

norm_reduce() , reduce , , ?

, , , . , Chapel - , parallelism, ( ), , , ( , ... , ). , , .

, reduce 3- , . , for -loops 3- (, --fast)?

Chapel norm_loop() ( , , --savec), , back-end . for-loop norm_loop_param(). , , , . , Back-end C , , - , , 3- .

norm_loop_param() param , . , param ( )?

, , , C-.

+2

ex-post: .


?
Benchmark! ... , ,

chapel, . Chapel HPC .

[PARALLEL] , , -, "".

norm_reduce() concurrency -enabled reduce x**2 + - . 2 CLK-, ?

Amdahl .


- :

+++++++++++++++++++++++++++++++++++++++++++++++ <TiO.IDE>.RUN
                                        3.74166
[SEQ]       norm_loop():    0.0 [us] -- 3.74166
[SEQ] norm_loop_param():    0.0 [us] -- 3.74166
[PAR]:    norm_reduce(): 5677.0 [us] -- 3.74166

                                        3.74166
[SEQ]       norm_loop():    0.0 [us] -- 3.74166
[SEQ] norm_loop_param():    1.0 [us] -- 3.74166
[PAR]:    norm_reduce(): 5818.0 [us] -- 3.74166

                                        3.74166
[SEQ]       norm_loop():    1.0 [us] -- 3.74166
[SEQ] norm_loop_param():    2.0 [us] -- 3.74166
[PAR]:    norm_reduce(): 4886.0 [us] -- 3.74166

, , --fast:

+++++++++++++++++++++++++++++++++++++++++++++++ <TiO.IDE>.+CompilerFLAG( "--fast" ).RUN
                                        3.74166
[SEQ]       norm_loop():    1.0 [us] -- 3.74166
[SEQ] norm_loop_param():    2.0 [us] -- 3.74166
[PAR]:    norm_reduce(): 7769.0 [us] -- 3.74166

                                        3.74166
[SEQ]       norm_loop():    0.0 [us] -- 3.74166
[SEQ] norm_loop_param():    0.0 [us] -- 3.74166
[PAR]:    norm_reduce(): 9109.0 [us] -- 3.74166

                                        3.74166
[SEQ]       norm_loop():    1.0 [us] -- 3.74166
[SEQ] norm_loop_param():    1.0 [us] -- 3.74166
[PAR]:    norm_reduce(): 8807.0 [us] -- 3.74166

, SuperComputing2017 HPC [ ] , .

- Try-it-Online , / Chapel-, ( , TiO.IDE).

// Timing support for the instrumented variants below: the Time module is
// brought into scope and two module-level stopwatches are declared, one
// named for the sequential ([SEQ]) measurements and one for the reduce
// ([PAR]) measurement.
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ use Time;
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ var aStopWATCH_SEQ: Timer;
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ var aStopWATCH_PAR: Timer;

// Euclidean (L2) norm of a 3-component real tuple, with the three squared
// terms written out explicitly. This variant carries no timing code.
proc norm_3tuple( x: 3*real ): real
{
    const (c1, c2, c3) = x;                  // unpack the three components
    const sumOfSquares = c1**2 + c2**2 + c3**2;
    return sqrt( sumOfSquares );
}

// Euclidean (L2) norm of a tuple via a run-time-indexed loop, instrumented
// for timing.
//
// The accumulation loop is bracketed by the module-level stopwatch
// aStopWATCH_SEQ. After the loop, the label and the elapsed time in
// microseconds are printed with write() (no newline), so the caller's own
// output continues on the same line. Returns the norm.
proc norm_loop( x ): real
{
    // Timer.elapsed() is cumulative over every start()/stop() pair since the
    // last clear(). Reset first so the figure printed below covers this call
    // only, not earlier calls or other users of the shared stopwatch.
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_SEQ.clear();
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_SEQ.start();
    var tmp = 0.0;
    for i in 1 .. x.size do
        tmp += x[i]**2;
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_SEQ.stop();
    write( "[SEQ]       norm_loop(): ",
           aStopWATCH_SEQ.elapsed( Time.TimeUnits.microseconds ), " [us] -- " );
    return sqrt( tmp );
}

// Euclidean (L2) norm of a tuple via a `param`-indexed loop, instrumented
// for timing.
//
// The accumulation loop is bracketed by the module-level stopwatch
// aStopWATCH_SEQ. After the loop, the label and the elapsed time in
// microseconds are printed with write() (no newline), so the caller's own
// output continues on the same line. Returns the norm.
proc norm_loop_param( x ): real
{
    // aStopWATCH_SEQ is shared with norm_loop(), and Timer.elapsed() is
    // cumulative over every start()/stop() pair since the last clear().
    // Without this reset the figure printed below would also include time
    // already recorded on the stopwatch by earlier measurements.
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_SEQ.clear();
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_SEQ.start();
    var tmp = 0.0;
    for param i in 1 .. x.size do
        tmp += x[i]**2;
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_SEQ.stop();
    write( "[SEQ] norm_loop_param(): ",
           aStopWATCH_SEQ.elapsed( Time.TimeUnits.microseconds ), " [us] -- " );
    return sqrt( tmp );
}

// Euclidean (L2) norm of a tuple via `+ reduce` over the element-wise
// squares, instrumented for timing.
//
// The reduce expression is bracketed by the module-level stopwatch
// aStopWATCH_PAR. Afterwards, the label and the elapsed time in
// microseconds are printed with write() (no newline), so the caller's own
// output continues on the same line. Returns the norm.
proc norm_reduce( x ): real
{
    // Timer.elapsed() is cumulative over every start()/stop() pair since the
    // last clear(). Reset first so repeated calls each report their own
    // duration rather than a running total.
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_PAR.clear();
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_PAR.start();
    var tmp = ( + reduce x**2 );
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_PAR.stop();
    write( "[PAR]:    norm_reduce(): ",
           aStopWATCH_PAR.elapsed( Time.TimeUnits.microseconds ), " [us] -- " );
    return sqrt( tmp );
}

//.........................................................

// Sample vector shared by the checks below.
var a = ( 1.0, 2.0, 3.0 );

// Consistency check: evaluate each variant in turn and print its result on
// its own line. Each call is completed before its value is printed, and the
// calls are made in the same order as the printed lines.
const viaExplicit  = norm_3tuple( a );
writeln( viaExplicit );
const viaLoop      = norm_loop( a );
writeln( viaLoop );
const viaParamLoop = norm_loop_param( a );
writeln( viaParamLoop );
const viaReduce    = norm_reduce( a );
writeln( viaReduce );

:

 [LOOP] norm_3tuple():       45829.0 [us] -- result = 4.30918e+06 @   1000000 loops.
 [LOOP] norm_3tuple():      241680   [us] -- result = 4.30918e+07 @  10000000 loops.
 [LOOP] norm_3tuple():     2387080   [us] -- result = 4.30918e+08 @ 100000000 loops.

[LOOP]  norm_loop():         72160.0 [us] -- result = 4.30918e+06 @   1000000 loops.
[LOOP]  norm_loop():        755959   [us] -- result = 4.30918e+07 @  10000000 loops.
[LOOP]  norm_loop():       7783740   [us] -- result = 4.30918e+08 @ 100000000 loops.

[LOOP]  norm_loop_param():   34102.0 [us] -- result = 4.30918e+06 @   1000000 loops.
[LOOP]  norm_loop_param():  365510   [us] -- result = 4.30918e+07 @  10000000 loops.
[LOOP]  norm_loop_param(): 3480310   [us] -- result = 4.30918e+08 @ 100000000 loops.

-------------------------------------------------------------------------1000--------{--fast}---------------------------------------------------------------------
[LOOP]  norm_reduce():     5851380   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     5884600   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     6163690   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     6029860   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     6083730   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     6132720   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     6012620   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     6379020   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     5923550   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     6144660   [us] -- result = 4309.18     @      1000 loops.
[LOOP]  norm_reduce():     8098380   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     6215470   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     5831670   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     6124580   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     6092740   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     5811260   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     5880400   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     5898520   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     6591110   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     5876570   [us] -- result = 4309.18     @      1000 loops. [--fast]
[LOOP]  norm_reduce():     6034180   [us] -- result = 4309.18     @      1000 loops. [--fast]


-------------------------------------------------------------------------2000--------{--fast}---------------------------------------------------------------------
[LOOP]  norm_reduce():    12434700   [us] -- result = 8618.36     @      2000 loops.


-------------------------------------------------------------------------3000--------{--fast}---------------------------------------------------------------------
[LOOP]  norm_reduce():    17807600   [us] -- result = 12927.5     @      3000 loops.


-------------------------------------------------------------------------4000--------{--fast}---------------------------------------------------------------------
[LOOP]  norm_reduce():    23844300   [us] -- result = 17236.7     @      4000 loops.


-------------------------------------------------------------------------5000--------{--fast}---------------------------------------------------------------------
[LOOP]  norm_reduce():    30557700   [us] -- result = 21545.9     @      5000 loops.
[LOOP]  norm_reduce():    30523700   [us] -- result = 21545.9     @      5000 loops.
[LOOP]  norm_reduce():    29404200   [us] -- result = 21545.9     @      5000 loops.
[LOOP]  norm_reduce():    29268600   [us] -- result = 21545.9     @      5000 loops. [--fast]
[LOOP]  norm_reduce():    29009500   [us] -- result = 21545.9     @      5000 loops. [--fast]
[LOOP]  norm_reduce():    30388800   [us] -- result = 21545.9     @      5000 loops. [--fast]


-------------------------------------------------------------------------6000--------{--fast}---------------------------------------------------------------------
[LOOP]  norm_reduce():    37070600   [us] -- result = 25855.1     @      6000 loops.


-------------------------------------------------------------------------7000--------{--fast}---------------------------------------------------------------------
[LOOP]  norm_reduce():    42789200   [us] -- result = 30164.3     @      7000 loops.


---------------------------------------------------------------------8000--------{--fast}---------------------------------------------------------------------
[LOOP]  norm_reduce():    50572700   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():    49944300   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():    49365600   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():   ~60+                                                                 // exceeded the 60 seconds limit and was terminated [Exit code: 124]
[LOOP]  norm_reduce():    50099900   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():    49445500   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():    49783800   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():    48533400   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():    48966600   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():    47564700   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():    47087400   [us] -- result = 34473.4     @      8000 loops.
[LOOP]  norm_reduce():    47624300   [us] -- result = 34473.4     @      8000 loops. [--fast]
[LOOP]  norm_reduce():   ~60+                                                        [--fast] // exceeded the 60 seconds limit and was terminated [Exit code: 124]
[LOOP]  norm_reduce():   ~60+                                                        [--fast] // exceeded the 60 seconds limit and was terminated [Exit code: 124]
[LOOP]  norm_reduce():    46887700   [us] -- result = 34473.4     @      8000 loops. [--fast]
[LOOP]  norm_reduce():    46571800   [us] -- result = 34473.4     @      8000 loops. [--fast]
[LOOP]  norm_reduce():    46794700   [us] -- result = 34473.4     @      8000 loops. [--fast]
[LOOP]  norm_reduce():    46862600   [us] -- result = 34473.4     @      8000 loops. [--fast]
[LOOP]  norm_reduce():    47348700   [us] -- result = 34473.4     @      8000 loops. [--fast]
[LOOP]  norm_reduce():    46669500   [us] -- result = 34473.4     @      8000 loops. [--fast]

- forall do { ... }:

[SEQ] - nloops , , ( ) , --fast:

// Benchmark harness: times a forall loop that adds norm_reduce() results
// into a shared atomic accumulator. The per-iteration input value is
// computed ahead of time so that it is not part of the timed section.
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ use Time;
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ var aStopWATCH_LOOP: Timer;

// Iteration count; declared `config const`.
config const nloops = 100000000;  // 1E+8    
       // Accumulator for the norms. It is atomic and updated with .add()
       // inside the forall below; the compiler note kept further down
       // records that a plain `res +=` was rejected there.
       var   res: atomic real;
             res.write( 0.0 );
//------------------------------------------------------------------// PRE-COMPUTE:
// A1 holds one real per iteration: A1[k] = (k % 5) cast to real.
var A1:    [1 .. nloops] real;                                      // pre-compute a tuple-element value
forall k in 1 .. nloops do                                          // pre-compute a tuple-element value
    A1[k] = (k % 5): real;                                          // pre-compute a tuple-element value to a ( k % 5 ), ex-post typecast to real

// Timed section: only the forall below lies between start() and stop().
/* ---------------------------------------------SECTION-UNDER-TEST--*/  aStopWATCH_LOOP.start();
forall i in 1 .. nloops do
{               //  a[1] = (  i % 5 ): real;                        // pre-compute'd
   // Each iteration builds a fresh 3-tuple ( A1[i], a[1], a[2] ).
   // NOTE(review): `a` is not declared in this fragment -- presumably the
   // tuple from the earlier listing; confirm.
   res.add( norm_reduce( ( A1[i],            a[1], a[2] ) ) );      //     atomic.add()
// res +=   norm_reduce( ( (  i % 5 ): real, a[1], a[2] ) );        // non-atomic
//:49: note: The shadow variable 'res' is constant due to forall intents in this loop

}/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_LOOP.stop(); write(
  "forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: ",     aStopWATCH_LOOP.elapsed( Time.TimeUnits.microseconds ), " [us] -- " );
// The write() above prints the label and the elapsed microseconds without a
// newline. NOTE(review): the statement that prints the accumulated result
// is not visible in this fragment.
/* 
   --------------------------------------------------------------------------------------------------------{-nloops-}-------{--fast}-------------
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:     7911.0 [us] -- result =     320.196 @       100 loops. 
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:     8055.0 [us] -- result =    3201.96  @      1000 loops.
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:     8002.0 [us] -- result =   32019.6   @     10000 loops.
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:    80685.0 [us] -- result = 3.20196e+05 @    100000 loops.
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:   842948   [us] -- result = 3.20196e+06 @   1000000 loops.
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  8005300   [us] -- result = 3.20196e+07 @  10000000 loops.
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 40358900   [us] -- result = 1.60098e+08 @  50000000 loops.
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }: 40671200   [us] -- result = 1.60098e+08 @  50000000 loops.

   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  2195000   [us] -- result = 1.60098e+08 @  50000000 loops. [--fast]

   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  4518790   [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  6178440   [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  4755940   [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  4405480   [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  4509170   [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  4736110   [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  4653610   [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  4397990   [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
   forall .. do { res.add( norm_reduce( aPreComputedTUPLE ) ) }:  4655240   [us] -- result = 3.20196e+08 @ 100000000 loops. [--fast]
  */
+1

Source: https://habr.com/ru/post/1689087/


All Articles