++ 14 , ; vectorized_op
20% , unvectorized_op
, 200% .
g++ - 4.9.2/clang++ - 3.5.0 x86: 64 Debian Jessie
-std=c++14 -O3 -Wall -pedantic -msse4.1 -ftree-vectorize -ffast-math
(resp + = -stdlib = lib++ clang)
()
#include <iterator>
#include <algorithm>
#include <valarray>
#include <array>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <ctime>
#include <fstream>
using std::size_t;
using std::endl;
auto& Log = std::cout;
using real_type = float;
using ra_type = std::valarray<real_type>;
using ia_type = std::valarray<size_t>;
using workload_type = unsigned char;
using ca_type = std::valarray<workload_type>;
static constexpr bool BUILD_PIVOTS_H = false;
static constexpr size_t B_SIZE = 4;
static constexpr size_t MAX_CHAR = 256;
static constexpr size_t PRECOMPUTED_SIZE = MAX_CHAR * B_SIZE;
static std::array<workload_type, PRECOMPUTED_SIZE> PRECOMPUTED __attribute__ ((aligned(16)));
static std::array<real_type, PRECOMPUTED_SIZE> LUT __attribute__ ((aligned(16)));
struct sentry {
sentry(const std::string& s) {
Log << s << endl;
}
};
void unvectorized_op(ra_type& A, ra_type&B, const ca_type& data) {
static sentry _("unvectorized_op:");
int k = 0;
for(size_t j = 0; j < data.size(); ++j) {
char x = data[j];
#ifdef MOCK
for (auto& i : {0,1,2,3})
A[k++] = real_type(0);
#else
for(int s = 0; s < 4; ++s) {
A[k++] += B[x&3];
x >>= 2;
}
#endif
}
}
void vectorized_op(ra_type& A, ra_type& B, const ca_type& data) {
static sentry _("vectorized_op:");
ra_type bflat(A.size());
auto beg = std::begin(bflat);
std::for_each(std::begin(data), std::end(data),
[&beg] (const workload_type& x) {
const auto& it = &LUT[(x)*4];
#ifdef MOCK
for (auto& i : {0,1,2,3})
*(beg++) = real_type(0);
#else
std::for_each(it, it + 4,
[&beg] (const real_type& c) {*beg = c; ++beg;}
);
#endif
}
);
A = bflat;
}
int main (int argc, char* argv[]) {
int magnitude = -1;
if (argc > 1) {
try {
magnitude = std::stoi(argv[1]);
} catch (const std::runtime_error& e ){
}
}
if ((magnitude < 1) || (magnitude > 10)) {
Log << "no magnitude between 2 and 8 given; choosing 5\n";
magnitude = 5;
}
{
size_t i = 0;
workload_type x = 0;
std::generate(std::begin(PRECOMPUTED), std::end(PRECOMPUTED),
[&i, &x] () {
workload_type rv = x & 3;
x >>= 2;
++i;
(i % B_SIZE) == 0 ? x = i/B_SIZE : x >>= 2;
return rv;
}
);
}
ra_type B(B_SIZE);
::srand48(::clock());
std::generate(std::begin(B), std::end(B), ::drand48);
std::transform(std::begin(PRECOMPUTED), std::end(PRECOMPUTED),
std::begin(LUT),
[&B] (const workload_type& c) {
return B[c];
});
for (auto &f : {unvectorized_op, vectorized_op}) {
for (int i = 1; i < magnitude; ++i) {
size_t workload_size = pow(10, i);
size_t repetitions = pow(10, magnitude - i);
ca_type data(workload_size);
std::generate(std::begin(data), std::end(data),
[] () {return workload_type(::drand48() * MAX_CHAR);});
ra_type A(0., workload_size * B_SIZE);
auto start = ::clock();
for (unsigned long j = 0; j < repetitions; ++j) {
A = 0.;
f(A, B, data);
}
auto duration = ::clock() - start;
Log << std::setw(8) << repetitions << " repetitions with a "
<< std::setw(8) << workload_size << " workload took "
<< double(duration)/CLOCKS_PER_SEC << "s"
<< endl;
}
}
if (BUILD_PIVOTS_H) {
try {
std::ofstream pivots_h("pivots.h");
pivots_h << "#ifndef PIVOTS_H_ \n\n";
pivots_h << "const char PIVOTS[256][4] = {\n";
for (auto i = 0; i < 256; ++i) {
pivots_h << " {";
for (auto j = 0; j < 4; ++j) {
pivots_h << long(PRECOMPUTED[i*4+j]) << (j < 3 ? ", " : "");
}
pivots_h << ("}") << (i < 255 ? ",\n" : "\n");
}
pivots_h << "} __attribute__ ((aligned(16)));\n";
pivots_h << "#endif\n" << std::endl;
} catch (const std::runtime_error& e) {
Log << "ERROR: " << e.what() << std::endl;
}
}
}
Fortran 95:
module setup
implicit none
logical GENERATE_PIVOTS
parameter(GENERATE_PIVOTS = .TRUE.)
integer, parameter :: CHAR_TYPE = 1
integer, parameter :: REAL_TYPE = 4
integer(CHAR_TYPE), dimension(0:255, 0:3) :: PIVOTS
real (REAL_TYPE), dimension(0:255, 0:3) :: LUT
real(REAL_TYPE), dimension(0:3) :: B
contains
subroutine init(B)
implicit none
real(REAL_TYPE), dimension(0:3), intent(in) :: B
integer :: r, c
if (GENERATE_PIVOTS) then
do r = lbound(PIVOTS,1),ubound(PIVOTS,1)
do c = lbound(PIVOTS,2),ubound(PIVOTS,2)
PIVOTS(r,c) = int(iand(ishft(r, -c), 3), kind=1)
end do
end do
else
!> @TODO load PIVOTS FROM file
end if
do r = 0,255
do c = 0,3
LUT = B(PIVOTS(r,c))
end do
end do
end subroutine init
end module setup
module testling
use setup
implicit none
contains
subroutine generate(A, data)
real(REAL_TYPE), dimension(:,:), intent(inout):: A
integer(CHAR_TYPE), dimension(:), intent(in) :: data
integer :: i
!A = LUT(data, :)
forall (i=lbound(A,1):ubound(A,1))
A(i,:) = LUT(data(i), :)
end forall
end subroutine generate
end module testling
subroutine do_test(workload_size, repetitions)
use setup
use testling
implicit none
integer, intent(in) :: workload_size
integer, intent(in) :: repetitions
real(REAL_TYPE), dimension(0:workload_size-1,0:3) :: A
integer(CHAR_TYPE), dimension(0:workload_size-1) :: data
integer(8) :: s, e, cr, cm
integer(4) :: i
real :: random
make_workload : do i = lbound(data,1),ubound(data,1)
call random_number(random)
data(i) = int(floor(random * 256),kind=CHAR_TYPE)
end do make_workload
call system_clock(s, cr, cm)
do i = 1,repetitions
call generate(A, data)
end do
call system_clock(e, cr, cm)
write(*,'(I8, " repetitions for a ",I8," workload took ",F12.10,"s")')&
REPETITIONS, WORKLOAD_SIZE, real(e - s,kind = 8)/cr
end subroutine do_test
program main
use setup
implicit none
integer(8) :: c, cr, cm
integer(4) :: i, seed
character(len=100) :: buffer
integer(1) :: magnitude = -1
integer :: workload_size
integer :: repetitions
if (iargc() > 0) then
call getarg(1, buffer)
read(buffer,*) magnitude
end if
if ((magnitude < 2) .or. (magnitude > 10)) then
write(*,*) "no magnitude between 2 and 8 given; choosing 5"
magnitude = 5
endif
call system_clock(c,cr,cm)
seed = int(c/cr, kind=4)
call random_seed(seed)
fill_B: do i = lbound(B,1),ubound(B,1)
call random_number(B(i))
end do fill_B
call init(B)
do i=1,magnitude-1
workload_size = 10**i
repetitions = 10**(magnitude - i)
call do_test(workload_size, repetitions)
end do
end program main
gfortran-4.9
-O3 -Wall -pedantic -msse4.1 -ftree-vectorize -ffast-math
10 10 .
++ Fortran < 1M O(n)
, 1M, O(n)
10M.
Fortran 40-50% , ++.
"" - ++ - , , ; 20% , .