I have been asked to vectorize a large program. Before tackling the large program, I wanted to see the effect of vectorization in an isolated case. To do this, I wrote two programs that should demonstrate the benefit of the data-layout transformation: one using an array of structures (AoS, without vectorization) and one using a structure of arrays (SoA, with vectorization). I expected SoA to be far ahead of AoS, but it is not.
Measured loop of program A:
// Timed kernel of program A: for every element, the fields a and b of one
// struct are read and their sum is stored in field c of the same struct.
for (int i = 0; i < NUM; i++) {
ptr[i].c = ptr[i].a + ptr[i].b;
}
full program:
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
#include <chrono>
using namespace std;
using namespace std::chrono;
// One record of the array-of-structures layout: the two operands and the
// slot for their sum are stored next to each other in memory.
struct myStruct {
    double a;  // first operand
    double b;  // second operand
    double c;  // receives a + b in the measured loop
};
// Number of elements processed by every loop of this benchmark.
#define NUM 100000000
// Time stamps: t1 and t2 are taken immediately before and after the measured
// loop in main. t3 is not used in the code shown here.
high_resolution_clock::time_point t1, t2, t3;
int main(int argc, char* argsv[]) {
struct myStruct *ptr = (struct myStruct *) malloc(NUM * sizeof(struct myStruct));
for (int i = 0; i < NUM; i++) {
ptr[i].a = i;
ptr[i].b = 2 * i;
}
t1 = high_resolution_clock::now();
for (int i = 0; i < NUM; i++) {
ptr[i].c = ptr[i].a + ptr[i].b;
}
t2 = high_resolution_clock::now();
long dur = duration_cast<microseconds>( t2 - t1 ).count();
cout << "took "<<dur << endl;
double sum = 0;
for (int i = 0; i < NUM; i++) {
sum += ptr[i].c;
}
cout << "sum is "<< sum << endl;
}
Measured loop of program B:
// Timed kernel of program B: element-wise sum of the two separate arrays A and
// B, stored into the third array C. The pragma is the Intel-compiler request
// to vectorize the loop that follows.
#pragma simd
for (int i = 0; i < NUM; i++) {
C[i] = A[i] + B[i];
}
full program:
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
#include <omp.h>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Number of elements in each of the arrays used by this benchmark.
#define NUM 100000000
// Time stamps: t1 and t2 are taken immediately before and after the measured
// loop in main. t3 is not used in the code shown here.
high_resolution_clock::time_point t1, t2, t3;
int main(int argc, char* argsv[]) {
double *A = (double *) malloc(NUM * sizeof(double));
double *B = (double *) malloc(NUM * sizeof(double));
double *C = (double *) malloc(NUM * sizeof(double));
for (int i = 0; i < NUM; i++) {
A[i] = i;
B[i] = 2 * i;
}
t1 = high_resolution_clock::now();
#pragma simd
for (int i = 0; i < NUM; i++) {
C[i] = A[i] + B[i];
}
t2 = high_resolution_clock::now();
long dur = duration_cast<microseconds>( t2 - t1 ).count();
cout << "Aos "<<dur << endl;
double sum = 0;
for (int i = 0; i < NUM; i++) {
sum += C[i];
}
cout << "sum "<<sum;
}
I compile with
icpc vectorization_aos.cpp -qopenmp --std=c++11 -cxxlib=/lrz/mnt/sys.x86_64/compilers/gcc/4.9.3/
Compiled with icpc (v16) and executed on an Intel(R) Xeon(R) E5-2697 v3 processor @ 2.60 GHz.
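Measured times: program A takes about 300 and program B about 350. So A, the version I expected to be slower, is actually the faster one (the opposite of what I expected).
Compiling with -O3 does not change this picture.
Removing `#pragma simd` does not make a noticeable difference either, so the pragma does not seem to be what decides the result.
My questions:
- Is my expectation wrong, or is there a mistake in how I measure?
2. Is there something else I have to do so that the second version benefits from vectorization, and if so, what?
Any hints or pointers on vectorization would be appreciated.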