I will assume that if you are going to use SIMD to compute a dot product, you are looking for a way to operate on multiple vectors at the same time. For example, using SSE, if you have four vectors and you want to take the dot product of each of them with a fixed vector, then you arrange the data as (xxxx), (yyyy), (zzzz), (wwww), multiply each SSE vector by the matching component of the fixed vector, add the products together, and get the results of four dot products at once. This gives you 100% efficiency (a four-fold speedup) and is not limited to 4-component vectors; it is equally efficient for n-component vectors. Here is an example that uses SSE.
#include <xmmintrin.h>
#include <stdio.h>

/*
 * Compute four 4-component dot products against one fixed vector at once.
 *
 * aosoa: 16 floats stored component-major: the x components of the four
 *        vectors, then their y, z and w components
 *        (x0 x1 x2 x3, y0 y1 y2 y3, z0 z1 z2 z3, w0 w1 w2 w3).
 *        Must be 16-byte aligned because it is read with _mm_load_ps.
 * b:     the 4 components of the fixed vector. Read one scalar at a time,
 *        so it has no alignment requirement.
 * out:   receives 4 floats; out[i] is the dot product of vector i with b.
 *        Must be 16-byte aligned because it is written with _mm_store_ps.
 */
void dot4x4(const float *aosoa, const float *b, float *out)
{
    /* One register per component, each holding that component of all four vectors. */
    __m128 vx = _mm_load_ps(&aosoa[0]);
    __m128 vy = _mm_load_ps(&aosoa[4]);
    __m128 vz = _mm_load_ps(&aosoa[8]);
    __m128 vw = _mm_load_ps(&aosoa[12]);

    /* Broadcast each component of the fixed vector across all four lanes. */
    __m128 brod1 = _mm_set1_ps(b[0]);
    __m128 brod2 = _mm_set1_ps(b[1]);
    __m128 brod3 = _mm_set1_ps(b[2]);
    __m128 brod4 = _mm_set1_ps(b[3]);

    /* Lane i ends up as (b0*x_i + b1*y_i) + (b2*z_i + b3*w_i). */
    __m128 dot4 = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(brod1, vx), _mm_mul_ps(brod2, vy)),
        _mm_add_ps(_mm_mul_ps(brod3, vz), _mm_mul_ps(brod4, vw)));

    _mm_store_ps(out, dot4);
}

/*
 * Demo: dot four vectors with (1,1,1,1) and print the four results.
 * Returns 0 on success, 1 if an aligned allocation fails.
 */
int main(void)
{
    /*
     * AoSoA layout of the vectors
     *   v1 = (0,1,2,3), v2 = (4,5,6,7), v3 = (8,9,10,11), v4 = (12,13,14,15)
     * i.e. row k holds component k of v1..v4.
     */
    const float a[16] = {
        0, 4,  8, 12,
        1, 5,  9, 13,
        2, 6, 10, 14,
        3, 7, 11, 15,
    };
    const float b[4] = {1, 1, 1, 1};

    /* _mm_malloc is used to get the 16-byte alignment dot4x4 requires. */
    float *aosoa = _mm_malloc(sizeof *aosoa * 16, 16);
    if (aosoa == NULL) {
        fprintf(stderr, "failed to allocate input buffer\n");
        return 1;
    }

    float *out = _mm_malloc(sizeof *out * 4, 16);
    if (out == NULL) {
        fprintf(stderr, "failed to allocate output buffer\n");
        _mm_free(aosoa);
        return 1;
    }

    for (int i = 0; i < 16; i++) {
        aosoa[i] = a[i];
    }

    dot4x4(aosoa, b, out);
    printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);

    _mm_free(aosoa);
    _mm_free(out);
    return 0;
}