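Interesting. This really does appear to be a performance/optimization issue. Loop #1 is vectorized with SIMD instructions (referred to here as "MMX" code; strictly speaking, the `xmm` registers together with `movups`/`paddd` are SSE/SSE2, whereas MMX uses the `mm0`–`mm7` registers):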
for(unsigned int i = 0; i < N; ++i) {
res += *p++ + *q++ + *r++;
00C812C0 lea eax,[eax+20h]
00C812C3 lea esi,[esi+20h]
00C812C6 lea edx,[edx+20h]
00C812C9 movups xmm0,xmmword ptr [esi-20h]
00C812CD movups xmm1,xmmword ptr [eax-20h]
00C812D1 paddd xmm1,xmm0
00C812D5 movups xmm0,xmmword ptr [edx-20h]
00C812D9 paddd xmm1,xmm0
00C812DD paddd xmm3,xmm1
00C812E1 movups xmm0,xmmword ptr [esi-10h]
00C812E5 movups xmm1,xmmword ptr [eax-10h]
00C812E9 paddd xmm1,xmm0
00C812ED movups xmm0,xmmword ptr [edx-10h]
00C812F1 paddd xmm1,xmm0
00C812F5 paddd xmm2,xmm1
00C812F9 sub ecx,1
00C812FC jne
while loop #2 is not vectorized and uses only ordinary scalar (non-SIMD) integer instructions:
for(unsigned int i = 0; i < N; ++i) {
res += *d.px++ + *d.py++ + *d.pz++;
00C81340 mov eax,dword ptr [edi]
00C81342 lea ecx,[ecx+14h]
00C81345 add eax,dword ptr [esi]
00C81347 lea edi,[edi+14h]
00C8134A add eax,dword ptr [ecx-14h]
00C8134D lea esi,[esi+14h]
00C81350 mov edx,dword ptr [edi-4]
00C81353 add ebx,eax
00C81355 mov eax,dword ptr [edi-10h]
00C81358 add eax,dword ptr [esi-10h]
00C8135B add eax,dword ptr [ecx-10h]
00C8135E add edx,dword ptr [esi-4]
for(unsigned int i = 0; i < N; ++i) {
res += *d.px++ + *d.py++ + *d.pz++;
00C81361 add ebx,eax
00C81363 mov eax,dword ptr [edi-0Ch]
00C81366 add eax,dword ptr [esi-0Ch]
00C81369 add eax,dword ptr [ecx-0Ch]
00C8136C add edx,dword ptr [ecx-4]
00C8136F add ebx,eax
00C81371 mov eax,dword ptr [edi-8]
00C81374 add eax,dword ptr [esi-8]
00C81377 add eax,dword ptr [ecx-8]
00C8137A add ebx,eax
00C8137C add ebx,edx
00C8137E sub dword ptr [ebp-4],1
00C81382 jne <lambda_90dad2b8e1a29d982a00cc5b5b0ef516>::operator()+20h (0C81340h)
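[Editor's note: most of the words in this closing paragraph were lost during text extraction; only the technical terms survived. The following is a reconstruction from those terms, not the author's original wording.]
The paragraph opens with a remark about X86/MMX and about the MMX (SIMD) code generated for the first loop.
It then discusses the type of the pointers involved, which is `int*`,
mentions declaring them with `auto`,
and contrasts this with the type `const int*`.
It closes with a statement about the `const` qualifier; what exactly was claimed about its effect cannot be recovered from the surviving text.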