Visual Studio 2015 loop performance degradation when storing pointers in a structure

I found that the following loop

std::vector<int> x,y,z;
...
unsigned int res=0;
auto p=x.data();
auto q=y.data();
auto r=z.data();
for(unsigned int i=0;i<N;++i){
  res+=*p++ +*q++ +*r++;
}

works almost twice as fast as this one, which just wraps pointers in a struct:

struct pdata{int *px,*py,*pz;};

unsigned int res=0;
pdata d{x.data(),y.data(),z.data()};
for(unsigned int i=0;i<N;++i){
  res+=*d.px++ +*d.py++ +*d.pz++;
}
return res;

Is this a known performance issue? Below are the complete test program and its results for Visual C++ 2015 in 32-bit (x86) release mode (default settings), on Windows 7 64-bit with an Intel Core i5-2520M @ 2.5GHz:

#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <numeric> 

/* Start time of the trial currently being timed; assigned by measure(f).
   measure_pause is declared here but not used by the code shown. */
std::chrono::high_resolution_clock::time_point measure_start,measure_pause;

/* Times repeated calls to f() and returns a robust average, in seconds,
 * of the duration of one call.
 *
 * Ten trials are run. Each trial keeps calling f() until at least 200 ms
 * have elapsed and records (elapsed seconds)/(number of calls). The ten
 * per-trial figures are sorted, the two smallest and the two largest are
 * discarded, and the mean of the remaining six is returned.
 *
 * The start time of the current trial is stored in the global
 * measure_start.
 */
template<typename F>
double measure(F f)
{
  using clock_type=std::chrono::high_resolution_clock;
  using fp_seconds=std::chrono::duration<double>;

  static const int                       trial_count=10;
  static const std::chrono::milliseconds trial_floor(200);
  std::array<double,trial_count>         samples;
  volatile decltype(f())                 sink; /* keeps the f() calls from being optimized out */

  for(double& sample:samples){
    int                    calls=0;
    clock_type::time_point now;

    measure_start=clock_type::now();
    do{
      sink=f();
      ++calls;
      now=clock_type::now();
    }while(now-measure_start<trial_floor);

    /* average duration of one call within this trial, in seconds */
    const fp_seconds elapsed=std::chrono::duration_cast<fp_seconds>(now-measure_start);
    sample=elapsed.count()/calls;
  }
  (void)sink; /* silences the unused-variable warning */

  /* trimmed mean: drop the two lowest and the two highest trials */
  std::sort(samples.begin(),samples.end());
  const double kept_total=std::accumulate(samples.begin()+2,samples.end()-2,0.0);
  return kept_total/(samples.size()-4);
}

/* Per-element variant: times f with measure(f), divides the result by n
 * (the number of elements f processes per call) and scales it by 10E9.
 *
 * Note that the literal 10E9 equals 1e10, so if measure(f) is in seconds
 * the returned figure is in units of 0.1 ns per element rather than
 * nanoseconds. The factor is kept as-is so results stay comparable with
 * previously published numbers.
 */
template<typename F>
double measure(unsigned int n,F f)
{
  const double scale=10E9;
  return (measure(f)/n)*scale;
}

#include <iostream>
#include <vector>

/* Benchmark driver: sums three int vectors element-wise in two ways that
   differ only in where the running pointers live, and prints the two
   figures returned by measure(N,...) as "first,second". The loop bodies
   are intentionally left exactly as written, since the comparison is
   about the code the compiler generates for each form. */
int main()
{
  static const unsigned int N=100000;
  std::vector<int> x(N),y(N),z(N);

  /* Fill the inputs: x[i]=i, y[i]=i+1, z[i]=i+2.
     NOTE(review): `int i` is compared against the unsigned N here
     (signed/unsigned comparison); harmless for N=100000. */
  for(int i=0;i<N;i++){
    x[i]=i;
    y[i]=i+1;
    z[i]=i+2;
  }

  /* Variant 1: the three running pointers are independent local variables. */
  std::cout<<measure(N,[&]{
    unsigned int res=0;
    auto p=x.data();
    auto q=y.data();
    auto r=z.data();
    for(unsigned int i=0;i<N;++i){
      res+=*p++ +*q++ +*r++;
    }
    return res;
  })<<",";

  /* Variant 2: the same three pointers are members of a local struct. */
  std::cout<<measure(N,[&]{
    struct pdata{int *px,*py,*pz;};

    unsigned int res=0;
    pdata d{x.data(),y.data(),z.data()};
    for(unsigned int i=0;i<N;++i){
      res+=*d.px++ +*d.py++ +*d.pz++;
    }
    return res;
  })<<"\n";
}

Output

4.24541,7.44588

Thanks,

+2
source share
3 answers

Interesting. This really does seem to be a performance/optimization issue. Loop #1 uses MMX instructions (strictly speaking SSE2, as the xmm registers in the listing show):

for(unsigned int i = 0; i < N; ++i) {
res += *p++ + *q++ + *r++;
00C812C0  lea         eax,[eax+20h]  
00C812C3  lea         esi,[esi+20h]  
00C812C6  lea         edx,[edx+20h]  
00C812C9  movups      xmm0,xmmword ptr [esi-20h]  
00C812CD  movups      xmm1,xmmword ptr [eax-20h]  
00C812D1  paddd       xmm1,xmm0  
00C812D5  movups      xmm0,xmmword ptr [edx-20h]  
00C812D9  paddd       xmm1,xmm0  
00C812DD  paddd       xmm3,xmm1  
00C812E1  movups      xmm0,xmmword ptr [esi-10h]  
00C812E5  movups      xmm1,xmmword ptr [eax-10h]  
00C812E9  paddd       xmm1,xmm0  
00C812ED  movups      xmm0,xmmword ptr [edx-10h]  
00C812F1  paddd       xmm1,xmm0  
00C812F5  paddd       xmm2,xmm1  
00C812F9  sub         ecx,1  
00C812FC  jne         

while loop #2 uses ordinary scalar instructions:

for(unsigned int i = 0; i < N; ++i) {
res += *d.px++ + *d.py++ + *d.pz++;
00C81340  mov         eax,dword ptr [edi]  
00C81342  lea         ecx,[ecx+14h]  
00C81345  add         eax,dword ptr [esi]  
00C81347  lea         edi,[edi+14h]  
00C8134A  add         eax,dword ptr [ecx-14h]  
00C8134D  lea         esi,[esi+14h]  
00C81350  mov         edx,dword ptr [edi-4]  
00C81353  add         ebx,eax  
00C81355  mov         eax,dword ptr [edi-10h]  
00C81358  add         eax,dword ptr [esi-10h]  
00C8135B  add         eax,dword ptr [ecx-10h]  
00C8135E  add         edx,dword ptr [esi-4]  
for(unsigned int i = 0; i < N; ++i) {
res += *d.px++ + *d.py++ + *d.pz++;
00C81361  add         ebx,eax  
00C81363  mov         eax,dword ptr [edi-0Ch]  
00C81366  add         eax,dword ptr [esi-0Ch]  
00C81369  add         eax,dword ptr [ecx-0Ch]  
00C8136C  add         edx,dword ptr [ecx-4]  
00C8136F  add         ebx,eax  
00C81371  mov         eax,dword ptr [edi-8]  
00C81374  add         eax,dword ptr [esi-8]  
00C81377  add         eax,dword ptr [ecx-8]  
00C8137A  add         ebx,eax  
00C8137C  add         ebx,edx  
00C8137E  sub         dword ptr [ebp-4],1  
00C81382  jne         <lambda_90dad2b8e1a29d982a00cc5b5b0ef516>::operator()+20h (0C81340h)  

X86/MMX, , , MMX . , . , , , int* , auto const int*. const .

+1

, ++ .

Compiling the functions f() and g() below with g++ (7.2) at -O2 produces identical code for both.

// Sums x[i]+y[i]+z[i] for i in [0,N) using three independent local
// pointers, accumulating into an unsigned int.
// N is not defined in this excerpt; presumably a constant declared
// elsewhere (the listing's `cmp rdx, 4000000` with a 4-byte stride
// suggests 1000000) - confirm against the linked source.
unsigned int f(std::vector<int>& x, std::vector<int>& y, std::vector<int>& z)
{
    auto p=x.data();
    auto q=y.data();
    auto r=z.data();

    unsigned int res = 0;
    for(unsigned int i=0;i<N;++i){
        res+=*p++ + *q++ + *r++;
    }
    return res;
}

// Same summation as the three-pointer version, but the running pointers
// are held as members of a local struct instead of separate locals.
// N is not defined in this excerpt; presumably a constant declared
// elsewhere - confirm against the linked source.
unsigned int g(std::vector<int>& x, std::vector<int>& y, std::vector<int>& z)
{
    struct pdata{int *px,*py,*pz;};

    unsigned int res=0;
    pdata d{x.data(),y.data(),z.data()};
    for(unsigned int i=0;i<N;++i){
        res += *d.px++ +*d.py++ +*d.pz++;
    }
    return res;
}

:

First, the code generated for f():

f(std::vector<int, std::allocator<int> >&, std::vector<int, std::allocator<int> >&, std::vector<int, std::allocator<int> >&):
  mov r8, QWORD PTR [rdi]
  xor eax, eax
  mov rdi, QWORD PTR [rsi]
  mov rsi, QWORD PTR [rdx]
  xor edx, edx
.L2:
  mov ecx, DWORD PTR [rdi+rdx]
  add ecx, DWORD PTR [r8+rdx]
  add ecx, DWORD PTR [rsi+rdx]
  add rdx, 4
  add eax, ecx
  cmp rdx, 4000000
  jne .L2
  rep ret

And the code generated for g():

g(std::vector<int, std::allocator<int> >&, std::vector<int, std::allocator<int> >&, std::vector<int, std::allocator<int> >&):
  mov r8, QWORD PTR [rdi]
  xor eax, eax
  mov rdi, QWORD PTR [rsi]
  mov rsi, QWORD PTR [rdx]
  xor edx, edx
.L6:
  mov ecx, DWORD PTR [rdi+rdx]
  add ecx, DWORD PTR [r8+rdx]
  add ecx, DWORD PTR [rsi+rdx]
  add rdx, 4
  add eax, ecx
  cmp rdx, 4000000
  jne .L6
  rep ret

Godbolt: https://godbolt.org/g/DBqhDh

MSVC. , .

+1

, - . , , , , std::vector .

restrict, , .

+1

Source: https://habr.com/ru/post/1692725/


All Articles