I wrote the following code to compare the performance of cache skips when working:
#include <chrono> #include <cstdint> #include <cstring> #include <iostream> // Avoiding using power of 2 because of possible performance degradation due to cache associativity? static const size_t ROW_SIZE = 600; static const size_t COL_SIZE = 600; static const size_t TEST_COUNT = 50; #define SAME_TYPES 1 #define INIT_TO_ONE 0 #if SAME_TYPES #define ARY_TYPE uint64_t #define RET_TYPE uint64_t #else #define ARY_TYPE uint32_t #define RET_TYPE uint64_t #endif RET_TYPE sum_array_rows(const ARY_TYPE *a, const size_t step) { RET_TYPE sum = 0; for (size_t row = 0; row < ROW_SIZE; row++) for (size_t col = 0; col < COL_SIZE; col++) sum += static_cast<RET_TYPE>(a[row * step + col]); return sum; } RET_TYPE sum_array_cols(const ARY_TYPE *a, const size_t step) { RET_TYPE sum = 0; for (size_t col = 0; col < COL_SIZE; col++) for (size_t row = 0; row < ROW_SIZE; row++) sum += static_cast<RET_TYPE>(a[row * step + col]); return sum; } int main() { #if INIT_TO_ONE ARY_TYPE *a = new ARY_TYPE[ROW_SIZE * COL_SIZE]; for (int i = 0; i < ROW_SIZE * COL_SIZE; i++) a[i] = 1; #else ARY_TYPE *a = new ARY_TYPE[ROW_SIZE * COL_SIZE](); #endif std::chrono::high_resolution_clock hrc; std::cout << "SAME_TYPES: " << SAME_TYPES << "\n"; std::cout << "INIT_TO_ONE: " << INIT_TO_ONE << "\n"; std::cout << "ROW_SIZE: " << ROW_SIZE << "\n"; std::cout << "COL_SIZE: " << COL_SIZE << "\n\n"; { RET_TYPE sum = 0; auto start = hrc.now(); for (int i = 0; i < TEST_COUNT; i++) sum = sum_array_rows(a, COL_SIZE); auto end = hrc.now(); std::cout << "Time taken: " << (end - start).count() / TEST_COUNT << "\n"; std::cout << "Sum: " << sum << "\n"; } // I've added this because I want to trash the cache // I'm not sure if this is necessary or if it doing any good... ARY_TYPE *other = new ARY_TYPE[ROW_SIZE * COL_SIZE]; for (int i = 0; i < ROW_SIZE * COL_SIZE; i++) other[i] = 1; { RET_TYPE sum = other[ROW_SIZE] - 1; auto start = hrc.now(); for (int i = 0; i < TEST_COUNT; i++) sum = sum_array_cols(a, COL_SIZE); auto end = hrc.now(); std::cout << "Time taken: " << (end - start).count() / TEST_COUNT << "\n"; std::cout << "Sum: " << sum << "\n"; } return 0; }
I have two functions sum_array_rows and sum_array_cols , which take into an array and add elements and return the sum. The difference is how we access the elements. The return type of both functions is always uint64_t .
I have a definition near the top of a file called SAME_TYPES . If SAME_TYPES , then both types of return type and element type are uint64_t . If not SAME_TYPES , then the element type is uint32_t .
So, running this code gives me ...
SAME_TYPES set to 1:
SAME_TYPES: 1 INIT_TO_ONE: 0 ROW_SIZE: 600 COL_SIZE: 600 Time taken: 80948 (element type is uint64_t, ROW first) Sum: 0 Time taken: 566241 (element type is uint64_t, COL first) Sum: 0
SAME_TYPES set to 0:
SAME_TYPES: 0 INIT_TO_ONE: 0 ROW_SIZE: 600 COL_SIZE: 600 Time taken: 283348 (element type is uint32_t, ROW first) Sum: 0 Time taken: 369653 (element type is uint32_t, COL first) Sum: 0
I am confused why this affects performance. When the data type is the same, the result seems to be expected, row-col is faster than col-row. But why, when the types are different, I see a decrease in the number of lines and an increase in the col-line. I understand that if the data type of the element is larger, then I can fit less in my cache, but then why do I see a performance increase when iterating through the columns in the outer loop. Can someone please explain this to me?
In case this is important, I use VS 2015 to compile this, and my processor is i5-4460 (works under Windows 10).
Edit: I think I should not blindly trust the compiler. I originally compiled using /02 , which is the default setting. When I compile without optimization, I get the expected behavior:
SAME_TYPES set to 1:
SAME_TYPES: 1 INIT_TO_ONE: 0 ROW_SIZE: 600 COL_SIZE: 600 Time taken: 1009518 (element type is uint64_t, ROW first) Sum: 0 Time taken: 1504863 (element type is uint64_t, COL first) Sum: 0
SAME_TYPES set to 0:
SAME_TYPES: 0 INIT_TO_ONE: 0 ROW_SIZE: 600 COL_SIZE: 600 Time taken: 909479 (element type is uint32_t, ROW first) Sum: 0 Time taken: 1244492 (element type is uint32_t, COL first) Sum: 0
The data type still has an effect, but now it seems reasonable. Here is the build when compiling with optimization on:
Optimisation ... /O2 SAME_TYPE ...... 1 RET_TYPE sum_array_rows(const ARY_TYPE *a, const size_t step) { 00FD10C0 xorps xmm2,xmm2 RET_TYPE sum = 0; 00FD10C3 lea eax,[ecx+10h] 00FD10C6 movaps xmm1,xmm2 00FD10C9 mov edx,258h 00FD10CE xchg ax,ax for (size_t col = 0; col < COL_SIZE; col++) 00FD10D0 mov ecx,96h sum += static_cast<RET_TYPE>(a[row * step + col]); 00FD10D5 movups xmm0,xmmword ptr [eax-10h] 00FD10D9 paddq xmm2,xmm0 00FD10DD movups xmm0,xmmword ptr [eax] 00FD10E0 add eax,20h 00FD10E3 paddq xmm1,xmm0 00FD10E7 sub ecx,1 00FD10EA jne sum_array_rows+15h (0FD10D5h) for (size_t row = 0; row < ROW_SIZE; row++) 00FD10EC sub edx,1 for (size_t row = 0; row < ROW_SIZE; row++) 00FD10EF jne sum_array_rows+10h (0FD10D0h) return sum; } 00FD10F1 paddq xmm1,xmm2 00FD10F5 movaps xmm0,xmm1 00FD10F8 psrldq xmm0,8 00FD10FD paddq xmm1,xmm0 00FD1101 movd eax,xmm1 00FD1105 psrldq xmm1,4 00FD110A movd edx,xmm1 00FD110E ret RET_TYPE sum_array_cols(const ARY_TYPE *a, const size_t step) { 00FD1110 push ebp 00FD1111 mov ebp,esp 00FD1113 sub esp,24h 00FD1116 push ebx 00FD1117 xorps xmm0,xmm0 RET_TYPE sum = 0; 00FD111A mov dword ptr [ebp-10h],258h 00FD1121 push esi 00FD1122 movlpd qword ptr [ebp-18h],xmm0 00FD1127 lea eax,[ecx+2580h] 00FD112D mov edx,dword ptr [ebp-14h] 00FD1130 push edi 00FD1131 mov edi,dword ptr [sum] 00FD1134 mov dword ptr [ebp-0Ch],eax 00FD1137 nop word ptr [eax+eax] for (size_t row = 0; row < ROW_SIZE; row++) 00FD1140 xorps xmm0,xmm0 00FD1143 mov dword ptr [ebp-8],0C8h 00FD114A movlpd qword ptr [sum],xmm0 00FD114F mov ecx,dword ptr [ebp-18h] 00FD1152 mov ebx,dword ptr [ebp-14h] 00FD1155 movlpd qword ptr [ebp-20h],xmm0 00FD115A mov esi,dword ptr [ebp-20h] 00FD115D mov dword ptr [ebp-4],ecx 00FD1160 mov ecx,dword ptr [ebp-1Ch] 00FD1163 nop dword ptr [eax] 00FD1167 nop word ptr [eax+eax] sum += static_cast<RET_TYPE>(a[row * step + col]); 00FD1170 add edi,dword ptr [eax-2580h] 00FD1176 mov dword ptr [sum],edi 00FD1179 adc edx,dword ptr [eax-257Ch] 00FD117F mov edi,dword ptr [ebp-4] 00FD1182 add edi,dword ptr [eax-12C0h] 00FD1188 mov dword ptr [ebp-4],edi 00FD118B adc ebx,dword ptr [eax-12BCh] 00FD1191 add esi,dword ptr [eax] 00FD1193 mov edi,dword ptr [sum] 00FD1196 adc ecx,dword ptr [eax+4] 00FD1199 lea eax,[eax+3840h] 00FD119F sub dword ptr [ebp-8],1 00FD11A3 jne sum_array_cols+60h (0FD1170h) for (size_t col = 0; col < COL_SIZE; col++) 00FD11A5 add esi,dword ptr [ebp-4] 00FD11A8 mov eax,dword ptr [ebp-0Ch] 00FD11AB adc ecx,ebx 00FD11AD add edi,esi 00FD11AF adc edx,ecx 00FD11B1 add eax,8 00FD11B4 sub dword ptr [ebp-10h],1 00FD11B8 mov dword ptr [ebp-0Ch],eax 00FD11BB jne sum_array_cols+30h (0FD1140h) return sum; 00FD11BD mov eax,edi } 00FD11BF pop edi 00FD11C0 pop esi 00FD11C1 pop ebx 00FD11C2 mov esp,ebp 00FD11C4 pop ebp 00FD11C5 ret ================ Optimisation ... /O2 SAME_TYPE ...... 0 RET_TYPE sum_array_rows(const ARY_TYPE *a, const size_t step) { 00A110C0 push ebp 00A110C1 mov ebp,esp 00A110C3 sub esp,24h 00A110C6 push ebx 00A110C7 xorps xmm0,xmm0 RET_TYPE sum = 0; 00A110CA mov dword ptr [ebp-0Ch],258h 00A110D1 movlpd qword ptr [ebp-18h],xmm0 00A110D6 mov edx,dword ptr [ebp-14h] 00A110D9 mov eax,dword ptr [sum] 00A110DC push esi 00A110DD push edi 00A110DE xchg ax,ax for (size_t col = 0; col < COL_SIZE; col++) 00A110E0 xorps xmm0,xmm0 00A110E3 mov dword ptr [ebp-8],0C8h 00A110EA movlpd qword ptr [sum],xmm0 00A110EF mov esi,dword ptr [ebp-18h] 00A110F2 mov ebx,dword ptr [ebp-14h] for (size_t col = 0; col < COL_SIZE; col++) 00A110F5 movlpd qword ptr [ebp-20h],xmm0 00A110FA mov edi,dword ptr [ebp-20h] 00A110FD mov dword ptr [ebp-4],esi 00A11100 mov esi,dword ptr [ebp-1Ch] sum += static_cast<RET_TYPE>(a[row * step + col]); 00A11103 add eax,dword ptr [ecx] 00A11105 adc edx,0 00A11108 mov dword ptr [sum],edx 00A1110B mov edx,dword ptr [ebp-4] 00A1110E add edx,dword ptr [ecx+4] 00A11111 mov dword ptr [ebp-4],edx 00A11114 mov edx,dword ptr [sum] 00A11117 adc ebx,0 00A1111A add edi,dword ptr [ecx+8] 00A1111D adc esi,0 00A11120 add ecx,0Ch 00A11123 sub dword ptr [ebp-8],1 00A11127 jne sum_array_rows+43h (0A11103h) for (size_t row = 0; row < ROW_SIZE; row++) 00A11129 add edi,dword ptr [ebp-4] 00A1112C adc esi,ebx 00A1112E add eax,edi 00A11130 adc edx,esi 00A11132 sub dword ptr [ebp-0Ch],1 00A11136 jne sum_array_rows+20h (0A110E0h) return sum; } 00A11138 pop edi 00A11139 pop esi 00A1113A pop ebx 00A1113B mov esp,ebp 00A1113D pop ebp 00A1113E ret RET_TYPE sum_array_cols(const ARY_TYPE *a, const size_t step) { 00A11140 push ebp 00A11141 mov ebp,esp 00A11143 sub esp,24h 00A11146 push ebx 00A11147 xorps xmm0,xmm0 RET_TYPE sum = 0; 00A1114A mov dword ptr [ebp-10h],258h 00A11151 push esi 00A11152 movlpd qword ptr [ebp-18h],xmm0 00A11157 lea eax,[ecx+12C0h] 00A1115D mov edx,dword ptr [ebp-14h] 00A11160 push edi 00A11161 mov edi,dword ptr [sum] 00A11164 mov dword ptr [ebp-0Ch],eax 00A11167 nop word ptr [eax+eax] for (size_t row = 0; row < ROW_SIZE; row++) 00A11170 xorps xmm0,xmm0 00A11173 mov dword ptr [ebp-8],0C8h 00A1117A movlpd qword ptr [sum],xmm0 00A1117F mov ecx,dword ptr [ebp-18h] 00A11182 mov ebx,dword ptr [ebp-14h] 00A11185 movlpd qword ptr [ebp-20h],xmm0 00A1118A mov esi,dword ptr [ebp-20h] 00A1118D mov dword ptr [ebp-4],ecx 00A11190 mov ecx,dword ptr [ebp-1Ch] 00A11193 nop dword ptr [eax] 00A11197 nop word ptr [eax+eax] sum += static_cast<RET_TYPE>(a[row * step + col]); 00A111A0 add edi,dword ptr [eax-12C0h] 00A111A6 lea eax,[eax+1C20h] 00A111AC adc edx,0 00A111AF mov dword ptr [sum],edx 00A111B2 mov edx,dword ptr [ebp-4] 00A111B5 add edx,dword ptr [eax-2580h] 00A111BB mov dword ptr [ebp-4],edx 00A111BE mov edx,dword ptr [sum] 00A111C1 adc ebx,0 00A111C4 add esi,dword ptr [eax-1C20h] 00A111CA adc ecx,0 00A111CD sub dword ptr [ebp-8],1 00A111D1 jne sum_array_cols+60h (0A111A0h) for (size_t col = 0; col < COL_SIZE; col++) 00A111D3 add esi,dword ptr [ebp-4] 00A111D6 mov eax,dword ptr [ebp-0Ch] 00A111D9 adc ecx,ebx 00A111DB add edi,esi 00A111DD adc edx,ecx 00A111DF add eax,4 00A111E2 sub dword ptr [ebp-10h],1 00A111E6 mov dword ptr [ebp-0Ch],eax 00A111E9 jne sum_array_cols+30h (0A11170h) return sum; 00A111EB mov eax,edi } 00A111ED pop edi } 00A111EE pop esi 00A111EF pop ebx 00A111F0 mov esp,ebp 00A111F2 pop ebp 00A111F3 ret