I am trying to use Intel intrinsics to beat the compiler's optimized code. Sometimes I can do it, sometimes I can't.
I think the question is: why do I sometimes beat the compiler, but in other cases not? I got a time of 0.006 seconds for operator+= below using Intel intrinsics (vs. 0.009 s when using bare C++), but a time of 0.07 s for operator+ using intrinsics, while bare C++ took only 0.03 s.
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>   // rand, RAND_MAX
#include <intrin.h>

// Stopwatch built on the Windows high-resolution performance counter.
// Timing starts at construction and can be restarted with reset().
class Timer
{
  LARGE_INTEGER startTime ;
  double fFreq ;   // counter ticks per second, cached once as a double

public:
  Timer()
  {
    LARGE_INTEGER freq ;
    QueryPerformanceFrequency( &freq ) ;
    fFreq = (double)freq.QuadPart ;
    reset() ;
  }

  // Restarts the measurement from the current counter value.
  void reset()
  {
    QueryPerformanceCounter( &startTime ) ;
  }

  // Returns the seconds elapsed since construction or the last reset().
  double getTime() const
  {
    LARGE_INTEGER endTime ;
    QueryPerformanceCounter( &endTime ) ;
    return ( endTime.QuadPart - startTime.QuadPart ) / fFreq ; // as double
  }
} ;

// Returns a pseudo-random float in [0, 1] derived from rand().
inline float randFloat()
{
  return (float)rand() / RAND_MAX ;
}

// Use my optimized code. Remove either define to fall back to the plain
// C++ implementation of the corresponding operator.
#define OPTIMIZED_PLUS_EQUALS
#define OPTIMIZED_PLUS

// Four-float vector whose components can be accessed either individually
// (x, y, z, w) or as one SSE register (reg) through the same storage.
union Vector
{
  struct { float x, y, z, w ; } ;
  __m128 reg ;

  // Zero vector.
  Vector() : x(0.f), y(0.f), z(0.f), w(0.f) {}

  // Component-wise construction.
  // NOTE(review): the components are written as four scalars and the
  // intrinsic operators then read them back through `reg`; initialising
  // `reg` with _mm_set_ps( iw, iz, iy, ix ) may be cheaper -- measure
  // before changing.
  Vector( float ix, float iy, float iz, float iw ) : x(ix), y(iy), z(iz), w(iw) {}

  //Vector( __m128 val ):x(val.m128_f32[0]),y(val.m128_f32[1]),z(val.m128_f32[2]),w(val.m128_f32[3]) {}
  // Wraps an SSE register directly (author's note: 2x speed vs. the
  // element-wise version above).
  Vector( __m128 val ) : reg( val ) {}

  // Adds `o` to this vector in place and returns *this.
  inline Vector& operator+=( const Vector& o )
  {
#ifdef OPTIMIZED_PLUS_EQUALS
    // YES! I beat it! Using this intrinsic is faster than just C++.
    reg = _mm_add_ps( reg, o.reg ) ;
#else
    x += o.x, y += o.y, z += o.z, w += o.w ;
#endif
    return *this ;
  }

  // Returns the component-wise sum of this vector and `o`.
  // Declared const so it can also be used on const vectors.
  inline Vector operator+( const Vector& o ) const
  {
#ifdef OPTIMIZED_PLUS
    // This is slower
    return Vector( _mm_add_ps( reg, o.reg ) ) ;
#else
    return Vector( x + o.x, y + o.y, z + o.z, w + o.w ) ;
#endif
  }

  // Returns a vector whose four components each come from randFloat().
  static Vector random()
  {
    return Vector( randFloat(), randFloat(), randFloat(), randFloat() ) ;
  }

  // Prints all four components to stdout with two decimals each.
  void print() const
  {
    printf( "%.2f %.2f %.2f %.2f\n", x, y, z, w ) ;
  }
} ;

int runs = 8000000 ;   // iterations per benchmark
Vector sum ;           // global accumulator, printed by main()

// OPTIMIZED_PLUS_EQUALS (intrinsics) runs FASTER 0.006 intrinsics, vs 0.009 (std C++)
void test1()
{
  for( int i = 0 ; i < runs ; i++ )
    sum += Vector( 1.f, 0.25f, 0.5f, 0.5f ) ; //Vector::random() ;
}

// OPTIMIZED* runs SLOWER (0.03 for reg.C++, vs 0.07 for intrinsics)
void test2()
{
  float j = 27.f ;
  for( int i = 0 ; i < runs ; i++ )
  {
    sum += Vector( j*i, i, i/j, i ) + Vector( i, 2*i*j, 3*i*j*j, 4*i ) ;
  }
}

// Times one benchmark and prints the elapsed seconds followed by the
// accumulated vector.
int main()
{
  Timer timer ;

  //test1() ;
  test2() ;

  printf( "Time: %f\n", timer.getTime() ) ;
  sum.print() ;
}
Edit
Why am I doing this? The VS 2012 profiler tells me that my vector arithmetic operations could use some tweaking.
