MSVC++ 2015 - SSE compiler bug or undefined behaviour in my program?

I came across some strange behaviour while working on a SIMD colour lerp function, and I trimmed it down to a minimal program. The SIMD code in this example no longer performs the lerp, but it still unpacks a 32-bit colour into an XMM register and then packs it back into 32 bits.

In MSVC++ 2015 (Update 3) in Release x64 mode, the following code does not give the correct result, but in Debug x64 or Release/Debug x86 it works correctly. This is the only code in an otherwise empty Win32 C++ console project:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "emmintrin.h"

// 8-bit-per-channel colour; members are declared in red, green, blue, alpha
// order.
struct Color4
{
    uint8_t red;
    uint8_t green;
    uint8_t blue;
    uint8_t alpha;

    // Builds a colour from individual channels; alpha defaults to 255.
    Color4(uint8_t red, uint8_t green, uint8_t blue, uint8_t alpha = 255)
        : red(red)
        , green(green)
        , blue(blue)
        , alpha(alpha)
    {
    }

    // Unpacks a 32-bit value: red is the lowest byte and alpha the highest.
    // Each narrowing cast keeps only the low 8 bits of the shifted value.
    explicit Color4(uint32_t rgba)
        : red(static_cast<uint8_t>(rgba))
        , green(static_cast<uint8_t>(rgba >> 8))
        , blue(static_cast<uint8_t>(rgba >> 16))
        , alpha(static_cast<uint8_t>(rgba >> 24))
    {
    }
};

// Round-trips a colour through SSE2: the four 8-bit channels are widened to
// four floats in an XMM register, then converted back and narrowed to bytes.
// Channel values 0..255 are exactly representable at every step, so the
// returned colour is expected to equal `col`.
// NOTE(review): relies on Color4 being exactly four bytes (four uint8_t
// members) so that memcpy of sizeof(tmp) bytes captures the whole colour.
Color4 PackUnpack(Color4 col)
{
    uint32_t tmp;

    // Copy the colour's bytes into an integer with memcpy rather than a
    // pointer cast or union.
    memcpy(&tmp, &col, sizeof(tmp));

    // Broadcast tmp to every 32-bit lane, interleave with zeros twice
    // (8 -> 16 -> 32 bits) so each channel of the low copy ends up
    // zero-extended in its own 32-bit lane, then convert the lanes to float.
    __m128 aFloat = _mm_cvtepi32_ps(
        _mm_unpacklo_epi16(
            _mm_unpacklo_epi8(
                _mm_set1_epi32(tmp),
                _mm_setzero_si128()
            ),
            _mm_setzero_si128()
        )
    );

    // Convert back to 32-bit integers, then narrow 32 -> 16 (signed
    // saturation) and 16 -> 8 (unsigned saturation); the upper halves are
    // filled from zero registers, so the four channels land in the low 32 bits.
    __m128i ret = _mm_packus_epi16(
        _mm_packs_epi32(
            _mm_cvtps_epi32(aFloat),
            _mm_setzero_si128()
        ),
        _mm_setzero_si128()
    );

    // Extract the low 32 bits and rebuild the colour from them.
    return Color4((uint32_t)_mm_cvtsi128_si32(ret));
}

// Reproducer driver: prints the build configuration, runs one fixed colour
// through PackUnpack, and prints the four resulting channels.
int main()
{
    // _DEBUG selects which banner is printed, identifying the build type.
#ifdef _DEBUG
    printf("DEBUG\n");
#else
    printf("RELEASE\n");
#endif

    // Constant input; the expected result is the same colour back.
    Color4 c = PackUnpack(Color4(32, 64, 128, 255));

    // Debug x64 or Debug/Release x86: Prints "32 64 128 255"
    // Release x64: Prints "255 0 0 0"
    printf("%d %d %d %d\n",  c.red, c.green, c.blue, c.alpha);

    return 0;
}

Release x64 output:

RELEASE
255 0 0 0

Debug x64 and all x86 output:

DEBUG
32 64 128 255

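Looking at the disassembly, the XMM register is loaded with the wrong constant where the result of _mm_set1_epi32 should be (see the movdqa instruction):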

main:
00007FF674391070  sub         rsp,38h  
00007FF674391074  lea         rcx,[string "RELEASE\n" (07FF674392200h)]  
00007FF67439107B  call        printf (07FF674391010h)  
00007FF674391080  movdqa      xmm0,xmmword ptr [__xmm@000000ff000000ff000000ff000000ff (07FF674392220h)]  
00007FF674391088  lea         rcx,[string "%d %d %d %d\n" (07FF674392210h)]  
00007FF67439108F  xorps       xmm2,xmm2  
00007FF674391092  mov         dword ptr [rsp+40h],0FF804020h  
00007FF67439109A  punpcklbw   xmm0,xmm2  
00007FF67439109E  punpcklwd   xmm0,xmm2  
00007FF6743910A2  cvtdq2ps    xmm0,xmm0  
00007FF6743910A5  cvtps2dq    xmm1,xmm0  
00007FF6743910A9  packssdw    xmm1,xmm2  
00007FF6743910AD  packuswb    xmm1,xmm2  
00007FF6743910B1  movd        r10d,xmm1  
00007FF6743910B6  mov         edx,r10d  
00007FF6743910B9  mov         r8d,r10d  
00007FF6743910BC  shr         edx,10h  
00007FF6743910BF  mov         eax,r10d  
00007FF6743910C2  shr         r8d,8  
00007FF6743910C6  movzx       r9d,dl  
00007FF6743910CA  shr         eax,18h  
00007FF6743910CD  movzx       edx,r10b  
00007FF6743910D1  movzx       r8d,r8b  
00007FF6743910D5  mov         dword ptr [rsp+20h],eax  
00007FF6743910D9  call        printf (07FF674391010h)  
00007FF6743910DE  xor         eax,eax  
00007FF6743910E0  add         rsp,38h  
00007FF6743910E4  ret  

With g++ 4.8.4 on Ubuntu 14.04 x64, the program produces the correct output both with and without -O3.

So, my question is: is this a compiler bug, or does my program rely on undefined/implementation-defined behaviour?

(Previously I used a punning union to convert between uint32_t and Color4; I replaced it with memcpy to rule that out, ... and nothing changed.)

+4
2

I reduced the example even further, and it looks like a compiler bug to me:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "emmintrin.h"

// Reduced reproducer: copies four bytes into a uint32_t, broadcasts it with
// _mm_set1_epi32, and prints both the scalar and lane 0 of the vector.
// The two printed values are expected to be identical.
int main()
{
    uint8_t src[4] = { 32, 64, 128, 255 };

    uint32_t tmp = 0;
    // Byte-wise copy of the array into tmp.
    memcpy( &tmp, &src, sizeof( tmp ) );    

    // Every 32-bit lane of `a` should now hold tmp.
    auto a = _mm_set1_epi32( tmp );

    printf( "tmp = 0x%08x\n", tmp );
    // NOTE(review): m128i_i32 appears to be the MSVC-specific member view of
    // __m128i, so this line is not portable to other compilers.
    printf( "a.m128i_i32[0] = 0x%08x\n", a.m128i_i32[0] );  

    return 0;
}

x86 output:

tmp = 0xff804020
a.m128i_i32[0] = 0xff804020

x64:

tmp = 0xff804020
a.m128i_i32[0] = 0x000000ff
+2

This is probably a compiler bug. As a workaround, assemble the value from the channels directly:

// Assemble the packed value from the channels using uint32_t operands, so no
// intermediate result overflows a signed int. The byte order matches the
// Color4(uint32_t) constructor: red is the lowest byte, alpha the highest.
tmp = (uint32_t)col.red | ((uint32_t)col.green << 8) | ((uint32_t)col.blue << 16) | ((uint32_t)col.alpha << 24);

memcpy is itself a form of type punning.

0

Source: https://habr.com/ru/post/1672280/


All Articles