I came across some weird behavior while working on a SIMD color lerp function, and I trimmed it down to a minimal program. The SIMD code in this example no longer performs a lerp; it just unpacks a 32-bit color into an XMM register and then packs it back into 32 bits.
With MSVC++ 2015 (Update 3) in Release x64 mode, the following code does not give the correct result, whereas in Debug x64 and in Release/Debug x86 it works correctly. This is the only code in an otherwise empty Win32 C++ console project:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "emmintrin.h"
// RGBA color with one 8-bit channel per component.
// Members are declared in the order red, green, blue, alpha.
struct Color4
{
uint8_t red;
uint8_t green;
uint8_t blue;
uint8_t alpha;
// Builds a color from individual channel values; alpha defaults to 255
// when omitted.
Color4(uint8_t red, uint8_t green, uint8_t blue, uint8_t alpha = 255)
: red(red), green(green), blue(blue), alpha(alpha) {}
// Builds a color from a packed 32-bit value. Red is taken from bits 0-7,
// green from bits 8-15, blue from bits 16-23 and alpha from bits 24-31.
// Marked explicit so a plain integer is never converted implicitly.
explicit Color4(uint32_t rgba)
{
red = (uint8_t)(rgba & 0xFF);
green = (uint8_t)((rgba >> 8)&0xFF);
blue = (uint8_t)((rgba >> 16) & 0xFF);
alpha = (uint8_t)((rgba >> 24) & 0xFF);
}
};
// Round-trips a color through SSE2 registers: the four 8-bit channels are
// widened to four 32-bit lanes, converted to float, converted back to
// integers and narrowed to 8 bits again. The returned color is expected to
// carry the same channel values as the input.
Color4 PackUnpack(Color4 col)
{
uint32_t tmp;
// Copy the first sizeof(tmp) bytes of col into an integer instead of
// casting pointers.
// NOTE(review): assumes sizeof(Color4) == sizeof(uint32_t); nothing in this
// function checks that.
memcpy(&tmp, &col, sizeof(tmp));
// Broadcast tmp to all four 32-bit lanes, then interleave with zero twice
// (8 -> 16 bits, then 16 -> 32 bits). That zero-extends the low four bytes,
// i.e. the bytes of tmp, into one 32-bit lane each before the int -> float
// conversion.
__m128 aFloat = _mm_cvtepi32_ps(
_mm_unpacklo_epi16(
_mm_unpacklo_epi8(
_mm_set1_epi32(tmp),
_mm_setzero_si128()
),
_mm_setzero_si128()
)
);
// Reverse path: float -> 32-bit ints, pack to 16 bits with signed
// saturation, then pack to 8 bits with unsigned saturation. The upper
// halves are filled from zero vectors, so only the low four bytes of ret
// carry channel data.
__m128i ret = _mm_packus_epi16(
_mm_packs_epi32(
_mm_cvtps_epi32(aFloat),
_mm_setzero_si128()
),
_mm_setzero_si128()
);
// Extract the low 32 bits and rebuild a Color4 through the packed-value
// constructor.
return Color4((uint32_t)_mm_cvtsi128_si32(ret));
}
// Prints which build configuration is running (selected by the _DEBUG
// macro), then round-trips a fixed color through PackUnpack and prints the
// resulting channels. Always returns 0.
int main()
{
#ifdef _DEBUG
printf("DEBUG\n");
#else
printf("RELEASE\n");
#endif
// The input is a compile-time constant; a correct round trip prints
// "32 64 128 255".
Color4 c = PackUnpack(Color4(32, 64, 128, 255));
printf("%d %d %d %d\n", c.red, c.green, c.blue, c.alpha);
return 0;
}
Release x64 output:
RELEASE
255 0 0 0
Debug x64 and all x86 output:
DEBUG
32 64 128 255
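Looking at the disassembly, the XMM register is not loaded with the correct value for the _mm_set1_epi32 call (see the movdqa instruction):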
main:
00007FF674391070 sub rsp,38h
00007FF674391074 lea rcx,[string "RELEASE\n" (07FF674392200h)]
00007FF67439107B call printf (07FF674391010h)
00007FF674391080 movdqa xmm0,xmmword ptr [__xmm@000000ff000000ff000000ff000000ff (07FF674392220h)]
00007FF674391088 lea rcx,[string "%d %d %d %d\n" (07FF674392210h)]
00007FF67439108F xorps xmm2,xmm2
00007FF674391092 mov dword ptr [rsp+40h],0FF804020h
00007FF67439109A punpcklbw xmm0,xmm2
00007FF67439109E punpcklwd xmm0,xmm2
00007FF6743910A2 cvtdq2ps xmm0,xmm0
00007FF6743910A5 cvtps2dq xmm1,xmm0
00007FF6743910A9 packssdw xmm1,xmm2
00007FF6743910AD packuswb xmm1,xmm2
00007FF6743910B1 movd r10d,xmm1
00007FF6743910B6 mov edx,r10d
00007FF6743910B9 mov r8d,r10d
00007FF6743910BC shr edx,10h
00007FF6743910BF mov eax,r10d
00007FF6743910C2 shr r8d,8
00007FF6743910C6 movzx r9d,dl
00007FF6743910CA shr eax,18h
00007FF6743910CD movzx edx,r10b
00007FF6743910D1 movzx r8d,r8b
00007FF6743910D5 mov dword ptr [rsp+20h],eax
00007FF6743910D9 call printf (07FF674391010h)
00007FF6743910DE xor eax,eax
00007FF6743910E0 add rsp,38h
00007FF6743910E4 ret
With g++ 4.8.4 on Ubuntu 14.04 x64, the same code compiled with -O3 runs correctly.
So, my question: is this a compiler bug, or am I relying on undefined/implementation-defined behavior?
(I have also tried other variants: type punning through a union, passing a uint32_t instead of a Color4, using memcpy, ... with the same result.)