With SSSE3, the simplest way is pshufb with an all-zero shuffle-control mask, which broadcasts the low byte to every element:
; SSSE3
pshufb xmm0, xmm1 ; where xmm1 is zeroed, e.g. with pxor xmm1,xmm1
Without SSSE3:
; SSE2 only
punpcklbw xmm0, xmm0 ; xxxxxxxxABCDEFGH -> xxxxxxxxEEFFGGHH
pshuflw xmm0, xmm0, 0 ; xxxxxxxxEEFFGGHH -> xxxxxxxxHHHHHHHH
punpcklqdq xmm0, xmm0 ; xxxxxxxxHHHHHHHH -> HHHHHHHHHHHHHHHH
Alternatively, punpcklbw / punpcklwd followed by pshufd xmm0, xmm0, 0 does the same job with a different mix of shuffles (see the sketch after this paragraph). Which of the two 3-instruction sequences is faster depends on the microarchitecture: shuffle performance varies a lot on older CPUs such as Merom and K8, so check the instruction tables at http://agner.org/optimize/ if those targets matter to you. When in doubt, use the pshuflw version.
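A sketch of that alternative, using the same register-content notation as above:

; SSE2 only, alternative ending with pshufd
punpcklbw xmm0, xmm0      ; xxxxxxxxABCDEFGH -> xxxxxxxxEEFFGGHH
punpcklwd xmm0, xmm0      ; xxxxxxxxEEFFGGHH -> xxxxxxxxGGGGHHHH
pshufd    xmm0, xmm0, 0   ; broadcast the low dword: HHHHHHHHHHHHHHHH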
Without pshufb, another trick is to multiply the byte by 0x01010101, which repeats it 4 times within a 32-bit register, and then broadcast that dword:
; movzx eax, whatever
imul edx, eax, 0x01010101 ; edx = al repeated 4 times
movd xmm0, eax
pshufd xmm0, xmm0, 0
Note that imul with an immediate is the 3-operand form, so it doesn't clobber the source register; and because the byte was zero-extended to 32 bits, the product fits in 32 bits and is exactly the byte repeated 4 times.
This version is handy when the byte starts out in a general-purpose register rather than in memory, since movd is a cheap way to get the repeated dword into an xmm register. (pinsrb exists too, but it requires SSE4.1 and is usually no cheaper than movd, so it doesn't buy you anything here.)
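With SSSE3 and the byte already in eax, the whole thing reduces to movd plus the pshufb from the top of this answer; a minimal sketch:

; SSSE3: byte in the low 8 bits of eax
movd   xmm0, eax            ; upper bytes of eax don't matter
pxor   xmm1, xmm1           ; all-zero shuffle-control mask
pshufb xmm0, xmm1           ; broadcast byte 0 to all 16 bytes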
If throughput is more of a problem than latency, consider pmuludq when you can't use pshufb, even though it has 5-cycle latency on most CPUs:
; low 32 bits of xmm0 = your byte, zero-extended
pmuludq xmm0, xmm7 ; xmm7 = 0x01010101 in the low 32 bits
pshufd xmm0, xmm0, 0
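This assumes xmm7 already holds the multiplier constant; a minimal one-time setup (done once, outside any loop) could be:

; one-time setup of the multiplier constant
mov    eax, 0x01010101
movd   xmm7, eax            ; 0x01010101 in the low 32 bits, upper bits zeroed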