Using the pxor instruction before an SSE cvtsi2ss instruction

Question

Using the pxor instruction before an SSE cvtsi2ss instruction

I am currently writing several implementations of a color-to-black-and-white image converter. I'd like to write:

A simple C++ implementation
A standalone ASM implementation
A standalone ASM implementation using AVX vector instructions

The goal is to compare each of them and analyze the performance improvement that I get.

The following code snippet is the C++ implementation. It processes only one part of the image, because I also want to split the work across multiple threads.

// Converts `size` consecutive samples of `src` to grayscale and stores them in
// channel 0 of `dst`.
//
// src   - source image; channels 0, 1 and 2 are read as red, green and blue.
// dst   - destination image; only channel 0 is written.
// pixel - forwarded as the second coordinate of CImg::data() for every plane
//         (NOTE(review): in CImg that is the y coordinate, i.e. a row index —
//         confirm against the caller).
// size  - number of samples to convert, starting at the position above.
void CBwConverter::run(const CImg<uint8_t> &src, CImg<uint8_t> &dst, uint32_t pixel, size_t size) const {
    // Start of each colour plane, plus the output plane, for the requested slice.
    const uint8_t *const red   = src.data(0, pixel, 0, 0);
    const uint8_t *const green = src.data(0, pixel, 0, 1);
    const uint8_t *const blue  = src.data(0, pixel, 0, 2);
    uint8_t *const gray        = dst.data(0, pixel, 0, 0);

    for (size_t i = 0; i < size; ++i) {
        // Weighted sum of the three channels; the float-to-integer conversion
        // truncates toward zero rather than rounding.
        gray[i] = static_cast<uint8_t>(0.299f * red[i] + 0.587f * green[i] + 0.114f * blue[i]);
    }
}

Now, before starting on the ASM version, I compiled and disassembled the C++ code to see what it looked like. After compiling with gcc -std=c++11 -g -O2 -c CBwConverter.cc, I got the following result with objdump -d CBwConverter.o:

0000000000000000 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm>:
   0:   53                      push   %rbx
   1:   8b 3e                   mov    (%rsi),%edi
   3:   89 c8                   mov    %ecx,%eax
   5:   44 8b 56 04             mov    0x4(%rsi),%r10d
   9:   44 8b 5e 08             mov    0x8(%rsi),%r11d
   d:   89 c9                   mov    %ecx,%ecx
   f:   48 8b 5e 18             mov    0x18(%rsi),%rbx
  13:   0f af c7                imul   %edi,%eax
  16:   4c 0f af d7             imul   %rdi,%r10
  1a:   4b 8d 34 1b             lea    (%r11,%r11,1),%rsi
  1e:   4c 8d 0c 03             lea    (%rbx,%rax,1),%r9
  22:   4c 89 d7                mov    %r10,%rdi
  25:   49 0f af fb             imul   %r11,%rdi
  29:   4c 0f af d6             imul   %rsi,%r10
  2d:   48 01 c7                add    %rax,%rdi
  30:   4c 01 d0                add    %r10,%rax
  33:   48 01 df                add    %rbx,%rdi
  36:   48 8d 34 03             lea    (%rbx,%rax,1),%rsi
  3a:   8b 02                   mov    (%rdx),%eax
  3c:   48 0f af c8             imul   %rax,%rcx
  40:   48 03 4a 18             add    0x18(%rdx),%rcx
  44:   4d 85 c0                test   %r8,%r8
  47:   74 6b                   je     b4 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0xb4>
  49:   31 d2                   xor    %edx,%edx
  4b:   f3 0f 10 25 00 00 00    movss  0x0(%rip),%xmm4        # 53 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0x53>
  52:   00 
  53:   f3 0f 10 1d 00 00 00    movss  0x0(%rip),%xmm3        # 5b <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0x5b>
  5a:   00 
  5b:   f3 0f 10 15 00 00 00    movss  0x0(%rip),%xmm2        # 63 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0x63>
  62:   00 
  63:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  68:   41 0f b6 04 11          movzbl (%r9,%rdx,1),%eax
  6d:   66 0f ef c0             pxor   %xmm0,%xmm0
  71:   f3 0f 2a c0             cvtsi2ss %eax,%xmm0
  75:   0f b6 04 17             movzbl (%rdi,%rdx,1),%eax
  79:   0f 28 c8                movaps %xmm0,%xmm1
  7c:   66 0f ef c0             pxor   %xmm0,%xmm0
  80:   f3 0f 59 cc             mulss  %xmm4,%xmm1
  84:   f3 0f 2a c0             cvtsi2ss %eax,%xmm0
  88:   0f b6 04 16             movzbl (%rsi,%rdx,1),%eax
  8c:   f3 0f 59 c3             mulss  %xmm3,%xmm0
  90:   f3 0f 58 c1             addss  %xmm1,%xmm0
  94:   66 0f ef c9             pxor   %xmm1,%xmm1
  98:   f3 0f 2a c8             cvtsi2ss %eax,%xmm1
  9c:   f3 0f 59 ca             mulss  %xmm2,%xmm1
  a0:   f3 0f 58 c1             addss  %xmm1,%xmm0
  a4:   f3 0f 2c c0             cvttss2si %xmm0,%eax
  a8:   88 04 11                mov    %al,(%rcx,%rdx,1)
  ab:   48 83 c2 01             add    $0x1,%rdx
  af:   49 39 d0                cmp    %rdx,%r8
  b2:   75 b4                   jne    68 <_ZNK12CBwConverter3runERKN12cimg_library4CImgIhEERS2_jm+0x68>
  b4:   5b                      pop    %rbx
  b5:   c3                      retq