MASM is superior to unoptimised.cpp but not unoptimised.c using VS

Question

MASM is superior to unoptimised.cpp but not unoptimised.c using VS

I have a very simple function that multiplies a vector (float *) by a row-major matrix stored as an array of row pointers (float **):

/*
 * Multiplies the size x size matrix m (an array of row pointers) by the
 * vector v and stores the product in out:
 *
 *     out[i] = m[i][0] * v[0] + ... + m[i][size-1] * v[size-1]
 *
 * Returns -1 if any of the three pointers is null, 0 otherwise.
 * A size of zero or less writes nothing and still returns 0.
 */
int vector_by_matrix(float** m, float* v, float* out, int size)
{
    int i, j;
    float temp;

    if (!m || !v || !out) return -1;

    for (i = 0; i < size; i++)
    {
        temp = 0;

        for (j = 0; j < size; j++)
        {
            temp += m[i][j] * v[j];
        }

        /* Plain dot product of row i with v. An earlier revision of this
           snippet wrongly scaled the result by v[i] (copying mistake). */
        out[i] = temp;
    }

    return 0;
}

++ (x64) Visual Studio (2013) ++; ( / , c. size = 10000). (O2) , , (x20). , .c C VS - . ( ++) . , .

, C (/). C (/++), - MASM, , . VS C? , , . MASM, :

 ; mul_vector_by_martix - scalar SSE matrix-by-vector product, the hand-written
 ; counterpart of the C function vector_by_matrix.
 ;
 ; Register use, as read from the addressing below:
 ;   rcx = array of 8-byte row pointers
 ;   rdx = input vector  (4-byte floats)
 ;   r8  = output vector (4-byte floats)
 ;   r9  = element count; both loops count down from r9 to 1, so it must be
 ;         non-zero (there is no guard for an empty input)
 ;
 ; rbx holds the current row pointer. In the Microsoft x64 calling convention
 ; rbx is nonvolatile, so it is saved and restored here. r10 and r11 are
 ; volatile scratch registers and need no preservation.
 mul_vector_by_martix proc

    mov r10, r9                     ; r10 = outer (row) counter, size..1

    sub rsp, 8

    mov qword ptr[rsp], rbx         ; preserve the caller's rbx

    LI:
        MOV rbx, qword ptr[r10*8+rcx[0]-8]      ; rbx = pointer to row r10-1

        XORPS xmm0, xmm0            ; clear the running sum for this row

        mov r11, r9                 ; r11 = inner (column) counter, size..1

        LJ:

            MOVSS xmm1, dword ptr[r11*4+rbx[0]-4]   ; row[r11-1]
            MULSS xmm1, dword ptr[r11*4+rdx[0]-4]   ; * v[r11-1]
            ADDSS xmm0, xmm1                        ; accumulate

            sub r11, 1

        jnz LJ

        MOVSS dword ptr[r10*4+r8[0]-4], xmm0    ; out[r10-1] = row sum

        sub r10, 1
    jnz LI

    mov rbx, qword ptr[rsp]         ; restore the caller's rbx

    add rsp, 8

    ret

mul_vector_by_martix endp

- ;)

.

Update

, . ( , 4, , , ):

; mul_opt_vector_by_martix - packed SSE matrix-by-vector product that handles
; four floats per inner-loop step.
;
; Register use, as read from the addressing below:
;   rcx = array of 8-byte row pointers
;   rdx = input vector (copied to r10 because mul/div overwrite rdx)
;   r8  = output vector
;   r9  = element count, doubling as the outer counter (counts down to 1)
;
; VSIZE and LSTEP are globals defined outside this procedure. They cache the
; last element count and the matching starting byte offset, so the offset is
; recomputed only when the count changes.
;
; NOTE(review): MOVAPS and the MULPS memory operand require 16-byte-aligned
; addresses, so every row and the input vector must be 16-byte aligned. The
; step count is floor(size * 4 / 16), so the count is expected to be a
; non-zero multiple of 4; trailing elements would be ignored and a count
; below 4 would start the inner loop at offset -16.
mul_opt_vector_by_martix proc

    ; Preserve every nonvolatile register this routine overwrites. rbx is
    ; used as the row pointer and is nonvolatile in the Microsoft x64
    ; calling convention, just like r12 and r13.
    sub rsp, 8
    mov qword ptr[rsp], r12
    sub rsp, 8
    mov qword ptr[rsp], r13
    sub rsp, 8
    mov qword ptr[rsp], rbx

    ; copy rdx for arithmetic operations
    mov r10, rdx

    ; init static global
    mov r12, LSTEP

    ; same element count as the cached one: reuse the cached offset
    cmp VSIZE, r9
    je LOOPS

    ; get sizeof(vector)
    mov rax, 4
    mul r9
    mov r12, rax

    ; get the number of steps in inner loop
    mov r11, 16
    mov rax, r12
    div r11

    mov r11, rax

    mov r12, r11

    ; r12 = steps * 16 - 16 = byte offset of the last 16-byte chunk
    mov rax, 16
    mul r12
    mov r12, rax
    sub r12, 16

    ; refresh the cache
    mov VSIZE, r9
    mov LSTEP, r12

LOOPS:

    LI:

        MOV rbx, qword ptr[r9*8+rcx[0]-8]       ; rbx = pointer to row r9-1

        XORPS xmm0, xmm0                        ; clear the scalar row sum

        mov r13, r12                            ; r13 = byte offset, last chunk first

        LJ:

            MOVAPS xmm1, xmmword ptr[r13+rbx[0]]    ; four row elements
            MULPS xmm1, xmmword ptr[r13+r10[0]]     ; times four vector elements

            ; add the packed single floating point numbers together
            MOVHLPS xmm2, xmm1
            ADDPS xmm2, xmm1
            MOVAPS xmm1, xmm2
            SHUFPS xmm2, xmm2, 1 ; imm8 = 00 00 00 01
            ADDSS xmm2, xmm1
            ADDSS xmm0, xmm2                        ; accumulate the chunk total

            sub r13, 16

        ; signed compare so that offset 0 is still processed
        cmp r13, 0
        JGE LJ

        MOVSS dword ptr[r9*4+r8[0]-4], xmm0     ; out[r9-1] = row sum

        sub r9, 1
    jnz LI

    ; Restore in reverse order of the saves above.
    mov rbx, qword ptr[rsp]
    add rsp, 8
    mov r13, qword ptr[rsp]
    add rsp, 8
    mov r12, qword ptr[rsp]
    add rsp, 8

    ret

mul_opt_vector_by_martix endp

20-30%, C. :

                sum += v[j] * m[i][j];
 movsxd      rax,r8d  
 add         rdx,8  
 movups      xmm0,xmmword ptr [rbx+rax*4]  
 movups      xmm1,xmmword ptr [r10+rax*4]  
 lea         eax,[r8+4]  
 movsxd      rcx,eax  
 add         r8d,8  
 mulps       xmm1,xmm0  
 movups      xmm0,xmmword ptr [rbx+rcx*4]  
 addps       xmm2,xmm1  
 movups      xmm1,xmmword ptr [r10+rcx*4]  
 mulps       xmm1,xmm0  
 addps       xmm3,xmm1  
 cmp         r8d,r9d  
 jl          vector_by_matrix+90h (07FEDD321440h)  
 addps       xmm2,xmm3  
 movaps      xmm1,xmm2  
 movhlps     xmm1,xmm2 
addps       xmm1,xmm2
movaps      xmm0,xmm1  
 shufps      xmm0,xmm1,0F5h  
 addss       xmm1,xmm0

, , . ++, , , , ++ , C VS. , Frankie_C . , - , - ; C, . , , , , , - .

Update 2 ============================================================================================ ========================

, , , , . , , . , - . , - "" 4 ( ).

; Revised loop body only: r10 and r12 are not assigned inside this fragment,
; so they must be set up before LOOPS (presumably as in
; mul_opt_vector_by_martix - confirm against the full procedure).
; The four packed partial sums now stay in xmm0 for the whole inner loop and
; are collapsed to a single scalar once per row instead of once per step.
LOOPS:

    LI:

        ; load the 8-byte entry r9-1 from the table at rcx (the row pointer)
        MOV rbx, qword ptr[r9*8+rcx[0]-8]

        ; clear all four packed accumulators
        XORPS xmm0, xmm0

        ; r13 = running byte offset, starting from the value held in r12
        mov r13, r12

        LJ:

            ; 16 bytes from the row times 16 bytes from the buffer at r10
            MOVAPS xmm1, xmmword ptr[r13+rbx[0]]
            MULPS xmm1, xmmword ptr[r13+r10[0]]

            ; just add and accrue
            ADDPS xmm0, xmm1

            sub r13, 16

        ; signed compare: the step at offset 0 is still executed
        cmp r13, 0
        jge LJ

        ;------------ moved this block to the outside --------------;

        ; add the packed single floating point numbers together
        ; (high pair onto low pair, then lane 1 onto lane 0; the total ends
        ; up in the low lane of xmm0)
        MOVHLPS xmm1, xmm0
        ADDPS xmm1, xmm0
        MOVAPS xmm0, xmm1
        SHUFPS xmm1, xmm1, 1 ; imm8 = 00 00 00 01
        ADDSS xmm0, xmm1

        ;--------------------end block---------------------------

        ; store the scalar total as 4-byte element r9-1 of the buffer at r8
        MOVSS dword ptr[r9*4+r8[0]-4], xmm0

        ; r9 counts down; the loop ends when it reaches zero
        sub r9, 1
    jnz LI

- , . , VS-, C - ( ) , gcc. , SIMD, xmm registers. , , , .

+4

c++ performance optimization c visual-studio-2012

cdcdcd 16 . '16 23:30

1

sehe · Answer 1 · 2016-01-17T00:33:44+0000

, .

, clang, , , , vector_by_matrix.

#include <algorithm>
#include <numeric>

// Sets up a 512 x 512 matrix (one heap-allocated row per entry) and a
// 512-element vector, fills them with increasing values, then frees the rows.
// The multiplication call itself is intentionally left commented out.
int main() {
    constexpr auto N = 512;

    float* m[N];
    for (auto& row : m) {
        row = new float[N];
    }

    float v[N], out[N];

    // Each row starts 0.1 higher than the previous one and counts up by 1.
    float start = 0.0;
    for (float* row : m) {
        start += 0.1;
        std::iota(row, row + N, start);
    }
    std::iota(v, v + N, -1.0f);

    //vector_by_matrix(m, v, out, N);

    for (float* row : m) {
        delete[] row;
    }
}

, , .

, , . (, vector_by_matrix , :)).

, - , , , , , .

. :

/*
 * Multiplies the size x size matrix m (an array of row pointers) by the
 * vector v and stores the product in out:
 *
 *     out[i] = m[i][0] * v[0] + ... + m[i][size-1] * v[size-1]
 *
 * Returns -1 if any of the three pointers is null, 0 otherwise.
 * A size of zero or less writes nothing and still returns 0.
 */
int vector_by_matrix(float** m, float *const v, float *out, int size) {
    int i, j;
    float temp;

    if (!m || !v || !out)
        return -1;

    for (i = 0; i < size; i++) {
        temp = 0;

        for (j = 0; j < size; j++) {
            temp += m[i][j] * v[j];
        }

        /* Plain dot product of row i with v. The extra "* v[i]" factor was
           flagged by the question's author as a copying mistake. */
        out[i] = temp;
    }

    return 0;
}

#include <algorithm>
#include <numeric>

// Sets up a 512 x 512 matrix (one heap-allocated row per entry) and a
// 512-element vector, fills them with increasing values, runs the
// matrix-by-vector product once, then frees the rows.
int main() {
    constexpr auto N = 512;

    float* m[N];
    for (auto& row : m) {
        row = new float[N];
    }

    float v[N], out[N];

    // Each row starts 0.1 higher than the previous one and counts up by 1.
    float start = 0.0;
    for (float* row : m) {
        start += 0.1;
        std::iota(row, row + N, start);
    }
    std::iota(v, v + N, -1.0f);

    vector_by_matrix(m, v, out, N); // NO DIFFERENCE IF COMMENTED

    for (float* row : m) {
        delete[] row;
    }
}

MASM is superior to unoptimised.cpp but not unoptimised.c using VS

Update

More articles: