Using inline assembly (Intel) to implement bigint addition with carry

I would like to write fast code that adds big integers stored as arrays of 64-bit limbs:

uint64_t ans[n];
uint64_t a[n], b[n]; // assume initialized values....
for (int i = 0; i < n; i++)
  ans[i] = a[i] + b[i];

but the above does not handle the carry.

I saw another question that suggested using a comparison to detect the carry, which is elegant:

// Ripple-carry addition, least-significant limb first.
// A single loop starting with carry-in 0 replaces the peeled first limb, so
// limb 0 is no longer recomputed with its own carry-out added to it.
int c = 0;  // carry out of the previous limb (0 or 1)
for (int i = 0; i < n; i++) {
  const uint64_t lhs = a[i];            // read before storing, in case ans aliases a
  const uint64_t partial = lhs + b[i];  // may wrap modulo 2^64
  const uint64_t sum = partial + c;     // may wrap only when partial == UINT64_MAX
  // At most one of the two additions can wrap, so the carry-out is their OR.
  // Comparing only the final sum against a[i] misses the case
  // c == 1 && b[i] == UINT64_MAX, where the sum equals a[i] exactly.
  c = (partial < lhs) || (sum < partial);
  ans[i] = sum;
}

However, I would like to learn how to embed inline assembly (Intel) and make it faster. I am sure there are 64-bit opcodes, the equivalent of:

add eax, ebx
adc ...

but I don't know how to pass parameters to the assembly from the rest of the C++ code.

+3
source share
2 answers

but the above does not handle the carry.

, GCC , ADC, , .

. , , , .

// Element-wise 64-bit addition: ans[k] = a[k] + b[k] for every k in [0, n).
// Each element is summed independently (wrapping modulo 2^64); no carry is
// passed between elements. Does nothing when n <= 0.
void Test(uint64_t* a, uint64_t* b, uint64_t* ans, int n)
{
    int idx = 0;
    while (idx < n)
    {
        const uint64_t sum = a[idx] + b[idx];
        ans[idx] = sum;
        ++idx;
    }
}

, GCC , .

Godbolt , C (, , ; , ). - , 64- . GCC SSE2 . , MOVDQU ( XMM), PADDQ ( ) MOVQ ( XMM ). , MOVDQU , 64- , PADDQ , MOVQ .

, , , GCC for. (-fno-tree-vectorize), , , , . (, . MOVQ , , , MOVDQU.)

, SSE2 (-mno-sse2), , . , SSE2, x86 64- , - ADD + ADC.

, , . , GCC , , -O2 -O3. -O1 ADD + ADC. , . (, , GCC ). . , .)

, Clang , GCC .


, , . , , , GCC ADC.

, x86-32. x86-64, 64- , ""; ADD , . , "bigint" 32- , x86-32 .

Ped7g , , , ADD + ADC. , , , , , , ADC. . , . -, , , .

( , c, , . , GCC XOR 64- CDQ , , .)

( , , GCC c . , . C, GCC , .)


, (intel) .

, , , ADC. , , !

, , , , , . , , , . , , , , C.

, -, , GCC. ; . , , . , "extended asm" , , ++ ".

, . , , ADC. - - , C, Right Thing ™.

_addcarry_u32 _addcarry_u64 intrinsics. ADCX ADOX. ADC, . Intel ADX, Broadwell. , Broadwell , ADCX ADOX . , , . , , , .


, 64- , : ADD + ADC

64- ADD ADC ( ADCX ADOX), 64- . 128- "bigint", .

x86-32 64- . SSE2, GCC Clang.

+2

, , , (, ), ADC .

C++ for; asm, , CF . (GCC6 , ; FLAGS asm , gcc, , setc/cmp, .)

#include <cstdint>
#include <iostream>

#define N 4

// Demo: adds two N-limb big integers (limb 0 is the least significant) with a
// hand-written x86-64 ADC loop, then prints every limb of the sum.
int main(int argc, char *argv[]) {
  // The asm loop is do/while-shaped: it always runs once before testing the
  // counter, so zero limbs would make the counter wrap around.
  static_assert(N > 0, "the ADC loop needs at least one limb");

  uint64_t ans[N];
  const uint64_t a[N] = {UINT64_MAX, UINT64_MAX, 0, 0};
  const uint64_t b[N] = {2, 1, 3, 1};

  const uint64_t limb_count = N;
  // Operands: %0 = ans, %1 = a, %2 = b, %3 = limb_count.
  // rax = byte offset into the arrays, rdi = limbs left, rdx = scratch limb.
  // The loop label is a numeric local label ("1:" / "1b") so the assembler
  // never sees a duplicate symbol if this asm statement is emitted twice or
  // another asm block picks the same name.
  asm volatile (
      "xor %%eax, %%eax\n\t"        // rax = 0 (upper half zeroed too) and CF = 0
      "mov %3, %%rdi\n\t"           // rdi = number of limbs left

      "1:\n\t"                      // do {

      "mov (%%rax,%1), %%rdx\n\t"   //   rdx = a[limb]

      "adc (%%rax,%2), %%rdx\n\t"   //   rdx += b[limb] + CF

      "mov %%rdx, (%%rax,%0)\n\t"   //   ans[limb] = rdx

      "lea 8(%%rax), %%rax\n\t"     //   offset += 8 bytes; LEA leaves the flags alone

      "dec %%rdi\n\t"               //   --limbs_left; DEC sets ZF but preserves CF

      "jnz 1b\n\t"                  // } while (limbs_left != 0);
      : /* Outputs (none; ans is written through memory) */
      : /* Inputs */ "r" (ans), "r" (a), "r" (b), "r" (limb_count)
      : /* Clobbered */ "rax", "rdx", "rdi", "cc", "memory"
  );

  // SHOULD OUTPUT 1 1 4 1
  for (const uint64_t limb : ans)
    std::cout << limb << '\n';

  return 0;
}

carry flag (CF), 0, CMP. DEC carry flag, . , , %rdi , inc %rax.

volatile "memory" , , .

, Core2/Nehalem, adc inc . . ADC/SBB INC/DEC . .

: @PeterCordes, inc %rax 8 ( , ). lea 8(%rax), %rax.


: , , 0 inc/jnz.

( 4. , -i . .)

// untested
// Variant of the ADC loop that counts a negative index up towards zero, so
// the loop needs only inc/jnz and no separate counter. Hard-coded for 4 limbs:
// the index starts at -3 and the 8*4 displacement points one past the arrays.
// Requires `i` to be a modifiable 64-bit integer variable (it is overwritten).
  asm volatile (
      "mov   $-3, %[idx]\n\t"        // i=-3   (which we will scale by 8)

      "mov   (%[a]), %%rdx  \n\t"
      "add   (%[b]), %%rdx  \n\t"    // peel the first iteration so we don't have to zero CF first, and ADD is faster on some CPUs.
      "mov    %%rdx, (%[ans]) \n\t"  // ans[0] = rdx; must name [ans], because %0 is the [idx] output operand

      "1:\n\t"                              // do{   (numeric local label: safe to emit more than once)
      "mov    8*4(%[a], %[idx], 8), %%rdx\n\t"   // rdx = a[i + len]
      "adc    8*4(%[b], %[idx], 8), %%rdx\n\t"   // rdx += b[i + len] + carry
      "mov    %%rdx,  8*4(%[ans], %[idx], 8)\n\t"  // ans[i + len] = rdx

      "inc    %[idx]\n\t"                   // INC sets ZF but leaves CF intact for the next ADC
      "jnz    1b\n\t"                       // }while (++i);

      : /* Outputs, actually a read-write input */ [idx] "+&r" (i)
      : /* Inputs */ [ans] "r" (ans), [a] "r" (a), [b] "r" (b)
      : /* Clobbered */ "rdx", "cc", "memory"
  );

, , %%= , GCC , , 1:

-i ndex , (2 ), . adc , , ans .

LEA 8, CF. , Haswell AGU 7, Sandybridge/Ivybridge 2 . Intel SnB , 2x + 1x . . Micro fusion

Intel (Core2/Nehalem) , .

AMD, , . Agner Fog .

, , AMD Intel.

0

Source: https://habr.com/ru/post/1663289/


All Articles