memcpy that moves 128 bits in Linux

I am writing a Linux device driver for a PCIe device. This device driver performs several read and write operations to measure throughput. When I use memcpy, the maximum payload of a TLP is 8 bytes (on 64-bit architectures). In my opinion, the only way to get a payload of 16 bytes is to use the SSE instruction set. I have already seen this, but the code does not compile (AT&T vs. Intel syntax problem).

  • Is there any way to use this code inside Linux?
  • Does anyone know where I can find a memcpy implementation that moves 128 bits at a time?
+3
3 answers

First, you are probably using GCC as the compiler, and GCC uses the asm statement for inline assembly. (When using it, the assembly code is passed as a string literal, which is copied verbatim into the compiler's assembly output, so it must be syntactically valid assembly.)

Second, you will probably have to use AT&T syntax for the assembly.

Third, GCC uses extended asm to pass variables between the assembly code and C, as in the sketch below.
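
For illustration, a minimal sketch of extended asm in AT&T syntax, assuming SSE2 is available (copy16 is a hypothetical helper, not code from the question):

static inline void copy16(void *dst, const void *src)
{
    /* One 16-byte copy through xmm0. The C variables enter through the
       "r" constraints; xmm0 and memory are declared clobbered so the
       compiler generates correct code around the statement. */
    asm volatile("movdqu (%0), %%xmm0\n\t"
                 "movdqu %%xmm0, (%1)"
                 : /* no outputs */
                 : "r"(src), "r"(dst)
                 : "xmm0", "memory");
}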

Fourth, you should avoid inline assembly whenever possible anyway, because the compiler cannot schedule instructions past an asm statement (at least this used to be true). Instead, you can use GCC extensions such as the vector_size attribute:

typedef float v4sf __attribute__((vector_size(16)));

void fubar( v4sf *p, v4sf* q )
{
  v4sf p0 = *p++;
  v4sf p1 = *p++;
  v4sf p2 = *p++;
  v4sf p3 = *p++;

  *q++ = p0;
  *q++ = p1;
  *q++ = p2;
  *q++ = p3;
}

This has the advantage that the compiler will produce working code even if you compile for a processor that does not have the mmx/xmm registers but perhaps some other 128-bit registers (or no vector registers at all).

Fifth, you should check whether the provided memcpy is not already fast enough. The stock memcpy is usually heavily optimized.

Sixth, take precautions if you use special registers in the Linux kernel: there are registers that the kernel does not save on context switch, and the SSE registers are among them.

Seventh, since you are using this to measure throughput, consider whether the processor is a significant bottleneck at all. Compare the actual execution of the code against the reads from/writes to RAM (do you hit or miss the cache?) and against the reads from/writes to the peripheral.

Eighth, when moving data you should avoid moving big chunks from RAM to RAM, and if the transfer is to/from a peripheral with limited bandwidth, you should definitely consider using DMA for it (see the sketch below). Remember that if access time is what limits performance, the CPU will still be counted as busy (even though it cannot run at 100% speed).
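
As a minimal sketch, assuming a PCI driver where pdev has already been enabled (the buffer size and how the bus address gets programmed into the device are hypothetical):

#include <linux/pci.h>
#include <linux/dma-mapping.h>

/* Allocate a coherent DMA buffer instead of copying with the CPU.
   The device's DMA engine moves the data, so the CPU's load/store
   width (8B vs. 16B) no longer matters. */
static void *alloc_dma_buf(struct pci_dev *pdev, size_t size,
                           dma_addr_t *bus_addr)
{
    void *cpu_addr = dma_alloc_coherent(&pdev->dev, size, bus_addr,
                                        GFP_KERNEL);
    /* On success, program *bus_addr into the device's DMA registers. */
    return cpu_addr;
}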

+7

Other answers discuss whether the OP really needs 16B transfers at all, and whether Linux's memcpy doing 8B moves is actually what limits the PCIe payload.

For writes to MMIO, also look at movnti non-temporal (write-combining) stores. The source operand of movnti is a GP register, so no vector registers are needed at all.
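
A minimal sketch of such a store (the wrapper name is mine; the destination is assumed to be a mapped write-combining MMIO region):

static inline void movnti_write64(volatile void *dst, unsigned long val)
{
    /* One 64-bit non-temporal store. The data comes from a
       general-purpose register, so no SSE state is touched and no
       kernel_fpu_begin()/kernel_fpu_end() is needed. */
    asm volatile("movnti %1, %0"
                 : "=m" (*(volatile unsigned long *)dst)
                 : "r" (val));
}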

If you want to use intrinsics, #include <immintrin.h> declares all of them. Just make sure you compile with target options that enable the instructions you use (e.g. -msse2), or the compiler will reject the intrinsics.


For ordinary memory (where rep movs can be used), Linux's memcpy is already well optimized: on x86-64 the kernel chooses between rep movsq and rep movsb, depending on the CPU.

The 32-bit memcpy does the bulk of the copy with rep movsl (AT&T syntax for rep movsd), then handles the last up-to-3 bytes with rep movsw and movsb. (Yes, that tail handling is silly, IMO; just finishing the remainder with rep movsb would be simpler.)
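
For reference, a rep movsb copy is trivial to express in GCC extended asm (a generic sketch, not the kernel's actual implementation):

#include <stddef.h>

static inline void copy_rep_movsb(void *dst, const void *src, size_t n)
{
    /* "+D"/"+S"/"+c" pin dst, src and the count to rdi/rsi/rcx,
       which is exactly what rep movsb consumes. */
    asm volatile("rep movsb"
                 : "+D" (dst), "+S" (src), "+c" (n)
                 :
                 : "memory");
}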

Intel has shipped optimized rep movs microcode since P6.

So a memcpy that moves data 64 bits at a time is not necessarily a bottleneck for ordinary RAM-to-RAM copies.

In any case, all of this describes what Linux's memcpy does for ordinary cacheable memory; what happens when the destination is device (MMIO) memory is a different question.

Use DMA. The whole point of DMA is that the CPU does not have to touch the data at all. (The device's DMA engine is also not limited by the width of the CPU's loads and stores.)


Using SSE/vector code in the kernel is possible: the kernel itself does it in a few places, e.g. the RAID5/RAID6 parity calculations. But any such vector/FPU code has to be specially bracketed (see below).

Linux's memcpy can be called from any context, so it avoids everything except the integer registers. There was a patch proposing an SSE memcpy, but IIRC Andi Kleen and Ingo Molnar argued that a general-purpose SSE memcpy would not be worth it. Code that wants big vectorized copies has to do them itself, not through memcpy.

To use SSE in the kernel, you must wrap your code in kernel_fpu_begin() and kernel_fpu_end(). Since Linux 3.7, kernel_fpu_end() restores the FPU state right away, which makes fpu_begin/fpu_end pairs expensive; so do a substantial amount of work between the calls, not a single 16-byte copy.
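
A minimal sketch of the bracketing (the header is <asm/fpu/api.h> on recent kernels; older ones used <asm/i387.h>):

#include <linux/types.h>
#include <asm/fpu/api.h>   /* kernel_fpu_begin(), kernel_fpu_end() */

static void copy_with_sse(void *dst, const void *src, size_t len)
{
    kernel_fpu_begin();    /* save the user task's FPU/SIMD state */
    /* ... the actual 16-byte SSE loads/stores would go here ... */
    kernel_fpu_end();      /* restore it */
}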

It is not enough to just save and restore xmm0 by hand and use it, either. Also note that if you use legacy-SSE instructions on a CPU that supports AVX, xmm0 is the low half of ymm0/zmm0: if the upper halves of the ymm regs are dirty, mixing AVX and SSE incurs transition penalties, which vzeroupper avoids. Or use the AVX encodings throughout and sidestep the issue.

Finally, if your copies are small or infrequent, the save/restore overhead of kernel_fpu_begin will dominate. The user task's state lives in task_struct.thread.fpu, and with lazy FPU switching it was not restored until the task actually touched the FPU again, so some of the cost could be hidden; with eager switching it cannot. Either way, the only honest answer is to measure whether kernel_fpu_begin()/kernel_fpu_end(), with the XSAVE/XRSTOR of the FPU state they imply, actually gains you anything.

+4

The code you linked to is written in one compiler's inline-assembly dialect, which is why it does not compile for you. You do not actually need inline assembly for this. There is a portable way to get 128-bit moves.

If you want something that works across compilers and assembler syntaxes, the answer is: intrinsics.

Inline assembly ties you to GCC's syntax, and some compilers, e.g. MSVC in 64-bit mode, do not support inline assembly at all. Intrinsics have neither problem.

Intrinsics work with GCC, Clang, ICC and MSVC, in both 32-bit and 64-bit mode.

#include "xmmintrin.h"
void X_aligned_memcpy_sse2(char* dest, const char* src, const unsigned long size)
{
    for(int i=size/128; i>0; i--) {
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
        _mm_prefetch(src + 128, _MM_HINT_NTA);
        _mm_prefetch(src + 160, _MM_HINT_NTA);
        _mm_prefetch(src + 194, _MM_HINT_NTA);
        _mm_prefetch(src + 224, _MM_HINT_NTA);

        /* Load 128 bytes (eight 16-byte vectors) from the source. */
        xmm0 = _mm_load_si128((__m128i*)&src[   0]);
        xmm1 = _mm_load_si128((__m128i*)&src[  16]);
        xmm2 = _mm_load_si128((__m128i*)&src[  32]);
        xmm3 = _mm_load_si128((__m128i*)&src[  48]);
        xmm4 = _mm_load_si128((__m128i*)&src[  64]);
        xmm5 = _mm_load_si128((__m128i*)&src[  80]);
        xmm6 = _mm_load_si128((__m128i*)&src[  96]);
        xmm7 = _mm_load_si128((__m128i*)&src[ 112]);

        /* Write them out with non-temporal (streaming) stores that
           bypass the cache. */
        _mm_stream_si128((__m128i*)&dest[   0], xmm0);
        _mm_stream_si128((__m128i*)&dest[  16], xmm1);
        _mm_stream_si128((__m128i*)&dest[  32], xmm2);
        _mm_stream_si128((__m128i*)&dest[  48], xmm3);
        _mm_stream_si128((__m128i*)&dest[  64], xmm4);
        _mm_stream_si128((__m128i*)&dest[  80], xmm5);
        _mm_stream_si128((__m128i*)&dest[  96], xmm6);
        _mm_stream_si128((__m128i*)&dest[ 112], xmm7);
        src  += 128;
        dest += 128;
    }
}

Note that this requires src and dest to be 16-byte aligned and size to be a multiple of 128.
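
If you cannot guarantee those preconditions, a hypothetical wrapper can check them and fall back to plain memcpy:

#include <stdint.h>
#include <string.h>

/* Hypothetical wrapper: take the fast path only when the pointers are
   16-byte aligned and the size is a whole number of 128-byte blocks. */
void copy_checked(char *dest, const char *src, unsigned long size)
{
    if ((((uintptr_t)dest | (uintptr_t)src) & 15) == 0 && size % 128 == 0)
        X_aligned_memcpy_sse2(dest, src, size);
    else
        memcpy(dest, src, size);
}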

One more thing: a copy like this is bound by memory bandwidth, and a single thread often cannot saturate it, so for large buffers it can help to split the copy across several threads, e.g. with OpenMP:

#include <xmmintrin.h>  /* _mm_load_ps, _mm_stream_ps */

/* Copy n bytes from x to y in 16-byte chunks, split across threads.
   Requires x and y 16-byte aligned and n a multiple of 16;
   compile with -fopenmp. */
void copy(char *x, char *y, int n)
{
    #pragma omp parallel for schedule(static)
    for(int i=0; i<n/16; i++) {
        _mm_stream_ps((float*)&y[16*i], _mm_load_ps((float*)&x[16*i]));
    }
}

Whether the threading actually helps depends on the system, so measure it.


Here is the assembly of the X_aligned_memcpy_sse2 function built from the intrinsics with GCC -O3 -S -masm=intel. Note that it is essentially the same as the hand-written code linked in the question.

    shr rdx, 7
    test    edx, edx
    mov eax, edx
    jle .L1
.L5:
    sub rsi, -128
    movdqa  xmm6, XMMWORD PTR [rsi-112]
    prefetchnta [rsi]
    prefetchnta [rsi+32]
    prefetchnta [rsi+64]
    movdqa  xmm5, XMMWORD PTR [rsi-96]
    prefetchnta [rsi+96]
    sub rdi, -128
    movdqa  xmm4, XMMWORD PTR [rsi-80]
    movdqa  xmm3, XMMWORD PTR [rsi-64]
    movdqa  xmm2, XMMWORD PTR [rsi-48]
    movdqa  xmm1, XMMWORD PTR [rsi-32]
    movdqa  xmm0, XMMWORD PTR [rsi-16]
    movdqa  xmm7, XMMWORD PTR [rsi-128]
    movntdq XMMWORD PTR [rdi-112], xmm6
    movntdq XMMWORD PTR [rdi-96], xmm5
    movntdq XMMWORD PTR [rdi-80], xmm4
    movntdq XMMWORD PTR [rdi-64], xmm3
    movntdq XMMWORD PTR [rdi-48], xmm2
    movntdq XMMWORD PTR [rdi-128], xmm7
    movntdq XMMWORD PTR [rdi-32], xmm1
    movntdq XMMWORD PTR [rdi-16], xmm0
    sub eax, 1
    jne .L5
.L1:
    rep ret
+3

Source: https://habr.com/ru/post/1687266/

